#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────────────────────
# larry-anywhere v0.8.2: PHI Presidio client
#
# Bash wrapper around the Presidio sidecar's /redact endpoint. Sourced from
# larry.sh's auto-PHI pipeline as the tier-5 free-text NER pass.
#
# Functions (sourced):
#   phi_client_available      — 0 if sidecar reachable; non-zero otherwise
#   phi_redact_text TEXT      — echo redacted form on stdout; non-zero on failure
#                               (in which case caller leaves TEXT unchanged —
#                               "fail-open" is the right call for tier-5 alone)
#   phi_redact_entities TEXT  — echo one TSV line per detected entity
# Standalone:
#   ./phi-client.sh check                      — health probe
#   ./phi-client.sh redact "the patient ..."   — one-shot redact
#
# Wire-up in larry.sh:auto_detect_phi:
#   - After tier-1/2/3/4 produce hits and tokenize, BEFORE add_user_text,
#     call phi_redact_text on the (already-partially-tokenized) input.
#   - For each entity returned with score > threshold, tokenize via
#     hl7-sanitize.sh's tokenize-value (category = presidio_<TYPE>)
#     to maintain stable token IDs across surfaces.
#
# NOTE: this file is meant to be sourced, so it deliberately does not enable
# `set -e`/`set -u`; doing so would change the caller's shell options.
# ─────────────────────────────────────────────────────────────────────────────

LARRY_PHI_PORT="${LARRY_PHI_PORT:-41189}"
LARRY_PHI_HOST="${LARRY_PHI_HOST:-127.0.0.1}"
LARRY_PHI_TIMEOUT="${LARRY_PHI_TIMEOUT:-5}"   # seconds — bounds tier-5 stall

# Defense against CR-tainted env (Cygwin v0.7.5 lesson).
_phi_client_dir="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" 2>/dev/null && pwd)"
if [ -f "$_phi_client_dir/cygwin-safe.sh" ]; then
  # shellcheck source=cygwin-safe.sh
  . "$_phi_client_dir/cygwin-safe.sh" 2>/dev/null || true
fi
if declare -F coerce_int >/dev/null 2>&1; then
  LARRY_PHI_PORT=$(coerce_int "$LARRY_PHI_PORT" 41189)
  LARRY_PHI_TIMEOUT=$(coerce_int "$LARRY_PHI_TIMEOUT" 5)
else
  # Helper not available: at least drop stray carriage returns so the URL and
  # curl's -m argument stay well-formed.
  LARRY_PHI_PORT="${LARRY_PHI_PORT//$'\r'/}"
  LARRY_PHI_TIMEOUT="${LARRY_PHI_TIMEOUT//$'\r'/}"
fi
# coerce_int is only applied to the numeric settings above; a CR in the host
# would corrupt the request URL just the same, so strip it unconditionally.
LARRY_PHI_HOST="${LARRY_PHI_HOST//$'\r'/}"

# Health probe: GET /health on the sidecar with a hard 1-second cap so callers
# are never stalled by a dead sidecar.
# Globals: LARRY_PHI_HOST, LARRY_PHI_PORT (read)
# Outputs: nothing (curl's stdout and stderr are discarded)
# Returns: curl's exit status — 0 when the endpoint answered with a non-error
#          HTTP status, non-zero otherwise.
phi_client_available() {
  curl -fsS -m 1 "http://${LARRY_PHI_HOST}:${LARRY_PHI_PORT}/health" >/dev/null 2>&1
}

# phi_redact_text TEXT → emits redacted TEXT on stdout, non-zero on any failure.
# JSON-quoting handled via jq so payload is safe for any control chars.
phi_redact_text() {
  # Arguments: $1 - free text to redact (may be empty).
  # Globals:   LARRY_PHI_HOST, LARRY_PHI_PORT, LARRY_PHI_TIMEOUT (read)
  # Outputs:   the response's .redacted string on stdout, no newline appended.
  # Returns:   0 on success (empty input succeeds with no output);
  #            2 if jq cannot build the request payload;
  #            3 if curl fails (connection, timeout, HTTP error status);
  #            4 if the response cannot be parsed by jq;
  #            5 if the response has no non-empty .redacted value.
  local text="$1"
  [ -z "$text" ] && { printf ''; return 0; }

  local resp
  resp=$(_phi_post_redact "$text") || return $?

  # Parse out the redacted text. Empty → upstream error.
  local redacted
  redacted=$(printf '%s' "$resp" | jq -r '.redacted // empty' 2>/dev/null) || return 4
  [ -z "$redacted" ] && return 5
  printf '%s' "$redacted"
  return 0
}

# Private helper shared by phi_redact_text and phi_redact_entities: POST TEXT
# ($1) to the sidecar's /redact endpoint and print the raw response body.
# The text reaches jq, and the JSON payload reaches curl, over stdin rather
# than argv, so PHI never shows up in process listings and large inputs are
# not limited by the kernel's argument-size cap.
# Returns: 0 on success; 2 if jq cannot build the payload; 3 if curl fails.
_phi_post_redact() {
  local payload
  # -R/-s: read all of stdin as one raw string; jq does the JSON escaping.
  payload=$(printf '%s' "$1" | jq -Rsc '{text: .}') || return 2
  printf '%s' "$payload" \
    | curl -fsS -m "$LARRY_PHI_TIMEOUT" \
        -X POST -H 'Content-Type: application/json' \
        --data-binary @- \
        "http://${LARRY_PHI_HOST}:${LARRY_PHI_PORT}/redact" 2>/dev/null \
    || return 3
}

# Emit the entities (one per line: TYPE\tSTART\tEND\tSCORE\tVALUE) so the
# caller can re-tokenize with hl7-sanitize.sh's tokenize-value pipeline
# (categories: presidio_PERSON, presidio_LOCATION, etc.) for stable IDs.
# VALUE is the slice of the original TEXT between START and END, taken by jq.
# Returns: 0 on success (empty input succeeds with no output); 2/3 as for
#          _phi_post_redact; otherwise the exit status of the final jq.
phi_redact_entities() {
  local text="$1"
  [ -z "$text" ] && return 0

  local resp
  resp=$(_phi_post_redact "$text") || return $?

  # Feed jq two JSON documents on stdin: first the original text encoded as a
  # JSON string, then the sidecar response. `input` consumes the former and
  # `inputs` whatever follows, so an empty response simply yields no lines.
  {
    printf '%s' "$text" | jq -Rs '.'
    printf '%s' "$resp"
  } | jq -rn '
    input as $src
    | inputs
    | .entities[]?
    | [.type,
       (.start|tostring),
       (.end|tostring),
       (.score|tostring),
       ($src[(.start|tonumber):(.end|tonumber)])]
    | @tsv
  ' 2>/dev/null
}

# Standalone CLI mode (when run, not sourced).
# Dispatch one standalone CLI invocation.
# Arguments: $1 - sub-command (check | redact | entities); anything else
#                 prints the usage text
#            $2 - text argument for redact / entities
# Outputs:   sub-command result on stdout; diagnostics on stderr
# Returns:   check    → 0 if the sidecar is reachable, 1 otherwise
#            redact   → 2 on missing text, otherwise phi_redact_text's status
#                       (a failure is reported on stderr, not masked)
#            entities → 2 on missing text, otherwise phi_redact_entities' status
#            other    → 0 after printing usage
_phi_client_main() {
  case "${1:-}" in
    check)
      if phi_client_available; then
        echo "phi-client: sidecar reachable"
        return 0
      fi
      echo "phi-client: sidecar unreachable on $LARRY_PHI_HOST:$LARRY_PHI_PORT" >&2
      return 1
      ;;
    redact)
      shift
      [ -z "${1:-}" ] && { echo "usage: phi-client.sh redact <text>" >&2; return 2; }
      local rc=0
      phi_redact_text "$1" || rc=$?
      if [ "$rc" -ne 0 ]; then
        # Surface the failure instead of printing a blank line and exiting 0.
        echo "phi-client: redact failed (exit $rc)" >&2
        return "$rc"
      fi
      echo   # phi_redact_text emits no trailing newline; add one for terminals
      ;;
    entities)
      shift
      [ -z "${1:-}" ] && { echo "usage: phi-client.sh entities <text>" >&2; return 2; }
      phi_redact_entities "$1"
      ;;
    *)
      # Quoted delimiter: the usage text is literal, nothing is expanded.
      cat <<'USAGE'
usage: phi-client.sh COMMAND [ARG]

Commands:
  check             health probe
  redact <text>     emit redacted text
  entities <text>   emit entities (TYPE TAB START END SCORE VALUE)

Functions (when sourced):
  phi_client_available
  phi_redact_text <text>
  phi_redact_entities <text>

Env: LARRY_PHI_PORT (41189), LARRY_PHI_HOST (127.0.0.1), LARRY_PHI_TIMEOUT (5).
USAGE
      ;;
  esac
}

# Run the CLI only when this file is executed directly, not when it is sourced.
# Exit explicitly on failure; on success fall off the end of the file, which
# leaves the script's exit status at 0.
if [ "${BASH_SOURCE[0]:-$0}" = "$0" ]; then
  _phi_client_main "$@" || exit $?
fi