diff --git a/VERSION b/VERSION index 7d85683..d1d899f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.5.4 +0.5.5 diff --git a/larry.sh b/larry.sh index c7a9226..8a1cac6 100755 --- a/larry.sh +++ b/larry.sh @@ -36,7 +36,7 @@ set -o pipefail # ───────────────────────────────────────────────────────────────────────────── # Config # ───────────────────────────────────────────────────────────────────────────── -LARRY_VERSION="0.5.4" +LARRY_VERSION="0.5.5" LARRY_HOME="${LARRY_HOME:-$HOME/.larry}" LARRY_BASE_URL="${LARRY_BASE_URL:-https://raw.githubusercontent.com/bojj27/cloverleaf-larry/main}" LARRY_UPDATE_URL="${LARRY_UPDATE_URL:-${LARRY_BASE_URL}/larry.sh}" @@ -740,23 +740,15 @@ preprocess_phi_markers() { local sanitize_script="$LARRY_LIB_DIR/hl7-sanitize.sh" [ -x "$sanitize_script" ] || { printf '%s' "$input"; return; } - # Use grep -oE to extract markers reliably across bash versions. - local markers - markers=$(printf '%s' "$input" | grep -oE '\{\{phi:[^{}]+\}\}' 2>/dev/null | sort -u) - [ -z "$markers" ] && { printf '%s' "$input"; return; } + # Three forms supported (processed in this order to avoid ambiguity): + # 1. @@VALUE@@ bracketed; VALUE has no '@' and uses single-space word + # separation. Use for values WITH spaces. Auto-detect category. + # 2. @@VALUE unbracketed; VALUE has no whitespace or '@'. Auto-detect. + # 3. {{phi:V}} / {{phi:CAT:V}} legacy, still supported. - while IFS= read -r marker; do - [ -z "$marker" ] && continue - # Strip {{phi: prefix and }} suffix - local body="${marker#\{\{phi:}" - body="${body%\}\}}" - local category="" value="" - if [[ "$body" == *:* ]] && [[ "${body%%:*}" =~ ^[A-Z][A-Z0-9_]+$ ]]; then - category="${body%%:*}" - value="${body#*:}" - else - value="$body" - fi + # Helper: tokenize one VALUE (optional category) and substitute MARKER → token. 
+ _phi_sub() { + local marker="$1" value="$2" category="${3:-}" local args=(tokenize-value) [ -n "$category" ] && args+=(--category "$category") args+=("$value") @@ -764,7 +756,45 @@ preprocess_phi_markers() { [ -z "$token" ] && token="[[PHI_ERROR]]" input="${input//"$marker"/"$token"}" printf '%sphi>%s %s → %s\n' "$C_YELLOW" "$C_RESET" "$marker" "$token" >&2 - done <<< "$markers" + } + + # Pass 1: bracketed @@VALUE@@ — value has no '@', allows internal single spaces + # but no leading/trailing whitespace inside the brackets. + local bracketed + bracketed=$(printf '%s' "$input" | grep -oE '@@[^@[:space:]]+([ \t]+[^@[:space:]]+)*@@' 2>/dev/null | sort -u) + while IFS= read -r marker; do + [ -z "$marker" ] && continue + local val="${marker#@@}"; val="${val%@@}" + _phi_sub "$marker" "$val" + done <<< "$bracketed" + + # Pass 2: unbracketed @@VALUE — value has no whitespace or '@'. Anything that + # was inside a bracketed marker has already been replaced with a [[TOK]], so + # it won't be re-matched here. + local unbracketed + unbracketed=$(printf '%s' "$input" | grep -oE '@@[^@[:space:]]+' 2>/dev/null | sort -u) + while IFS= read -r marker; do + [ -z "$marker" ] && continue + local val="${marker#@@}" + _phi_sub "$marker" "$val" + done <<< "$unbracketed" + + # Pass 3: legacy {{phi:VALUE}} / {{phi:CATEGORY:VALUE}}. 
+ local legacy + legacy=$(printf '%s' "$input" | grep -oE '\{\{phi:[^{}]+\}\}' 2>/dev/null | sort -u) + while IFS= read -r marker; do + [ -z "$marker" ] && continue + local body="${marker#\{\{phi:}"; body="${body%\}\}}" + local cat="" val="" + if [[ "$body" == *:* ]] && [[ "${body%%:*}" =~ ^[A-Z][A-Z0-9_]+$ ]]; then + cat="${body%%:*}"; val="${body#*:}" + else + val="$body" + fi + _phi_sub "$marker" "$val" "$cat" + done <<< "$legacy" + + unset -f _phi_sub printf '%s' "$input" } @@ -1067,9 +1097,14 @@ Slash commands: /tokens show the full local PHI ↔ token lookup table PHI inline syntax in any prompt: - {{phi:VALUE}} tokenize before send; auto-detects category - {{phi:MRN:12345}} explicit category=MRN (matches sanitized data) - {{phi:NAME:JOHN SMITH}} explicit category=NAME + @@VALUE EASY: wrap PHI in @@. Spaceless = no end delim. + e.g. @@12345 @@SMITH^JOHN @@V789 + @@VALUE@@ Use when VALUE has spaces. + e.g. @@John Smith@@ @@Smith, John@@ + Name canonicalization: SMITH^JOHN, Smith, John, John Smith, JOHN SMITH + all collapse to the same token. + Category is auto-detected from value shape (MRN/SSN/DOB/NAME/MANUAL). + {{phi:VALUE}} / {{phi:CAT:VALUE}} legacy syntax (still works) /redetect re-scan for HCIROOT/HCISITE/tools /sites list site dirs under HCIROOT /site switch HCISITE for this session @@ -1192,7 +1227,7 @@ main_loop() { # PHI preprocessing: replace any {{phi:VALUE}} markers with local tokens # BEFORE the input enters conversation history and gets sent to Anthropic. - if [[ "$input" == *"{{phi:"* ]]; then + if [[ "$input" == *"{{phi:"* ]] || [[ "$input" == *"@@"* ]]; then input=$(preprocess_phi_markers "$input") fi diff --git a/lib/hl7-sanitize.sh b/lib/hl7-sanitize.sh index 94815ff..347b095 100755 --- a/lib/hl7-sanitize.sh +++ b/lib/hl7-sanitize.sh @@ -130,11 +130,89 @@ cmd_count() { echo $(($(wc -l < "$table") - 1)) } +# ───────────────────────────────────────────────────────────────────────────── +# Auto-category detection + value canonicalization. 
# Canonicalization makes "SMITH^JOHN", "Smith, John", "John Smith", and
# "JOHN SMITH" all collapse to the same lookup key so they share one token.
# ─────────────────────────────────────────────────────────────────────────────

# detect_category VALUE — guess a PHI category from the shape of VALUE.
#
# Arguments: $1 - raw value; surrounding whitespace is ignored.
# Outputs:   exactly one of MRN | SSN | DOB | NAME | MANUAL on stdout
#            (no trailing newline).
#
# Rules are tried in this order and the first match wins:
#   MRN     4-15 digits and nothing else. This rule also claims an undashed
#           9-digit number, so only values containing a dash reach SSN.
#   SSN     digit groups of 3, 2 and 4, each dash optional.
#   DOB     three digit groups (1-4, 1-2, 1-4 digits) separated by '/' or '-'.
#   NAME    contains '^' or ',', or has two or more purely alphabetic words.
#   MANUAL  anything else, including the empty string.
detect_category() {
    local v="$1"
    # Trim surrounding whitespace.
    v="${v#"${v%%[![:space:]]*}"}"
    v="${v%"${v##*[![:space:]]}"}"

    # Pure digits, 4-15 chars → MRN
    if [[ "$v" =~ ^[0-9]{4,15}$ ]]; then
        printf 'MRN'; return
    fi
    # SSN: 9 digits with optional dashes (must stay ahead of the DOB rule,
    # whose pattern would also accept 123-45-6789).
    if [[ "$v" =~ ^[0-9]{3}-?[0-9]{2}-?[0-9]{4}$ ]]; then
        printf 'SSN'; return
    fi
    # Date-like
    if [[ "$v" =~ ^[0-9]{1,4}[/-][0-9]{1,2}[/-][0-9]{1,4}$ ]]; then
        printf 'DOB'; return
    fi
    # HL7 caret or comma → likely name
    if [[ "$v" == *"^"* ]] || [[ "$v" == *","* ]]; then
        printf 'NAME'; return
    fi

    # Two+ alpha tokens → name. Words are split on spaces, tabs and newlines
    # and counted across the whole value, so a value that contains a newline
    # yields a single number instead of one count per line.
    local -a words=()
    local word alpha_words=0
    # read -d '' reports failure at end of input even though the array is
    # filled, hence the '|| true'.
    IFS=$' \t\n' read -r -d '' -a words <<< "$v" || true
    # The ${arr[@]+...} form keeps an empty array safe under 'set -u'.
    for word in ${words[@]+"${words[@]}"}; do
        if [[ "$word" =~ ^[A-Za-z]+$ ]]; then
            alpha_words=$((alpha_words + 1))
        fi
    done
    if [ "$alpha_words" -ge 2 ]; then
        printf 'NAME'; return
    fi
    printf 'MANUAL'
}

# normalize_value VALUE CATEGORY — produce canonical lookup key.
# NAME: lowercase, strip punctuation, sort unique alpha tokens.
#   e.g. "SMITH^JOHN", "Smith, John", "John Smith", "JOHN SMITH" → "john smith"
#        (tokens that do not start with a letter are dropped)
# DOB: parse to YYYY-MM-DD if possible (needs GNU date or BSD date);
#      otherwise the value with all whitespace removed
# SSN: strip dashes/whitespace
# MRN/MANUAL/anything else: trim outer whitespace; internal whitespace runs
#      collapse to a single space
#
# Arguments: $1 - raw value
#            $2 - category (NAME | DOB | SSN | anything else)
# Outputs:   canonical key on stdout. The result may be empty (for example a
#            NAME value with no alphabetic token); callers should handle that.
normalize_value() {
    local value="$1"
    local category="$2"
    case "$category" in
        NAME)
            # The result is used as a lookup key, so pin the C locale: the
            # character classes and the sort order must not change with the
            # caller's LANG/LC_* settings.
            # Every run of non-alphanumerics (',', '^', '/', spaces, ...)
            # becomes one newline, giving one candidate token per line.
            # grep exits 1 when no token survives; that is a valid "empty key"
            # outcome and must not fail the pipeline under 'set -o pipefail'.
            printf '%s' "$value" \
                | LC_ALL=C tr '[:upper:]' '[:lower:]' \
                | LC_ALL=C tr -cs '[:alnum:]' '\n' \
                | { LC_ALL=C grep -E '^[a-z][a-z0-9]*$' || [ "$?" -eq 1 ]; } \
                | LC_ALL=C sort -u \
                | tr '\n' ' ' \
                | sed 's/ *$//'
            ;;
        DOB)
            # Try GNU date first, then the two BSD 'date -j -f' formats.
            # The trailing '|| parsed=""' keeps the list from ending on a
            # failure when none of them can parse the value.
            local parsed=""
            parsed=$(date -d "$value" +%Y-%m-%d 2>/dev/null) \
                || parsed=$(date -j -f "%m/%d/%Y" "$value" +%Y-%m-%d 2>/dev/null) \
                || parsed=$(date -j -f "%Y-%m-%d" "$value" +%Y-%m-%d 2>/dev/null) \
                || parsed=""
            if [ -n "$parsed" ]; then
                printf '%s' "$parsed"
            else
                printf '%s' "$value" | tr -d '[:space:]'
            fi
            ;;
        SSN)
            # Note: '-' must come at the END of the set so BSD tr doesn't treat
            # it as an option flag. Same caveat applies for any future tr -d call.
            printf '%s' "$value" | tr -d '[:space:]-'
            ;;
        *)
            # Re-assigning $1 makes awk rebuild the record with single spaces,
            # which trims the ends and squeezes internal whitespace.
            printf '%s' "$value" | awk '{$1=$1; print}'
            ;;
    esac
}

# tokenize-value: take a single value (and optional category) and return the
# existing or newly-created token. Auto-detects category if not given. Uses
# canonicalized form as the lookup key so name variants share one token.
# Table schema (v0.5.5+): token \t category \t canonical \t original
# Legacy 3-col rows (token \t category \t value) still readable.
if [ -z "$category" ]; then - local existing - existing=$(awk -F'\t' -v v="$value" 'NR>1 && $3==v { print $1; exit }' "$table") - if [ -n "$existing" ]; then - printf '%s\n' "$existing" - return 0 - fi - category="MANUAL" + category=$(detect_category "$value") fi - # Look up (category, value) → existing token + # Canonicalize so different surface forms of the same PHI share one token. + local canonical + canonical=$(normalize_value "$value" "$category") + [ -z "$canonical" ] && canonical="$value" + + # Look up (category, canonical). Column 3 in both new (4-col) and legacy + # (3-col) rows is the lookup key, so this works for both. local tok - tok=$(awk -F'\t' -v cat="$category" -v v="$value" 'NR>1 && $2==cat && $3==v { print $1; exit }' "$table") + tok=$(awk -F'\t' -v cat="$category" -v c="$canonical" 'NR>1 && $2==cat && $3==c { print $1; exit }' "$table") if [ -n "$tok" ]; then printf '%s\n' "$tok" return 0 fi - # Create new + # Create new — write 4 columns. local nextnum nextnum=$(awk -F'\t' -v cat="$category" ' NR>1 && $2==cat && match($1, /_[0-9]+\]\]$/) { @@ -182,7 +260,7 @@ cmd_tokenize_value() { ' "$table") tok=$(printf '[[%s_%04d]]' "$category" "$nextnum") umask 077 - printf '%s\t%s\t%s\n' "$tok" "$category" "$value" >> "$table" + printf '%s\t%s\t%s\t%s\n' "$tok" "$category" "$canonical" "$value" >> "$table" chmod 600 "$table" 2>/dev/null || true printf '%s\n' "$tok" } @@ -202,7 +280,9 @@ cmd_detokenize_value() { [ -n "$token" ] || die "detokenize-value needs a TOKEN" [ -f "$table" ] || die "no lookup table at $table" local val - val=$(awk -F'\t' -v t="$token" 'NR>1 && $1==t { print $3; exit }' "$table") + # Prefer column 4 (original surface form) for new rows; fall back to col 3 + # for legacy 3-col rows where col 3 IS the original value. + val=$(awk -F'\t' -v t="$token" 'NR>1 && $1==t { print ($4 != "" ? 
$4 : $3); exit }' "$table") if [ -z "$val" ]; then printf 'no such token: %s\n' "$token" >&2 return 2 @@ -216,7 +296,7 @@ init_table() { chmod 700 "$(dirname "$table")" 2>/dev/null || true if [ ! -f "$table" ]; then umask 077 - printf 'token\tcategory\toriginal\n' > "$table" + printf 'token\tcategory\tcanonical\toriginal\n' > "$table" chmod 600 "$table" fi }