v0.5.5: @@VALUE inline PHI syntax + name canonicalization
Bryan asked for an easier-to-remember inline PHI marker than {{phi:VALUE}}
and for name forms like SMITH^JOHN / Smith, John / John Smith / JOHN SMITH
to all collapse to the same hash. Both shipped.
INLINE SYNTAX (in addition to the legacy {{phi:VALUE}} which still works):
@@VALUE unbracketed — VALUE has no whitespace
e.g. @@12345 @@SMITH^JOHN @@V789
@@VALUE@@ bracketed — VALUE may contain spaces
e.g. @@John Smith@@ @@Smith, John@@
Parser is 2-pass to disambiguate mixed forms in the same prompt: bracketed
markers are matched first (via grep -oE with a regex that excludes leading/
trailing whitespace inside the brackets), then the unbracketed pass scans
the remaining text. Verified against:
"look for @@12345 in PID.3 for @@John Smith@@ DOB @@01/15/1985 ..."
extracts 4 markers correctly and routes each to its category.
AUTO-CATEGORY DETECTION (lib/hl7-sanitize.sh: detect_category):
pure digits, 4-15 characters → MRN
9 digits with optional dashes → SSN (a dashless 9-digit value already matches the MRN rule above)
date-shaped → DOB
caret or comma → NAME
2+ alpha tokens → NAME
else → MANUAL
CANONICALIZATION (lib/hl7-sanitize.sh: normalize_value):
NAME: lowercase, split on every non-alphanumeric character (',', '^', '/', ...), keep the unique tokens that start with a letter, sorted
SMITH^JOHN, Smith John, John Smith, JOHN SMITH → "john smith"
DOB: parse to YYYY-MM-DD (GNU date or BSD date fallback)
SSN: strip dashes/whitespace
MRN/MANUAL: trim outer whitespace and collapse internal whitespace runs to a single space
TABLE SCHEMA bumped to 4 columns (token / category / canonical / original).
Legacy 3-column rows still read fine — lookups key on column 3 which is
"canonical" in new rows and "value" in legacy rows (mismatches just create
a new token, no corruption). Detokenize prefers column 4, falls back to
column 3 for legacy compat.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
af3f034337
commit
c2bba7be90
79
larry.sh
79
larry.sh
@ -36,7 +36,7 @@ set -o pipefail
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Config
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
LARRY_VERSION="0.5.4"
|
||||
LARRY_VERSION="0.5.5"
|
||||
LARRY_HOME="${LARRY_HOME:-$HOME/.larry}"
|
||||
LARRY_BASE_URL="${LARRY_BASE_URL:-https://raw.githubusercontent.com/bojj27/cloverleaf-larry/main}"
|
||||
LARRY_UPDATE_URL="${LARRY_UPDATE_URL:-${LARRY_BASE_URL}/larry.sh}"
|
||||
@ -740,23 +740,15 @@ preprocess_phi_markers() {
|
||||
local sanitize_script="$LARRY_LIB_DIR/hl7-sanitize.sh"
|
||||
[ -x "$sanitize_script" ] || { printf '%s' "$input"; return; }
|
||||
|
||||
# Use grep -oE to extract markers reliably across bash versions.
|
||||
local markers
|
||||
markers=$(printf '%s' "$input" | grep -oE '\{\{phi:[^{}]+\}\}' 2>/dev/null | sort -u)
|
||||
[ -z "$markers" ] && { printf '%s' "$input"; return; }
|
||||
# Three forms supported (processed in this order to avoid ambiguity):
|
||||
# 1. @@VALUE@@ bracketed; VALUE has no '@' and uses single-space word
|
||||
# separation. Use for values WITH spaces. Auto-detect category.
|
||||
# 2. @@VALUE unbracketed; VALUE has no whitespace or '@'. Auto-detect.
|
||||
# 3. {{phi:V}} / {{phi:CAT:V}} legacy, still supported.
|
||||
|
||||
while IFS= read -r marker; do
|
||||
[ -z "$marker" ] && continue
|
||||
# Strip {{phi: prefix and }} suffix
|
||||
local body="${marker#\{\{phi:}"
|
||||
body="${body%\}\}}"
|
||||
local category="" value=""
|
||||
if [[ "$body" == *:* ]] && [[ "${body%%:*}" =~ ^[A-Z][A-Z0-9_]+$ ]]; then
|
||||
category="${body%%:*}"
|
||||
value="${body#*:}"
|
||||
else
|
||||
value="$body"
|
||||
fi
|
||||
# Helper: tokenize one VALUE (optional category) and substitute MARKER → token.
|
||||
_phi_sub() {
|
||||
local marker="$1" value="$2" category="${3:-}"
|
||||
local args=(tokenize-value)
|
||||
[ -n "$category" ] && args+=(--category "$category")
|
||||
args+=("$value")
|
||||
@ -764,7 +756,45 @@ preprocess_phi_markers() {
|
||||
[ -z "$token" ] && token="[[PHI_ERROR]]"
|
||||
input="${input//"$marker"/"$token"}"
|
||||
printf '%sphi>%s %s → %s\n' "$C_YELLOW" "$C_RESET" "$marker" "$token" >&2
|
||||
done <<< "$markers"
|
||||
}
|
||||
|
||||
# Pass 1: bracketed @@VALUE@@ — value has no '@', allows internal single spaces
|
||||
# but no leading/trailing whitespace inside the brackets.
|
||||
local bracketed
|
||||
bracketed=$(printf '%s' "$input" | grep -oE '@@[^@[:space:]]+([ \t]+[^@[:space:]]+)*@@' 2>/dev/null | sort -u)
|
||||
while IFS= read -r marker; do
|
||||
[ -z "$marker" ] && continue
|
||||
local val="${marker#@@}"; val="${val%@@}"
|
||||
_phi_sub "$marker" "$val"
|
||||
done <<< "$bracketed"
|
||||
|
||||
# Pass 2: unbracketed @@VALUE — value has no whitespace or '@'. Anything that
|
||||
# was inside a bracketed marker has already been replaced with a [[TOK]], so
|
||||
# it won't be re-matched here.
|
||||
local unbracketed
|
||||
unbracketed=$(printf '%s' "$input" | grep -oE '@@[^@[:space:]]+' 2>/dev/null | sort -u)
|
||||
while IFS= read -r marker; do
|
||||
[ -z "$marker" ] && continue
|
||||
local val="${marker#@@}"
|
||||
_phi_sub "$marker" "$val"
|
||||
done <<< "$unbracketed"
|
||||
|
||||
# Pass 3: legacy {{phi:VALUE}} / {{phi:CATEGORY:VALUE}}.
|
||||
local legacy
|
||||
legacy=$(printf '%s' "$input" | grep -oE '\{\{phi:[^{}]+\}\}' 2>/dev/null | sort -u)
|
||||
while IFS= read -r marker; do
|
||||
[ -z "$marker" ] && continue
|
||||
local body="${marker#\{\{phi:}"; body="${body%\}\}}"
|
||||
local cat="" val=""
|
||||
if [[ "$body" == *:* ]] && [[ "${body%%:*}" =~ ^[A-Z][A-Z0-9_]+$ ]]; then
|
||||
cat="${body%%:*}"; val="${body#*:}"
|
||||
else
|
||||
val="$body"
|
||||
fi
|
||||
_phi_sub "$marker" "$val" "$cat"
|
||||
done <<< "$legacy"
|
||||
|
||||
unset -f _phi_sub
|
||||
printf '%s' "$input"
|
||||
}
|
||||
|
||||
@ -1067,9 +1097,14 @@ Slash commands:
|
||||
/tokens show the full local PHI ↔ token lookup table
|
||||
|
||||
PHI inline syntax in any prompt:
|
||||
{{phi:VALUE}} tokenize before send; auto-detects category
|
||||
{{phi:MRN:12345}} explicit category=MRN (matches sanitized data)
|
||||
{{phi:NAME:JOHN SMITH}} explicit category=NAME
|
||||
@@VALUE EASY: wrap PHI in @@. Spaceless = no end delim.
|
||||
e.g. @@12345 @@SMITH^JOHN @@V789
|
||||
@@VALUE@@ Use when VALUE has spaces.
|
||||
e.g. @@John Smith@@ @@Smith, John@@
|
||||
Name canonicalization: SMITH^JOHN, Smith, John, John Smith, JOHN SMITH
|
||||
all collapse to the same token.
|
||||
Category is auto-detected from value shape (MRN/SSN/DOB/NAME/MANUAL).
|
||||
{{phi:VALUE}} / {{phi:CAT:VALUE}} legacy syntax (still works)
|
||||
/redetect re-scan for HCIROOT/HCISITE/tools
|
||||
/sites list site dirs under HCIROOT
|
||||
/site <name> switch HCISITE for this session
|
||||
@ -1192,7 +1227,7 @@ main_loop() {
|
||||
|
||||
# PHI preprocessing: replace any {{phi:VALUE}} markers with local tokens
|
||||
# BEFORE the input enters conversation history and gets sent to Anthropic.
|
||||
if [[ "$input" == *"{{phi:"* ]]; then
|
||||
if [[ "$input" == *"{{phi:"* ]] || [[ "$input" == *"@@"* ]]; then
|
||||
input=$(preprocess_phi_markers "$input")
|
||||
fi
|
||||
|
||||
|
||||
@ -130,11 +130,89 @@ cmd_count() {
|
||||
echo $(($(wc -l < "$table") - 1))
|
||||
}
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Auto-category detection + value canonicalization.
|
||||
# Canonicalization makes "SMITH^JOHN", "Smith, John", "John Smith", and
|
||||
# "JOHN SMITH" all collapse to the same lookup key so they share one token.
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# detect_category VALUE — guess the PHI category from the shape of VALUE.
# Prints exactly one of: MRN | SSN | DOB | NAME | MANUAL (no trailing newline).
# Rules are tried in order; the first match wins:
#   1. 4-15 characters, digits only           → MRN
#   2. 9 digits as 3-2-4 with optional dashes → SSN
#      (a dashless 9-digit value is already claimed by rule 1)
#   3. N/N/N or N-N-N date shape              → DOB
#   4. contains '^' or ','                    → NAME
#   5. two or more purely alphabetic words    → NAME
#   6. anything else                          → MANUAL
detect_category() {
  local v="$1"

  # Trim surrounding whitespace without forking.
  v="${v#"${v%%[![:space:]]*}"}"
  v="${v%"${v##*[![:space:]]}"}"

  if [[ "$v" =~ ^[0-9]{4,15}$ ]]; then
    printf 'MRN'
    return
  fi

  if [[ "$v" =~ ^[0-9]{3}-?[0-9]{2}-?[0-9]{4}$ ]]; then
    printf 'SSN'
    return
  fi

  if [[ "$v" =~ ^[0-9]{1,4}[/-][0-9]{1,2}[/-][0-9]{1,4}$ ]]; then
    printf 'DOB'
    return
  fi

  # HL7 component separator or "Last, First" comma → likely a name.
  if [[ "$v" == *"^"* || "$v" == *","* ]]; then
    printf 'NAME'
    return
  fi

  # Count purely alphabetic words across ALL whitespace, newlines included.
  # `read -d ''` consumes the whole value and returns non-zero at end of
  # input, hence the `|| true`. Splitting through `read -a` never globs, so a
  # value such as "* *" is safe.
  local -a words=()
  local word
  local alpha_words=0
  IFS=$' \t\n' read -r -d '' -a words <<< "$v" || true
  for word in ${words[@]+"${words[@]}"}; do
    if [[ "$word" =~ ^[A-Za-z]+$ ]]; then
      alpha_words=$((alpha_words + 1))
    fi
  done
  if [ "$alpha_words" -ge 2 ]; then
    printf 'NAME'
    return
  fi

  printf 'MANUAL'
}
|
||||
|
||||
# normalize_value VALUE CATEGORY — print the canonical lookup key for VALUE
# (no trailing newline is guaranteed; callers capture it with $(...)).
#   NAME   lowercase, split on every run of non-alphanumeric characters, keep
#          the unique tokens that start with a letter, sort them bytewise and
#          join with single spaces. So "SMITH^JOHN", "Smith, John",
#          "John Smith" and "JOHN SMITH" all become "john smith".
#   DOB    YYYY-MM-DD. Four-digit-year forms (M/D/YYYY, M-D-YYYY, YYYY-M-D,
#          YYYY/M/D) are parsed and calendar-checked in the shell itself, so
#          the key does not depend on which date(1) is installed. Any other
#          form is handed to GNU date, then BSD date; if nothing parses, the
#          value with all whitespace removed is used.
#   SSN    delete whitespace and dashes.
#   other  (MRN, MANUAL, ...) trim outer whitespace and collapse internal
#          whitespace runs to one space.
# Prints nothing when no key can be derived (e.g. NAME without any token that
# starts with a letter, or an empty DOB).
normalize_value() {
  local value="$1"
  local category="$2"
  case "$category" in
    NAME)
      # `tr -cs` already turns ',', '^', '/' and every other non-alphanumeric
      # run into one separator. LC_ALL=C pins the sort order so the key is
      # the same in every locale. `|| true`: grep exits 1 when no token
      # qualifies, which must not fail the function under pipefail.
      local tokens=""
      tokens=$(printf '%s' "$value" \
        | tr '[:upper:]' '[:lower:]' \
        | tr -cs '[:alnum:]' '\n' \
        | grep -E '^[a-z][a-z0-9]*$' \
        | LC_ALL=C sort -u \
        | tr '\n' ' ') || true
      printf '%s' "${tokens% }"
      ;;
    DOB)
      local ymd_re='^([0-9]{4})([/-])([0-9]{1,2})([/-])([0-9]{1,2})$'
      local mdy_re='^([0-9]{1,2})([/-])([0-9]{1,2})([/-])([0-9]{4})$'
      local compact=""
      local parsed=""
      local year=""
      local month=""
      local day=""

      compact=$(printf '%s' "$value" | tr -d '[:space:]')
      # GNU `date -d ""` answers with today's date; an empty value has no key.
      if [ -z "$compact" ]; then
        return 0
      fi

      # Both separators must be the same character.
      if [[ "$compact" =~ $ymd_re ]] \
        && [ "${BASH_REMATCH[2]}" = "${BASH_REMATCH[4]}" ]; then
        year="${BASH_REMATCH[1]}"
        month="${BASH_REMATCH[3]}"
        day="${BASH_REMATCH[5]}"
      elif [[ "$compact" =~ $mdy_re ]] \
        && [ "${BASH_REMATCH[2]}" = "${BASH_REMATCH[4]}" ]; then
        month="${BASH_REMATCH[1]}"
        day="${BASH_REMATCH[3]}"
        year="${BASH_REMATCH[5]}"
      fi

      if [ -n "$year" ]; then
        # 10# forces base 10 so "08" and "09" are not read as bad octal.
        local y=$((10#$year))
        local m=$((10#$month))
        local d=$((10#$day))
        local days_in_month=31
        case "$m" in
          4 | 6 | 9 | 11) days_in_month=30 ;;
          2)
            days_in_month=28
            if ((y % 4 == 0 && (y % 100 != 0 || y % 400 == 0))); then
              days_in_month=29
            fi
            ;;
        esac
        if ((y >= 1 && m >= 1 && m <= 12 && d >= 1 && d <= days_in_month)); then
          printf -v parsed '%04d-%02d-%02d' "$y" "$m" "$d"
        fi
      fi

      # Anything the shell parser did not accept goes to date(1): GNU syntax
      # first, then the two BSD formats. The final `|| parsed=""` keeps a
      # fully failed chain from leaving a non-zero status behind.
      if [ -z "$parsed" ]; then
        parsed=$(date -d "$value" +%Y-%m-%d 2>/dev/null) \
          || parsed=$(date -j -f "%m/%d/%Y" "$value" +%Y-%m-%d 2>/dev/null) \
          || parsed=$(date -j -f "%Y-%m-%d" "$value" +%Y-%m-%d 2>/dev/null) \
          || parsed=""
      fi

      if [ -n "$parsed" ]; then
        printf '%s' "$parsed"
      else
        printf '%s' "$compact"
      fi
      ;;
    SSN)
      # Keep '-' as the LAST character of the set: there it is a literal
      # dash, whereas a leading '-' could be taken for an option.
      printf '%s' "$value" | tr -d '[:space:]-'
      ;;
    *)
      # `$1=$1` makes awk rebuild the record: outer whitespace is dropped and
      # internal runs (tabs included) collapse to a single space.
      printf '%s' "$value" | awk '{$1=$1; print}'
      ;;
  esac
}
|
||||
|
||||
# tokenize-value: take a single value (and optional category) and return the
|
||||
# existing or newly-created token from the lookup table. Used by larry.sh to
|
||||
# preprocess {{phi:...}} placeholders BEFORE sending to the API.
|
||||
# --category X force category; default = search all categories for a match,
|
||||
# fall back to "MANUAL".
|
||||
# existing or newly-created token. Auto-detects category if not given. Uses
|
||||
# canonicalized form as the lookup key so name variants share one token.
|
||||
# Table schema (v0.5.5+): token \t category \t canonical \t original
|
||||
# Legacy 3-col rows (token \t category \t value) still readable.
|
||||
cmd_tokenize_value() {
|
||||
local table="$DEFAULT_TABLE"
|
||||
local category=""
|
||||
@ -152,26 +230,26 @@ cmd_tokenize_value() {
|
||||
|
||||
init_table "$table"
|
||||
|
||||
# Auto-category: search existing table for an exact value match.
|
||||
# Auto-detect category from value shape if not explicitly forced.
|
||||
if [ -z "$category" ]; then
|
||||
local existing
|
||||
existing=$(awk -F'\t' -v v="$value" 'NR>1 && $3==v { print $1; exit }' "$table")
|
||||
if [ -n "$existing" ]; then
|
||||
printf '%s\n' "$existing"
|
||||
return 0
|
||||
fi
|
||||
category="MANUAL"
|
||||
category=$(detect_category "$value")
|
||||
fi
|
||||
|
||||
# Look up (category, value) → existing token
|
||||
# Canonicalize so different surface forms of the same PHI share one token.
|
||||
local canonical
|
||||
canonical=$(normalize_value "$value" "$category")
|
||||
[ -z "$canonical" ] && canonical="$value"
|
||||
|
||||
# Look up (category, canonical). Column 3 in both new (4-col) and legacy
|
||||
# (3-col) rows is the lookup key, so this works for both.
|
||||
local tok
|
||||
tok=$(awk -F'\t' -v cat="$category" -v v="$value" 'NR>1 && $2==cat && $3==v { print $1; exit }' "$table")
|
||||
tok=$(awk -F'\t' -v cat="$category" -v c="$canonical" 'NR>1 && $2==cat && $3==c { print $1; exit }' "$table")
|
||||
if [ -n "$tok" ]; then
|
||||
printf '%s\n' "$tok"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Create new
|
||||
# Create new — write 4 columns.
|
||||
local nextnum
|
||||
nextnum=$(awk -F'\t' -v cat="$category" '
|
||||
NR>1 && $2==cat && match($1, /_[0-9]+\]\]$/) {
|
||||
@ -182,7 +260,7 @@ cmd_tokenize_value() {
|
||||
' "$table")
|
||||
tok=$(printf '[[%s_%04d]]' "$category" "$nextnum")
|
||||
umask 077
|
||||
printf '%s\t%s\t%s\n' "$tok" "$category" "$value" >> "$table"
|
||||
printf '%s\t%s\t%s\t%s\n' "$tok" "$category" "$canonical" "$value" >> "$table"
|
||||
chmod 600 "$table" 2>/dev/null || true
|
||||
printf '%s\n' "$tok"
|
||||
}
|
||||
@ -202,7 +280,9 @@ cmd_detokenize_value() {
|
||||
[ -n "$token" ] || die "detokenize-value needs a TOKEN"
|
||||
[ -f "$table" ] || die "no lookup table at $table"
|
||||
local val
|
||||
val=$(awk -F'\t' -v t="$token" 'NR>1 && $1==t { print $3; exit }' "$table")
|
||||
# Prefer column 4 (original surface form) for new rows; fall back to col 3
|
||||
# for legacy 3-col rows where col 3 IS the original value.
|
||||
val=$(awk -F'\t' -v t="$token" 'NR>1 && $1==t { print ($4 != "" ? $4 : $3); exit }' "$table")
|
||||
if [ -z "$val" ]; then
|
||||
printf 'no such token: %s\n' "$token" >&2
|
||||
return 2
|
||||
@ -216,7 +296,7 @@ init_table() {
|
||||
chmod 700 "$(dirname "$table")" 2>/dev/null || true
|
||||
if [ ! -f "$table" ]; then
|
||||
umask 077
|
||||
printf 'token\tcategory\toriginal\n' > "$table"
|
||||
printf 'token\tcategory\tcanonical\toriginal\n' > "$table"
|
||||
chmod 600 "$table"
|
||||
fi
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user