v0.5.5: @@VALUE inline PHI syntax + name canonicalization
Bryan asked for an easier-to-remember inline PHI marker than {{phi:VALUE}}
and for name forms like SMITH^JOHN / Smith, John / John Smith / JOHN SMITH
to all collapse to the same hash. Both shipped.
INLINE SYNTAX (in addition to the legacy {{phi:VALUE}} which still works):
@@VALUE unbracketed — VALUE has no whitespace
e.g. @@12345 @@SMITH^JOHN @@V789
@@VALUE@@ bracketed — VALUE may contain spaces
e.g. @@John Smith@@ @@Smith, John@@
Parser is 2-pass to disambiguate mixed forms in the same prompt: bracketed
markers are matched first (via grep -oE with a regex that excludes leading/
trailing whitespace inside the brackets), then the unbracketed pass scans
the remaining text. Verified against:
"look for @@12345 in PID.3 for @@John Smith@@ DOB @@01/15/1985 ..."
extracts 4 markers correctly and routes each to its category.
AUTO-CATEGORY DETECTION (lib/hl7-sanitize.sh: detect_category):
pure digits 4-15 → MRN
9 digits with dashes → SSN
date-shaped → DOB
caret or comma → NAME
2+ alpha tokens → NAME
else → MANUAL
CANONICALIZATION (lib/hl7-sanitize.sh: normalize_value):
NAME: lowercase, replace ',^/' with spaces, sort unique alpha tokens
"SMITH^JOHN", "Smith, John", "John Smith", "JOHN SMITH" → "john smith"
DOB: parse to YYYY-MM-DD (GNU date or BSD date fallback)
SSN: strip dashes/whitespace
MRN/MANUAL: trim outer whitespace (the awk field re-join used for this also squeezes internal whitespace runs to single spaces)
TABLE SCHEMA bumped to 4 columns (token / category / canonical / original).
Legacy 3-column rows still read fine — lookups key on column 3 which is
"canonical" in new rows and "value" in legacy rows (mismatches just create
a new token, no corruption). Detokenize prefers column 4, falls back to
column 3 for legacy compat.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
af3f034337
commit
c2bba7be90
79
larry.sh
79
larry.sh
@ -36,7 +36,7 @@ set -o pipefail
|
|||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
# Config
|
# Config
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
LARRY_VERSION="0.5.4"
|
LARRY_VERSION="0.5.5"
|
||||||
LARRY_HOME="${LARRY_HOME:-$HOME/.larry}"
|
LARRY_HOME="${LARRY_HOME:-$HOME/.larry}"
|
||||||
LARRY_BASE_URL="${LARRY_BASE_URL:-https://raw.githubusercontent.com/bojj27/cloverleaf-larry/main}"
|
LARRY_BASE_URL="${LARRY_BASE_URL:-https://raw.githubusercontent.com/bojj27/cloverleaf-larry/main}"
|
||||||
LARRY_UPDATE_URL="${LARRY_UPDATE_URL:-${LARRY_BASE_URL}/larry.sh}"
|
LARRY_UPDATE_URL="${LARRY_UPDATE_URL:-${LARRY_BASE_URL}/larry.sh}"
|
||||||
@ -740,23 +740,15 @@ preprocess_phi_markers() {
|
|||||||
local sanitize_script="$LARRY_LIB_DIR/hl7-sanitize.sh"
|
local sanitize_script="$LARRY_LIB_DIR/hl7-sanitize.sh"
|
||||||
[ -x "$sanitize_script" ] || { printf '%s' "$input"; return; }
|
[ -x "$sanitize_script" ] || { printf '%s' "$input"; return; }
|
||||||
|
|
||||||
# Use grep -oE to extract markers reliably across bash versions.
|
# Three forms supported (processed in this order to avoid ambiguity):
|
||||||
local markers
|
# 1. @@VALUE@@ bracketed; VALUE has no '@' and uses single-space word
|
||||||
markers=$(printf '%s' "$input" | grep -oE '\{\{phi:[^{}]+\}\}' 2>/dev/null | sort -u)
|
# separation. Use for values WITH spaces. Auto-detect category.
|
||||||
[ -z "$markers" ] && { printf '%s' "$input"; return; }
|
# 2. @@VALUE unbracketed; VALUE has no whitespace or '@'. Auto-detect.
|
||||||
|
# 3. {{phi:V}} / {{phi:CAT:V}} legacy, still supported.
|
||||||
|
|
||||||
while IFS= read -r marker; do
|
# Helper: tokenize one VALUE (optional category) and substitute MARKER → token.
|
||||||
[ -z "$marker" ] && continue
|
_phi_sub() {
|
||||||
# Strip {{phi: prefix and }} suffix
|
local marker="$1" value="$2" category="${3:-}"
|
||||||
local body="${marker#\{\{phi:}"
|
|
||||||
body="${body%\}\}}"
|
|
||||||
local category="" value=""
|
|
||||||
if [[ "$body" == *:* ]] && [[ "${body%%:*}" =~ ^[A-Z][A-Z0-9_]+$ ]]; then
|
|
||||||
category="${body%%:*}"
|
|
||||||
value="${body#*:}"
|
|
||||||
else
|
|
||||||
value="$body"
|
|
||||||
fi
|
|
||||||
local args=(tokenize-value)
|
local args=(tokenize-value)
|
||||||
[ -n "$category" ] && args+=(--category "$category")
|
[ -n "$category" ] && args+=(--category "$category")
|
||||||
args+=("$value")
|
args+=("$value")
|
||||||
@ -764,7 +756,45 @@ preprocess_phi_markers() {
|
|||||||
[ -z "$token" ] && token="[[PHI_ERROR]]"
|
[ -z "$token" ] && token="[[PHI_ERROR]]"
|
||||||
input="${input//"$marker"/"$token"}"
|
input="${input//"$marker"/"$token"}"
|
||||||
printf '%sphi>%s %s → %s\n' "$C_YELLOW" "$C_RESET" "$marker" "$token" >&2
|
printf '%sphi>%s %s → %s\n' "$C_YELLOW" "$C_RESET" "$marker" "$token" >&2
|
||||||
done <<< "$markers"
|
}
|
||||||
|
|
||||||
|
# Pass 1: bracketed @@VALUE@@ — value has no '@', allows internal single spaces
|
||||||
|
# but no leading/trailing whitespace inside the brackets.
|
||||||
|
local bracketed
|
||||||
|
bracketed=$(printf '%s' "$input" | grep -oE '@@[^@[:space:]]+([ \t]+[^@[:space:]]+)*@@' 2>/dev/null | sort -u)
|
||||||
|
while IFS= read -r marker; do
|
||||||
|
[ -z "$marker" ] && continue
|
||||||
|
local val="${marker#@@}"; val="${val%@@}"
|
||||||
|
_phi_sub "$marker" "$val"
|
||||||
|
done <<< "$bracketed"
|
||||||
|
|
||||||
|
# Pass 2: unbracketed @@VALUE — value has no whitespace or '@'. Anything that
|
||||||
|
# was inside a bracketed marker has already been replaced with a [[TOK]], so
|
||||||
|
# it won't be re-matched here.
|
||||||
|
local unbracketed
|
||||||
|
unbracketed=$(printf '%s' "$input" | grep -oE '@@[^@[:space:]]+' 2>/dev/null | sort -u)
|
||||||
|
while IFS= read -r marker; do
|
||||||
|
[ -z "$marker" ] && continue
|
||||||
|
local val="${marker#@@}"
|
||||||
|
_phi_sub "$marker" "$val"
|
||||||
|
done <<< "$unbracketed"
|
||||||
|
|
||||||
|
# Pass 3: legacy {{phi:VALUE}} / {{phi:CATEGORY:VALUE}}.
|
||||||
|
local legacy
|
||||||
|
legacy=$(printf '%s' "$input" | grep -oE '\{\{phi:[^{}]+\}\}' 2>/dev/null | sort -u)
|
||||||
|
while IFS= read -r marker; do
|
||||||
|
[ -z "$marker" ] && continue
|
||||||
|
local body="${marker#\{\{phi:}"; body="${body%\}\}}"
|
||||||
|
local cat="" val=""
|
||||||
|
if [[ "$body" == *:* ]] && [[ "${body%%:*}" =~ ^[A-Z][A-Z0-9_]+$ ]]; then
|
||||||
|
cat="${body%%:*}"; val="${body#*:}"
|
||||||
|
else
|
||||||
|
val="$body"
|
||||||
|
fi
|
||||||
|
_phi_sub "$marker" "$val" "$cat"
|
||||||
|
done <<< "$legacy"
|
||||||
|
|
||||||
|
unset -f _phi_sub
|
||||||
printf '%s' "$input"
|
printf '%s' "$input"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1067,9 +1097,14 @@ Slash commands:
|
|||||||
/tokens show the full local PHI ↔ token lookup table
|
/tokens show the full local PHI ↔ token lookup table
|
||||||
|
|
||||||
PHI inline syntax in any prompt:
|
PHI inline syntax in any prompt:
|
||||||
{{phi:VALUE}} tokenize before send; auto-detects category
|
@@VALUE EASY: wrap PHI in @@. Spaceless = no end delim.
|
||||||
{{phi:MRN:12345}} explicit category=MRN (matches sanitized data)
|
e.g. @@12345 @@SMITH^JOHN @@V789
|
||||||
{{phi:NAME:JOHN SMITH}} explicit category=NAME
|
@@VALUE@@ Use when VALUE has spaces.
|
||||||
|
e.g. @@John Smith@@ @@Smith, John@@
|
||||||
|
Name canonicalization: SMITH^JOHN, Smith, John, John Smith, JOHN SMITH
|
||||||
|
all collapse to the same token.
|
||||||
|
Category is auto-detected from value shape (MRN/SSN/DOB/NAME/MANUAL).
|
||||||
|
{{phi:VALUE}} / {{phi:CAT:VALUE}} legacy syntax (still works)
|
||||||
/redetect re-scan for HCIROOT/HCISITE/tools
|
/redetect re-scan for HCIROOT/HCISITE/tools
|
||||||
/sites list site dirs under HCIROOT
|
/sites list site dirs under HCIROOT
|
||||||
/site <name> switch HCISITE for this session
|
/site <name> switch HCISITE for this session
|
||||||
@ -1192,7 +1227,7 @@ main_loop() {
|
|||||||
|
|
||||||
# PHI preprocessing: replace any {{phi:VALUE}} markers with local tokens
|
# PHI preprocessing: replace any {{phi:VALUE}} markers with local tokens
|
||||||
# BEFORE the input enters conversation history and gets sent to Anthropic.
|
# BEFORE the input enters conversation history and gets sent to Anthropic.
|
||||||
if [[ "$input" == *"{{phi:"* ]]; then
|
if [[ "$input" == *"{{phi:"* ]] || [[ "$input" == *"@@"* ]]; then
|
||||||
input=$(preprocess_phi_markers "$input")
|
input=$(preprocess_phi_markers "$input")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@ -130,11 +130,89 @@ cmd_count() {
|
|||||||
echo $(($(wc -l < "$table") - 1))
|
echo $(($(wc -l < "$table") - 1))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Auto-category detection + value canonicalization.
|
||||||
|
# Canonicalization makes "SMITH^JOHN", "Smith, John", "John Smith", and
|
||||||
|
# "JOHN SMITH" all collapse to the same lookup key so they share one token.
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# detect_category VALUE — guess a PHI category from the shape of VALUE.
# Prints exactly one of: MRN | SSN | DOB | NAME | MANUAL (no trailing newline).
# Rules are tried in order and the first match wins:
#   1. digits only, 4-15 characters long          → MRN
#   2. 3-2-4 digit groups, each dash optional     → SSN
#   3. digit groups joined by '/' or '-' (N/N/N)  → DOB
#   4. contains '^' or ','                        → NAME
#   5. two or more purely alphabetic words        → NAME
#   6. anything else                              → MANUAL
# Because rule 1 runs first, nine bare digits are reported as MRN; only values
# containing at least one dash can reach the SSN rule.
detect_category() {
  local v="$1"

  # Trim surrounding whitespace using parameter expansion only.
  v="${v#"${v%%[![:space:]]*}"}"
  v="${v%"${v##*[![:space:]]}"}"

  # Pure digits, 4-15 chars → MRN
  if [[ "$v" =~ ^[0-9]+$ ]] && [ "${#v}" -ge 4 ] && [ "${#v}" -le 15 ]; then
    printf 'MRN'
    return
  fi

  # SSN: 9 digits in 3-2-4 groups with optional dashes
  if [[ "$v" =~ ^[0-9]{3}-?[0-9]{2}-?[0-9]{4}$ ]]; then
    printf 'SSN'
    return
  fi

  # Date-like: three digit groups separated by '/' or '-'
  if [[ "$v" =~ ^[0-9]{1,4}[/-][0-9]{1,2}[/-][0-9]{1,4}$ ]]; then
    printf 'DOB'
    return
  fi

  # HL7 component separator (caret) or a comma → likely a name
  if [[ "$v" == *"^"* ]] || [[ "$v" == *","* ]]; then
    printf 'NAME'
    return
  fi

  # Two or more purely alphabetic words → name.
  # The count is accumulated over every input line and printed once from END,
  # so a value with an embedded newline still yields a single integer instead
  # of one number per line (which would break the numeric test below).
  local word_count
  word_count=$(printf '%s\n' "$v" | awk '
    { for (i = 1; i <= NF; i++) if ($i ~ /^[A-Za-z]+$/) c++ }
    END { print c + 0 }
  ')
  if [ "${word_count:-0}" -ge 2 ]; then
    printf 'NAME'
    return
  fi

  printf 'MANUAL'
}
|
||||||
|
|
||||||
|
# normalize_value VALUE CATEGORY — print the canonical lookup key for VALUE.
#   NAME:  lowercase, treat every run of non-alphanumerics as a separator,
#          keep tokens that start with a letter, de-duplicate and sort them.
#          "SMITH^JOHN", "Smith, John", "John Smith", "JOHN SMITH" → "john smith"
#          Prints nothing (and still succeeds) when no token survives.
#   DOB:   YYYY-MM-DD when `date` can parse VALUE (GNU `date -d` first, then
#          BSD `date -j -f`); otherwise VALUE with all whitespace removed.
#   SSN:   VALUE with whitespace and dashes removed.
#   other: (MRN, MANUAL, ...) outer whitespace trimmed; the awk field re-join
#          also squeezes internal whitespace runs to single spaces.
normalize_value() {
  local value="$1"
  local category="$2"

  case "$category" in
    NAME)
      # `tr -cs '[:alnum:]' ' '` already turns ',', '^' and '/' into
      # separators, so no dedicated pass for them is needed.
      # grep exits 1 when nothing matches; that is a normal outcome here, so
      # map it to success to keep the pipeline from failing under pipefail.
      # Any other grep status (a real error) is still propagated.
      # sort runs in the C locale so the key does not depend on the caller's
      # collation settings.
      printf '%s' "$value" \
        | tr '[:upper:]' '[:lower:]' \
        | tr -cs '[:alnum:]' ' ' \
        | tr ' ' '\n' \
        | { grep -E '^[a-z][a-z0-9]*$' || [ "$?" -eq 1 ]; } \
        | LC_ALL=C sort -u \
        | tr '\n' ' ' \
        | sed 's/ *$//'
      ;;
    DOB)
      local parsed=""
      # Try GNU date first, then the two BSD input formats. The final
      # `|| parsed=""` leaves the chain with a zero status when every
      # attempt fails, so the raw-value fallback below is always reached.
      parsed=$(date -d "$value" +%Y-%m-%d 2>/dev/null) \
        || parsed=$(date -j -f "%m/%d/%Y" "$value" +%Y-%m-%d 2>/dev/null) \
        || parsed=$(date -j -f "%Y-%m-%d" "$value" +%Y-%m-%d 2>/dev/null) \
        || parsed=""
      if [ -n "$parsed" ]; then
        printf '%s' "$parsed"
      else
        printf '%s' "$value" | tr -d '[:space:]'
      fi
      ;;
    SSN)
      # Note: '-' must come at the END of the set so BSD tr doesn't treat
      # it as an option flag. Same caveat applies for any future tr -d call.
      printf '%s' "$value" | tr -d '[:space:]-'
      ;;
    *)
      # $1=$1 forces awk to rebuild the record from its fields, which drops
      # leading/trailing whitespace and joins fields with single spaces.
      printf '%s' "$value" | awk '{$1=$1; print}'
      ;;
  esac
}
|
||||||
|
|
||||||
# tokenize-value: take a single value (and optional category) and return the
|
# tokenize-value: take a single value (and optional category) and return the
|
||||||
# existing or newly-created token from the lookup table. Used by larry.sh to
|
# existing or newly-created token. Auto-detects category if not given. Uses
|
||||||
# preprocess {{phi:...}} placeholders BEFORE sending to the API.
|
# canonicalized form as the lookup key so name variants share one token.
|
||||||
# --category X force category; default = search all categories for a match,
|
# Table schema (v0.5.5+): token \t category \t canonical \t original
|
||||||
# fall back to "MANUAL".
|
# Legacy 3-col rows (token \t category \t value) still readable.
|
||||||
cmd_tokenize_value() {
|
cmd_tokenize_value() {
|
||||||
local table="$DEFAULT_TABLE"
|
local table="$DEFAULT_TABLE"
|
||||||
local category=""
|
local category=""
|
||||||
@ -152,26 +230,26 @@ cmd_tokenize_value() {
|
|||||||
|
|
||||||
init_table "$table"
|
init_table "$table"
|
||||||
|
|
||||||
# Auto-category: search existing table for an exact value match.
|
# Auto-detect category from value shape if not explicitly forced.
|
||||||
if [ -z "$category" ]; then
|
if [ -z "$category" ]; then
|
||||||
local existing
|
category=$(detect_category "$value")
|
||||||
existing=$(awk -F'\t' -v v="$value" 'NR>1 && $3==v { print $1; exit }' "$table")
|
|
||||||
if [ -n "$existing" ]; then
|
|
||||||
printf '%s\n' "$existing"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
category="MANUAL"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Look up (category, value) → existing token
|
# Canonicalize so different surface forms of the same PHI share one token.
|
||||||
|
local canonical
|
||||||
|
canonical=$(normalize_value "$value" "$category")
|
||||||
|
[ -z "$canonical" ] && canonical="$value"
|
||||||
|
|
||||||
|
# Look up (category, canonical). Column 3 in both new (4-col) and legacy
|
||||||
|
# (3-col) rows is the lookup key, so this works for both.
|
||||||
local tok
|
local tok
|
||||||
tok=$(awk -F'\t' -v cat="$category" -v v="$value" 'NR>1 && $2==cat && $3==v { print $1; exit }' "$table")
|
tok=$(awk -F'\t' -v cat="$category" -v c="$canonical" 'NR>1 && $2==cat && $3==c { print $1; exit }' "$table")
|
||||||
if [ -n "$tok" ]; then
|
if [ -n "$tok" ]; then
|
||||||
printf '%s\n' "$tok"
|
printf '%s\n' "$tok"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Create new
|
# Create new — write 4 columns.
|
||||||
local nextnum
|
local nextnum
|
||||||
nextnum=$(awk -F'\t' -v cat="$category" '
|
nextnum=$(awk -F'\t' -v cat="$category" '
|
||||||
NR>1 && $2==cat && match($1, /_[0-9]+\]\]$/) {
|
NR>1 && $2==cat && match($1, /_[0-9]+\]\]$/) {
|
||||||
@ -182,7 +260,7 @@ cmd_tokenize_value() {
|
|||||||
' "$table")
|
' "$table")
|
||||||
tok=$(printf '[[%s_%04d]]' "$category" "$nextnum")
|
tok=$(printf '[[%s_%04d]]' "$category" "$nextnum")
|
||||||
umask 077
|
umask 077
|
||||||
printf '%s\t%s\t%s\n' "$tok" "$category" "$value" >> "$table"
|
printf '%s\t%s\t%s\t%s\n' "$tok" "$category" "$canonical" "$value" >> "$table"
|
||||||
chmod 600 "$table" 2>/dev/null || true
|
chmod 600 "$table" 2>/dev/null || true
|
||||||
printf '%s\n' "$tok"
|
printf '%s\n' "$tok"
|
||||||
}
|
}
|
||||||
@ -202,7 +280,9 @@ cmd_detokenize_value() {
|
|||||||
[ -n "$token" ] || die "detokenize-value needs a TOKEN"
|
[ -n "$token" ] || die "detokenize-value needs a TOKEN"
|
||||||
[ -f "$table" ] || die "no lookup table at $table"
|
[ -f "$table" ] || die "no lookup table at $table"
|
||||||
local val
|
local val
|
||||||
val=$(awk -F'\t' -v t="$token" 'NR>1 && $1==t { print $3; exit }' "$table")
|
# Prefer column 4 (original surface form) for new rows; fall back to col 3
|
||||||
|
# for legacy 3-col rows where col 3 IS the original value.
|
||||||
|
val=$(awk -F'\t' -v t="$token" 'NR>1 && $1==t { print ($4 != "" ? $4 : $3); exit }' "$table")
|
||||||
if [ -z "$val" ]; then
|
if [ -z "$val" ]; then
|
||||||
printf 'no such token: %s\n' "$token" >&2
|
printf 'no such token: %s\n' "$token" >&2
|
||||||
return 2
|
return 2
|
||||||
@ -216,7 +296,7 @@ init_table() {
|
|||||||
chmod 700 "$(dirname "$table")" 2>/dev/null || true
|
chmod 700 "$(dirname "$table")" 2>/dev/null || true
|
||||||
if [ ! -f "$table" ]; then
|
if [ ! -f "$table" ]; then
|
||||||
umask 077
|
umask 077
|
||||||
printf 'token\tcategory\toriginal\n' > "$table"
|
printf 'token\tcategory\tcanonical\toriginal\n' > "$table"
|
||||||
chmod 600 "$table"
|
chmod 600 "$table"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user