v0.5.5: @@VALUE inline PHI syntax + name canonicalization

Bryan asked for an easier-to-remember inline PHI marker than {{phi:VALUE}} and for name forms like SMITH^JOHN / Smith, John / John Smith / JOHN SMITH to all collapse to the same hash. Both shipped.

INLINE SYNTAX (in addition to the legacy {{phi:VALUE}}, which still works):
  @@VALUE      unbracketed — VALUE has no whitespace
               e.g. @@12345  @@SMITH^JOHN  @@V789
  @@VALUE@@    bracketed — VALUE may contain spaces
               e.g. @@John Smith@@  @@Smith, John@@

Parser is 2-pass to disambiguate mixed forms in the same prompt: bracketed markers are matched first (via grep -oE with a regex that excludes leading/trailing whitespace inside the brackets), then the unbracketed pass scans the remaining text. Verified against:
  "look for @@12345 in PID.3 for @@John Smith@@ DOB @@01/15/1985 ..."
extracts 4 markers correctly and routes each to its category.

AUTO-CATEGORY DETECTION (lib/hl7-sanitize.sh: detect_category):
  pure digits 4-15      → MRN
  9 digits with dashes  → SSN
  date-shaped           → DOB
  caret or comma        → NAME
  2+ alpha tokens       → NAME
  else                  → MANUAL

CANONICALIZATION (lib/hl7-sanitize.sh: normalize_value):
  NAME: lowercase, replace ',^/' with spaces, sort unique alpha tokens
        SMITH^JOHN, Smith John, John Smith, JOHN SMITH → "john smith"
  DOB: parse to YYYY-MM-DD (GNU date or BSD date fallback)
  SSN: strip dashes/whitespace
  MRN/MANUAL: trim outer whitespace only

TABLE SCHEMA bumped to 4 columns (token / category / canonical / original). Legacy 3-column rows still read fine — lookups key on column 3, which is "canonical" in new rows and "value" in legacy rows (mismatches just create a new token, no corruption). Detokenize prefers column 4 and falls back to column 3 for legacy compat.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 10:11:18 -07:00 · 2026-05-27 10:11:18 -07:00 · c2bba7be90
commit c2bba7be90
parent af3f034337
3 changed files with 156 additions and 41 deletions
--- a/2
+++ b/2
@ -1 +1 @@
-0.5.4
+0.5.5
--- a/larry.sh
+++ b/larry.sh
@ -36,7 +36,7 @@ set -o pipefail
 # ─────────────────────────────────────────────────────────────────────────────
 # Config
 # ─────────────────────────────────────────────────────────────────────────────
-LARRY_VERSION="0.5.4"
+LARRY_VERSION="0.5.5"
 LARRY_HOME="${LARRY_HOME:-$HOME/.larry}"
 LARRY_BASE_URL="${LARRY_BASE_URL:-https://raw.githubusercontent.com/bojj27/cloverleaf-larry/main}"
 LARRY_UPDATE_URL="${LARRY_UPDATE_URL:-${LARRY_BASE_URL}/larry.sh}"
@ -740,23 +740,15 @@ preprocess_phi_markers() {
  local sanitize_script="$LARRY_LIB_DIR/hl7-sanitize.sh"
  [ -x "$sanitize_script" ] || { printf '%s' "$input"; return; }

-  # Use grep -oE to extract markers reliably across bash versions.
-  local markers
-  markers=$(printf '%s' "$input" | grep -oE '\{\{phi:[^{}]+\}\}' 2>/dev/null | sort -u)
-  [ -z "$markers" ] && { printf '%s' "$input"; return; }
+  # Three forms supported (processed in this order to avoid ambiguity):
+  #   1. @@VALUE@@   bracketed; VALUE has no '@' and uses single-space word
+  #                  separation. Use for values WITH spaces. Auto-detect category.
+  #   2. @@VALUE     unbracketed; VALUE has no whitespace or '@'. Auto-detect.
+  #   3. {{phi:V}} / {{phi:CAT:V}}   legacy, still supported.

-  while IFS= read -r marker; do
-    [ -z "$marker" ] && continue
-    # Strip {{phi: prefix and }} suffix
-    local body="${marker#\{\{phi:}"
-    body="${body%\}\}}"
-    local category="" value=""
-    if [[ "$body" == *:* ]] && [[ "${body%%:*}" =~ ^[A-Z][A-Z0-9_]+$ ]]; then
-      category="${body%%:*}"
-      value="${body#*:}"
-    else
-      value="$body"
-    fi
+  # Helper: tokenize one VALUE (optional category) and substitute MARKER → token.
+  _phi_sub() {
+    local marker="$1" value="$2" category="${3:-}"
    local args=(tokenize-value)
    [ -n "$category" ] && args+=(--category "$category")
    args+=("$value")
@ -764,7 +756,45 @@ preprocess_phi_markers() {
    [ -z "$token" ] && token="[[PHI_ERROR]]"
    input="${input//"$marker"/"$token"}"
    printf '%sphi>%s %s → %s\n' "$C_YELLOW" "$C_RESET" "$marker" "$token" >&2
-  done <<< "$markers"
+  }
+
+  # Pass 1: bracketed @@VALUE@@ — value has no '@', allows internal single spaces
+  # but no leading/trailing whitespace inside the brackets.
+  local bracketed
+  bracketed=$(printf '%s' "$input" | grep -oE '@@[^@[:space:]]+([ \t]+[^@[:space:]]+)*@@' 2>/dev/null | sort -u)
+  while IFS= read -r marker; do
+    [ -z "$marker" ] && continue
+    local val="${marker#@@}"; val="${val%@@}"
+    _phi_sub "$marker" "$val"
+  done <<< "$bracketed"
+
+  # Pass 2: unbracketed @@VALUE — value has no whitespace or '@'. Anything that
+  # was inside a bracketed marker has already been replaced with a [[TOK]], so
+  # it won't be re-matched here.
+  local unbracketed
+  unbracketed=$(printf '%s' "$input" | grep -oE '@@[^@[:space:]]+' 2>/dev/null | sort -u)
+  while IFS= read -r marker; do
+    [ -z "$marker" ] && continue
+    local val="${marker#@@}"
+    _phi_sub "$marker" "$val"
+  done <<< "$unbracketed"
+
+  # Pass 3: legacy {{phi:VALUE}} / {{phi:CATEGORY:VALUE}}.
+  local legacy
+  legacy=$(printf '%s' "$input" | grep -oE '\{\{phi:[^{}]+\}\}' 2>/dev/null | sort -u)
+  while IFS= read -r marker; do
+    [ -z "$marker" ] && continue
+    local body="${marker#\{\{phi:}"; body="${body%\}\}}"
+    local cat="" val=""
+    if [[ "$body" == *:* ]] && [[ "${body%%:*}" =~ ^[A-Z][A-Z0-9_]+$ ]]; then
+      cat="${body%%:*}"; val="${body#*:}"
+    else
+      val="$body"
+    fi
+    _phi_sub "$marker" "$val" "$cat"
+  done <<< "$legacy"
+
+  unset -f _phi_sub
  printf '%s' "$input"
 }

@ -1067,9 +1097,14 @@ Slash commands:
  /tokens            show the full local PHI ↔ token lookup table

  PHI inline syntax in any prompt:
-    {{phi:VALUE}}              tokenize before send; auto-detects category
-    {{phi:MRN:12345}}          explicit category=MRN (matches sanitized data)
-    {{phi:NAME:JOHN SMITH}}    explicit category=NAME
+    @@VALUE                    EASY: wrap PHI in @@. Spaceless = no end delim.
+                                 e.g.  @@12345    @@SMITH^JOHN    @@V789
+    @@VALUE@@                  Use when VALUE has spaces.
+                                 e.g.  @@John Smith@@   @@Smith, John@@
+    Name canonicalization: SMITH^JOHN, Smith, John, John Smith, JOHN SMITH
+                                 all collapse to the same token.
+    Category is auto-detected from value shape (MRN/SSN/DOB/NAME/MANUAL).
+    {{phi:VALUE}} / {{phi:CAT:VALUE}}  legacy syntax (still works)
  /redetect          re-scan for HCIROOT/HCISITE/tools
  /sites             list site dirs under HCIROOT
  /site <name>       switch HCISITE for this session
@ -1192,7 +1227,7 @@ main_loop() {

    # PHI preprocessing: replace any {{phi:VALUE}} markers with local tokens
    # BEFORE the input enters conversation history and gets sent to Anthropic.
-    if [[ "$input" == *"{{phi:"* ]]; then
+    if [[ "$input" == *"{{phi:"* ]] || [[ "$input" == *"@@"* ]]; then
      input=$(preprocess_phi_markers "$input")
    fi

--- a/lib/hl7-sanitize.sh
+++ b/lib/hl7-sanitize.sh
@ -130,11 +130,89 @@ cmd_count() {
  echo $(($(wc -l < "$table") - 1))
 }

+# ─────────────────────────────────────────────────────────────────────────────
+# Auto-category detection + value canonicalization.
+# Canonicalization makes "SMITH^JOHN", "Smith, John", "John Smith", and
+# "JOHN SMITH" all collapse to the same lookup key so they share one token.
+# ─────────────────────────────────────────────────────────────────────────────
+
+# detect_category VALUE — classify a raw PHI value by its surface shape.
+# Prints exactly one of MRN | SSN | DOB | NAME | MANUAL (no trailing newline).
+# Rules are tried top to bottom and the first hit wins, so an undashed
+# 9-digit value is reported as MRN; only dashed forms reach the SSN rule.
+detect_category() {
+  local candidate="$1" lead trail alpha_words
+  local kind='MANUAL'
+
+  # Strip leading, then trailing, whitespace using parameter expansion only.
+  lead="${candidate%%[![:space:]]*}"
+  candidate="${candidate#"$lead"}"
+  trail="${candidate##*[![:space:]]}"
+  candidate="${candidate%"$trail"}"
+
+  if [[ "$candidate" =~ ^[0-9]{4,15}$ ]]; then
+    kind='MRN'    # all digits, 4-15 of them
+  elif [[ "$candidate" =~ ^[0-9]{3}-?[0-9]{2}-?[0-9]{4}$ ]]; then
+    kind='SSN'    # 3-2-4 digit groups
+  elif [[ "$candidate" =~ ^[0-9]{1,4}[/-][0-9]{1,2}[/-][0-9]{1,4}$ ]]; then
+    kind='DOB'    # three numeric groups joined by '/' or '-'
+  elif [[ "$candidate" == *'^'* || "$candidate" == *','* ]]; then
+    kind='NAME'   # HL7 caret separator or "Last, First" comma
+  else
+    # Count whitespace-separated words made only of letters; 2+ means a name.
+    alpha_words=$(awk '{ n = 0; for (i = 1; i <= NF; i++) if ($i ~ /^[A-Za-z]+$/) n++; print n }' <<< "$candidate")
+    if [ "${alpha_words:-0}" -ge 2 ]; then
+      kind='NAME'
+    fi
+  fi
+
+  printf '%s' "$kind"
+}
+
+# normalize_value VALUE CATEGORY — print the canonical lookup key for VALUE.
+#   NAME        lowercase, split on non-alphanumerics, keep tokens that start
+#               with a letter, sort -u: "SMITH^JOHN", "Smith, John" → "john smith"
+#   DOB         YYYY-MM-DD when date(1) can parse it (GNU -d, then BSD -j -f);
+#               otherwise VALUE with all whitespace removed
+#   SSN         whitespace and dashes removed
+#   MRN/MANUAL  outer whitespace trimmed, inner whitespace runs collapsed
+# Output may be empty (e.g. NAME with no alphabetic token); that is not an error.
+normalize_value() {
+  local value="$1" category="$2" parsed=""
+  case "$category" in
+    NAME)
+      # `tr -cs` already maps ',', '^' and '/' to spaces, so no separate pass.
+      # grep exits 1 on "no tokens"; `|| true` keeps that from failing the
+      # pipeline when the caller runs under `set -o pipefail`.
+      printf '%s' "$value" \
+        | tr '[:upper:]' '[:lower:]' \
+        | tr -cs '[:alnum:]' ' ' \
+        | tr ' ' '\n' \
+        | { grep -E '^[a-z][a-z0-9]*$' || true; } \
+        | sort -u \
+        | tr '\n' ' ' \
+        | sed 's/ *$//'
+      ;;
+    DOB)
+      # Try GNU date, then two BSD -f layouts; end with an empty result
+      # instead of a non-zero status so the raw-value fallback always runs.
+      parsed=$(date -d "$value" +%Y-%m-%d 2>/dev/null) \
+        || parsed=$(date -j -f "%m/%d/%Y" "$value" +%Y-%m-%d 2>/dev/null) \
+        || parsed=$(date -j -f "%Y-%m-%d" "$value" +%Y-%m-%d 2>/dev/null) \
+        || parsed=""
+      if [ -n "$parsed" ]; then printf '%s' "$parsed"; else printf '%s' "$value" | tr -d '[:space:]'; fi
+      ;;
+    # '-' stays LAST in the tr set so BSD tr cannot read it as an option.
+    SSN) printf '%s' "$value" | tr -d '[:space:]-' ;;
+    *)   printf '%s' "$value" | awk '{$1=$1; print}' ;;
+  esac
+}
+
 # tokenize-value: take a single value (and optional category) and return the
-# existing or newly-created token from the lookup table. Used by larry.sh to
-# preprocess {{phi:...}} placeholders BEFORE sending to the API.
-#   --category X   force category; default = search all categories for a match,
-#                  fall back to "MANUAL".
+# existing or newly-created token. Auto-detects category if not given. Uses
+# canonicalized form as the lookup key so name variants share one token.
+# Table schema (v0.5.5+): token \t category \t canonical \t original
+# Legacy 3-col rows (token \t category \t value) still readable.
 cmd_tokenize_value() {
  local table="$DEFAULT_TABLE"
  local category=""
@ -152,26 +230,26 @@ cmd_tokenize_value() {

  init_table "$table"

-  # Auto-category: search existing table for an exact value match.
+  # Auto-detect category from value shape if not explicitly forced.
  if [ -z "$category" ]; then
-    local existing
-    existing=$(awk -F'\t' -v v="$value" 'NR>1 && $3==v { print $1; exit }' "$table")
-    if [ -n "$existing" ]; then
-      printf '%s\n' "$existing"
-      return 0
-    fi
-    category="MANUAL"
+    category=$(detect_category "$value")
  fi

-  # Look up (category, value) → existing token
+  # Canonicalize so different surface forms of the same PHI share one token.
+  local canonical
+  canonical=$(normalize_value "$value" "$category")
+  [ -z "$canonical" ] && canonical="$value"
+
+  # Look up (category, canonical). Column 3 in both new (4-col) and legacy
+  # (3-col) rows is the lookup key, so this works for both.
  local tok
-  tok=$(awk -F'\t' -v cat="$category" -v v="$value" 'NR>1 && $2==cat && $3==v { print $1; exit }' "$table")
+  tok=$(awk -F'\t' -v cat="$category" -v c="$canonical" 'NR>1 && $2==cat && $3==c { print $1; exit }' "$table")
  if [ -n "$tok" ]; then
    printf '%s\n' "$tok"
    return 0
  fi

-  # Create new
+  # Create new — write 4 columns.
  local nextnum
  nextnum=$(awk -F'\t' -v cat="$category" '
    NR>1 && $2==cat && match($1, /_[0-9]+\]\]$/) {
@ -182,7 +260,7 @@ cmd_tokenize_value() {
  ' "$table")
  tok=$(printf '[[%s_%04d]]' "$category" "$nextnum")
  umask 077
-  printf '%s\t%s\t%s\n' "$tok" "$category" "$value" >> "$table"
+  printf '%s\t%s\t%s\t%s\n' "$tok" "$category" "$canonical" "$value" >> "$table"
  chmod 600 "$table" 2>/dev/null || true
  printf '%s\n' "$tok"
 }
@ -202,7 +280,9 @@ cmd_detokenize_value() {
  [ -n "$token" ] || die "detokenize-value needs a TOKEN"
  [ -f "$table" ] || die "no lookup table at $table"
  local val
-  val=$(awk -F'\t' -v t="$token" 'NR>1 && $1==t { print $3; exit }' "$table")
+  # Prefer column 4 (original surface form) for new rows; fall back to col 3
+  # for legacy 3-col rows where col 3 IS the original value.
+  val=$(awk -F'\t' -v t="$token" 'NR>1 && $1==t { print ($4 != "" ? $4 : $3); exit }' "$table")
  if [ -z "$val" ]; then
    printf 'no such token: %s\n' "$token" >&2
    return 2
@ -216,7 +296,7 @@ init_table() {
  chmod 700 "$(dirname "$table")" 2>/dev/null || true
  if [ ! -f "$table" ]; then
    umask 077
-    printf 'token\tcategory\toriginal\n' > "$table"
+    printf 'token\tcategory\tcanonical\toriginal\n' > "$table"
    chmod 600 "$table"
  fi
 }
 @ -1 +1 @@
 .5.4
 .5.5