diff --git a/VERSION b/VERSION index faef31a..39e898a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.7.0 +0.7.1 diff --git a/larry.sh b/larry.sh index b82c3ce..7a77e4d 100755 --- a/larry.sh +++ b/larry.sh @@ -47,7 +47,7 @@ set -o pipefail # ───────────────────────────────────────────────────────────────────────────── # Config # ───────────────────────────────────────────────────────────────────────────── -LARRY_VERSION="0.7.0" +LARRY_VERSION="0.7.1" LARRY_HOME="${LARRY_HOME:-$HOME/.larry}" LARRY_BASE_URL="${LARRY_BASE_URL:-https://raw.githubusercontent.com/bojj27/cloverleaf-larry/main}" LARRY_UPDATE_URL="${LARRY_UPDATE_URL:-${LARRY_BASE_URL}/larry.sh}" @@ -852,6 +852,369 @@ preprocess_phi_markers() { printf '%s' "$input" } +# ───────────────────────────────────────────────────────────────────────────── +# v0.7.1 — Automatic PHI detection +# +# Runs BEFORE preprocess_phi_markers (so explicit markers still take precedence) +# and BEFORE @file inline expansion has already been done (so file contents +# don't get token-walked here — they're tokenized by hl7_sanitize when needed). +# +# Strategy: walk every whitespace-delimited token and decide one of: +# * leave alone (path / URL / already-token / already-marker / timestamp) +# * tokenize via hl7-sanitize.sh tokenize-value (same pipeline as manual) +# +# Bryan's directive: err on the side of caution. We tokenize anything that +# *looks* like PHI as long as it doesn't interfere with required canonical +# matching. The same tokenize-value pipeline handles normalization, so +# different surface forms of the same value share one token across the session +# and across sanitized files. +# +# Modes (env LARRY_AUTO_PHI or /auto-phi slash): +# confirm (default) — prompt Y/n on first sighting of a name-like value +# aggressive — tokenize every match silently +# off — disable auto-detection entirely +# +# Per-turn override: prepend "!nophi " to skip auto-detection for that turn. +# ───────────────────────────────────────────────────────────────────────────── + +# Mode (default confirm). Promoted to AUTO_PHI_MODE so /auto-phi can mutate it. +AUTO_PHI_MODE="${LARRY_AUTO_PHI:-confirm}" + +# Per-session memory: declined values (user said "n" to confirm prompt) and +# accepted values (cached so we don't re-prompt). Keys are normalized canonical +# strings. For bash<4, fall back to two pipe-delimited strings. +if (( BASH_VERSINFO[0] >= 4 )); then + declare -A AUTO_PHI_ACCEPTED 2>/dev/null + declare -A AUTO_PHI_DECLINED 2>/dev/null +else + AUTO_PHI_ACCEPTED_LIST="" + AUTO_PHI_DECLINED_LIST="" +fi +AUTO_PHI_SESSION_COUNT=0 + +# Built-in allowlist of common non-PHI two-word phrases that match the loose +# "Title Case Title Case" name pattern. Lowercased + sorted on lookup. +# This is intentionally small — confirm-mode catches the rest interactively. +_AUTO_PHI_NAME_ALLOWLIST=$(cat <<'EOF' +home assistant +mac studio +mac mini +mac pro +mac book +apple watch +apple tv +new york +los angeles +san francisco +san diego +las vegas +united states +united kingdom +north america +south america +microsoft office +google cloud +amazon web +visual studio +sublime text +android studio +docker desktop +node red +linux mint +windows server +ubuntu server +debian linux +red hat +oracle linux +EOF +) + +_auto_phi_in_allowlist() { + local v_lower + v_lower=$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]') + grep -Fxq -- "$v_lower" <<< "$_AUTO_PHI_NAME_ALLOWLIST" +} + +_auto_phi_seen_accepted() { + local key="$1" + if (( BASH_VERSINFO[0] >= 4 )); then + [ -n "${AUTO_PHI_ACCEPTED[$key]:-}" ] + else + [[ "|$AUTO_PHI_ACCEPTED_LIST|" == *"|$key|"* ]] + fi +} +_auto_phi_seen_declined() { + local key="$1" + if (( BASH_VERSINFO[0] >= 4 )); then + [ -n "${AUTO_PHI_DECLINED[$key]:-}" ] + else + [[ "|$AUTO_PHI_DECLINED_LIST|" == *"|$key|"* ]] + fi +} +_auto_phi_mark_accepted() { + local key="$1" + if (( BASH_VERSINFO[0] >= 4 )); then + AUTO_PHI_ACCEPTED[$key]=1 + else + AUTO_PHI_ACCEPTED_LIST="${AUTO_PHI_ACCEPTED_LIST}|$key" + fi +} +_auto_phi_mark_declined() { + local key="$1" + if (( BASH_VERSINFO[0] >= 4 )); then + AUTO_PHI_DECLINED[$key]=1 + else + AUTO_PHI_DECLINED_LIST="${AUTO_PHI_DECLINED_LIST}|$key" + fi +} + +# _auto_phi_classify VALUE → echoes a category (EMAIL/SSN/PHONE/DOB/MRN/NAME/NAME_LOOSE) +# or empty string if the value is not a tokenization candidate. +_auto_phi_classify() { + local v="$1" + [ -z "$v" ] && return 0 + + # Already-token format: [[CATEGORY_NNNN]] — leave alone. + [[ "$v" =~ ^\[\[[A-Z][A-Z0-9_]*_[0-9]+\]\]$ ]] && return 0 + + # Already-marker formats: @@VALUE@@, @@VALUE, {{phi:...}} — manual handles. + [[ "$v" == @@* ]] && return 0 + [[ "$v" == *@@ ]] && return 0 + [[ "$v" == \{\{phi:* ]] && return 0 + + # Path-like — leave alone. + case "$v" in + /*|./*|../*|~/*) return 0 ;; + [A-Z]:\\*) return 0 ;; + esac + + # URL-like — leave alone. + case "$v" in + http://*|https://*|ssh://*|ftp://*|sftp://*|file://*|ws://*|wss://*) return 0 ;; + esac + + # Strip a single trailing punctuation that is sentence-grammar, not part + # of the value. Re-evaluate the cleaned form. + local trimmed="$v" + case "$trimmed" in + *[.,\;:\!\?\)]) trimmed="${trimmed%?}" ;; + esac + + # Email-like. Must have exactly one @, dotted domain. + if [[ "$trimmed" =~ ^[^@[:space:]]+@[^@[:space:]]+\.[^@[:space:]]+$ ]]; then + printf 'EMAIL'; return + fi + + # Long-digit timestamp guard FIRST (before phone/SSN/MRN checks). Pure + # digits 13+ chars OR 10 chars starting with '1' (epoch seconds / millis). + # These would otherwise match the bare-phone or MRN patterns. Leave alone. + if [[ "$trimmed" =~ ^[0-9]+$ ]]; then + local n="${#trimmed}" + if [ "$n" -ge 13 ]; then return 0; fi + if [ "$n" -eq 10 ] && [[ "$trimmed" == 1* ]]; then return 0; fi + fi + + # SSN-like. 9 digits with optional dashes (must total exactly 9 digits). + if [[ "$trimmed" =~ ^[0-9]{3}-?[0-9]{2}-?[0-9]{4}$ ]]; then + local d="${trimmed//-/}" + [ "${#d}" -eq 9 ] && { printf 'SSN'; return; } + fi + + # Phone-like. The regex needs to match the FULL token, including a "(212)" + # prefix when the next token is "555-1234". We can't see across token + # boundaries here, so we accept the most-common single-token forms: + # 555-123-4567 5551234567 555.123.4567 + # 555 123 4567 (212)555-1234 (212)5551234 + # Multi-token "(212) 555-1234" is reconstructed by the two-token-PHONE + # pass below in auto_detect_phi (caller side). + if [[ "$trimmed" =~ ^\(?[0-9]{3}\)?[-\.\ ]?[0-9]{3}[-\.\ ]?[0-9]{4}$ ]]; then + # Distinguish from pure-digit MRN: 10-digit all-numeric reaches here + # too. If trimmed is 10 pure digits starting with '1' we already + # returned above (timestamp). Otherwise treat as PHONE. + printf 'PHONE'; return + fi + + # DOB / date-like. + if [[ "$trimmed" =~ ^[0-9]{1,4}[/-][0-9]{1,2}[/-][0-9]{1,4}$ ]]; then + printf 'DOB'; return + fi + + # MRN-like: pure digits, 6-12 chars (conservative — see spec rule #9). + if [[ "$trimmed" =~ ^[0-9]+$ ]]; then + local n2="${#trimmed}" + if [ "$n2" -ge 6 ] && [ "$n2" -le 12 ]; then + printf 'MRN'; return + fi + return 0 + fi + + # Name-like (HL7 carat). + if [[ "$trimmed" =~ ^[A-Za-z]+\^[A-Za-z]+ ]]; then + printf 'NAME'; return + fi + + # The loose "Title Case Title Case" pattern is handled across two whitespace + # tokens at the caller level — not classified per-token here. + return 0 +} + +# auto_detect_phi INPUT — main entrypoint. Echoes the rewritten input. +# Per-turn override: input starting with "!nophi " causes the function to +# strip the prefix and return without scanning. +auto_detect_phi() { + local input="$1" + local sanitize_script="$LARRY_LIB_DIR/hl7-sanitize.sh" + [ -x "$sanitize_script" ] || { printf '%s' "$input"; return; } + + # Per-turn override. + if [[ "$input" == '!nophi '* ]]; then + printf '%s' "${input#!nophi }" + return 0 + fi + if [ "$AUTO_PHI_MODE" = "off" ]; then + printf '%s' "$input" + return 0 + fi + + # Build a list of replacements (orig\tcategory\token) so we don't mutate + # the string mid-scan (which would invalidate offsets). + local -a hits=() + + # Pass A: per-whitespace-token classification. + local IFS=$' \t\n' tok + local -a tokens + read -r -a tokens <<< "$input" + local t cat key strip_trailing + for t in "${tokens[@]}"; do + [ -z "$t" ] && continue + # Also split comma-delimited sub-tokens (e.g. "a@b.com,c@d.com"). + local sub + for sub in ${t//,/ }; do + [ -z "$sub" ] && continue + cat=$(_auto_phi_classify "$sub") + [ -z "$cat" ] && continue + # Strip trailing sentence-grammar punct for the actual replace string, + # but only one char to match classify's behaviour. + strip_trailing="$sub" + case "$strip_trailing" in + *[.,\;:\!\?\)]) strip_trailing="${strip_trailing%?}" ;; + esac + hits+=("$strip_trailing|$cat") + done + done + + # Pass B: loose "Title Case Title Case" two-word names. Detect using a + # tolerant regex over the prose; per Bryan's confirm-first default, every + # hit goes through confirm unless mode=aggressive. + local i name_pair + for ((i=0; i<${#tokens[@]}-1; i++)); do + local left="${tokens[$i]}" right="${tokens[$i+1]}" + # Strip one trailing punct from right for the test. + local right_clean="$right" + case "$right_clean" in + *[.,\;:\!\?\)]) right_clean="${right_clean%?}" ;; + esac + if [[ "$left" =~ ^[A-Z][a-z]+$ ]] && [[ "$right_clean" =~ ^[A-Z][a-z]+$ ]]; then + name_pair="$left $right_clean" + # Allowlist check (case-insensitive). + if _auto_phi_in_allowlist "$name_pair"; then + continue + fi + hits+=("$name_pair|NAME_LOOSE") + fi + done + + # Pass C: two-token phone "(212) 555-1234" or "(212) 5551234" etc. The + # single-token classifier can't see across whitespace. + local phone_pair + for ((i=0; i<${#tokens[@]}-1; i++)); do + local p_left="${tokens[$i]}" p_right="${tokens[$i+1]}" + # Strip one trailing punct from p_right. + case "$p_right" in + *[.,\;:\!\?\)]) p_right="${p_right%?}" ;; + esac + if [[ "$p_left" =~ ^\(?[0-9]{3}\)?$ ]] \ + && [[ "$p_right" =~ ^[0-9]{3}[-\.]?[0-9]{4}$ ]]; then + phone_pair="$p_left $p_right" + hits+=("$phone_pair|PHONE") + fi + done + + # No hits — fast path. + [ ${#hits[@]} -eq 0 ] && { printf '%s' "$input"; return 0; } + + # Dedupe hits while preserving order. + local -A seen_hits=() + local -a uhits=() + local h + for h in "${hits[@]}"; do + if [ -z "${seen_hits[$h]:-}" ]; then + seen_hits[$h]=1 + uhits+=("$h") + fi + done + + # Apply each hit: confirm where needed, then tokenize + substitute. + local summary="" + local mode="$AUTO_PHI_MODE" + for h in "${uhits[@]}"; do + local orig="${h%|*}" + local cat="${h##*|}" + local actual_cat="$cat" + [ "$cat" = "NAME_LOOSE" ] && actual_cat="NAME" + + # Use canonical normalize for memory key (so "John Smith" / "JOHN SMITH" + # share one decision). + local mem_key + mem_key=$("$sanitize_script" normalize-value "$orig" "$actual_cat" 2>/dev/null) || mem_key="$orig" + [ -z "$mem_key" ] && mem_key="$orig" + + # User previously declined this value this session. + if _auto_phi_seen_declined "$mem_key"; then continue; fi + + # Confirm-first prompting only for NAME_LOOSE (the high-FP-rate detector). + # Strict-format hits (EMAIL/SSN/PHONE/DOB/MRN/NAME-with-caret) are always + # tokenized. This matches Bryan's "err on the side of caution" while + # keeping confirms rare and high-signal. + if [ "$cat" = "NAME_LOOSE" ] && [ "$mode" = "confirm" ] \ + && ! _auto_phi_seen_accepted "$mem_key"; then + local ans + printf '%sphi auto>%s possible PHI detected: "%s". Tokenize? [Y/n] ' \ + "$C_YELLOW" "$C_RESET" "$orig" >&2 + IFS= read -r ans /dev/null || ans="" + case "$ans" in + n|N|no|NO|No) _auto_phi_mark_declined "$mem_key"; continue ;; + *) _auto_phi_mark_accepted "$mem_key" ;; + esac + fi + + # Tokenize. + local token + token=$("$sanitize_script" tokenize-value --category "$actual_cat" "$orig" 2>/dev/null) + [ -z "$token" ] && continue + + # Substitute. Use literal string replacement (all occurrences). + input="${input//"$orig"/"$token"}" + + # Build summary line. + if [ -z "$summary" ]; then + summary="${orig}→${token}" + else + summary="${summary}, ${orig}→${token}" + fi + AUTO_PHI_SESSION_COUNT=$((AUTO_PHI_SESSION_COUNT + 1)) + done + + if [ -n "$summary" ]; then + local count + count=$(awk -F', ' '{print NF}' <<< "$summary") + printf '%sphi auto>%s tokenized %d value(s): %s\n' \ + "$C_YELLOW" "$C_RESET" "$count" "$summary" >&2 + fi + + printf '%s' "$input" +} + tool_hl7_sanitize() { local input_path="$1" strict="${2:-0}" _lib_err_if_missing || return @@ -2431,6 +2794,23 @@ Slash commands: all collapse to the same token. Category is auto-detected from value shape (MRN/SSN/DOB/NAME/MANUAL). {{phi:VALUE}} / {{phi:CAT:VALUE}} legacy syntax (still works) + + Automatic PHI detection (v0.7.1): + Larry now scans every prompt for PHI-shaped values and tokenizes them + BEFORE sending to Anthropic. Detects emails, SSNs, phones, dates, + MRNs (6-12 pure digits), HL7 caret-names, "Last, First" names, and + title-case "John Smith" patterns. Paths, URLs, timestamps, and a small + allowlist (Home Assistant, Mac Studio, etc.) are skipped. + + Modes (env LARRY_AUTO_PHI or /auto-phi): + confirm default — prompts Y/n on loose name-like matches once per + session; explicit-format hits (email/SSN/phone/etc.) are + always tokenized + aggressive tokenize every match silently + off disable auto-detection entirely (manual markers still work) + + Per-turn override: prefix any prompt with "!nophi " to skip the scan + for that turn only. Manual @@VALUE / {{phi:VALUE}} markers always win. /redetect re-scan for HCIROOT/HCISITE/tools /sites list site dirs under HCIROOT /site switch HCISITE for this session @@ -2453,13 +2833,23 @@ Multi-line input: are not matched. Binary files and files >250 KB are skipped/truncated with a warning. TAB after @ autocompletes against files in cwd (fzf if installed). -Status line (v0.6.9): - A dim 1-line summary prints above each you[...] > prompt: +Status line (v0.6.9, repositioned v0.7.1): + A dim 1-line summary now prints BELOW each just-completed turn (after the + Larry response, before the next you[...]> prompt) so it stays adjacent + to the conversation flow: OAuth: ─ ctx 12% (24K/200K) ─ 5h 1.8% reset 19:45 ─ 7d 73.7% reset Mon Jun 2 ─ API key: ─ ctx 12% (24K/200K) ─ $0.213 session ─ 14 turns ─ Disable entirely with LARRY_NO_STATUS=1. Force re-display with /status. Suppressed automatically on the first turn (no data yet). +Memory upload at session close (v0.7.1): + When LARRY_MEMORY_UPLOAD_URL is set, on clean exit Larry POSTs three + artifacts to the configured endpoint: $LARRY_HOME/log/headers.log + (header-log), $LARRY_HOME/sessions/.log.md (session-log), and + .messages.json (session-messages). Each request carries + X-Larry-Source, X-Larry-Version, and X-Session-Id headers. + Unset = silent skip with a one-line warn at exit. + TAB completion (v0.6.6/v0.6.7/v0.7.0): Type '/' followed by any prefix and press TAB. /h → /help @@ -2614,6 +3004,8 @@ _LARRY_SLASH_CMDS_DESC=( [/hl7]=" print full field list for an HL7 segment (e.g. /hl7 PID)" [/hl7-fields]=" print component breakdown (e.g. /hl7-fields PID.5)" [/mouse]="on|off toggle xterm mouse mode for this session" + [/auto-phi]="on|off|aggressive|confirm — runtime control for v0.7.1 auto PHI detection" + [/auto-phi-status]="show current auto-PHI mode + session tokenization count" ) # __larry_complete_slash — bound to TAB via `bind -x` (see _install_readline_tab). @@ -3100,8 +3492,83 @@ _uninstall_mouse_mode() { printf '\033[?1006l\033[?1000l' 2>/dev/null || true _LARRY_MOUSE_ACTIVE=0 } -# Ensure mouse mode is disabled on REPL exit (Ctrl-C, /quit, EOF). Idempotent. -trap '_uninstall_mouse_mode' EXIT INT TERM +# Ensure mouse mode is disabled on REPL exit (Ctrl-C, /quit, EOF). The trap +# itself is registered AFTER the v0.7.1 upload function below, so we can +# chain mouse-mode teardown after the memory upload in a single trap. + +# ───────────────────────────────────────────────────────────────────────────── +# v0.7.1 — session-artifact upload at session close. +# +# When LARRY_MEMORY_UPLOAD_URL is set, on clean exit we POST the headers.log, +# the session log.md, and the messages.json file to the configured endpoint. +# Each artifact goes as its own request with distinguishing headers so the +# ingest side can route appropriately. +# +# Bryan's memory pipeline (fswatch + ingest daemon) only sees files on his +# Mac; the WORK BOX (MobaXterm/Cygwin) where larry.sh runs is isolated, so +# we upload over the existing tailscale/network path. +# +# Safety: +# - headers.log filters to ^anthropic-* / ^retry-after: response headers +# only — request auth headers (Authorization / x-api-key) are NEVER +# captured into the log at write time (see _parse_response_headers). +# - session log.md contains conversation content. By design Bryan uses +# PHI markers / auto-PHI, so PHI is already tokenized before reaching +# the log. Auth tokens never enter the conversation stream. +# - messages.json contains the same token-substituted conversation +# content as the log. +# +# Set LARRY_MEMORY_UPLOAD_URL= (e.g. on proxy.bjnoela.com) to +# enable. Unset = silent skip with a one-line warn at session close. +# ───────────────────────────────────────────────────────────────────────────── +_LARRY_UPLOAD_FIRED=0 +upload_session_artifacts() { + # Run once per session (in case both clean exit and EXIT trap fire). + [ "$_LARRY_UPLOAD_FIRED" = "1" ] && return 0 + _LARRY_UPLOAD_FIRED=1 + + local url="${LARRY_MEMORY_UPLOAD_URL:-}" + if [ -z "$url" ]; then + warn "(memory upload skipped: LARRY_MEMORY_UPLOAD_URL not configured)" + return 0 + fi + command -v curl >/dev/null 2>&1 || { warn "(memory upload skipped: curl missing)"; return 0; } + + local artifacts=( + "$LARRY_HOME/log/headers.log|headers-log|text/plain" + "$LOG_FILE|session-log|text/markdown" + "$MESSAGES_FILE|session-messages|application/json" + ) + local entry path kind ctype http_code uploaded=0 + for entry in "${artifacts[@]}"; do + path="${entry%%|*}" + kind="${entry#*|}"; kind="${kind%%|*}" + ctype="${entry##*|}" + [ -f "$path" ] || continue + [ -s "$path" ] || continue + http_code=$(curl -fsS --max-time 15 \ + -o /dev/null -w '%{http_code}' \ + -X POST "$url" \ + -H "Content-Type: $ctype" \ + -H "X-Larry-Source: $kind" \ + -H "X-Larry-Version: $LARRY_VERSION" \ + -H "X-Session-Id: $SESSION_ID" \ + --data-binary "@$path" 2>/dev/null) || http_code="000" + if [ "$http_code" = "200" ] || [ "$http_code" = "201" ] || [ "$http_code" = "202" ] || [ "$http_code" = "204" ]; then + uploaded=$((uploaded + 1)) + else + warn "(memory upload: $kind → HTTP $http_code)" + fi + done + if [ "$uploaded" -gt 0 ]; then + larry_say "memory upload: posted $uploaded artifact(s) to $url" + fi +} + +# Fire upload on EXIT trap too (covers Ctrl-C / EOF / kill). The function +# is idempotent (_LARRY_UPLOAD_FIRED guard) so the clean-exit call from +# main_loop won't double-post. +trap 'upload_session_artifacts || true; _uninstall_mouse_mode' EXIT INT TERM read_user_input() { # Returns user input via global LARRY_INPUT. @@ -3242,10 +3709,14 @@ main_loop() { while true; do local _short; _short=$(model_short_name) - # v0.6.9: persistent status line above the prompt. - # Only on the FIRST line of input — heredoc continuation reads in - # read_user_input do not invoke this loop iteration. - render_status_line + # v0.7.1: status line is rendered AFTER the previous agent_turn (see end + # of loop), so it sits BELOW the just-completed prompt cycle / agent + # response and ABOVE the next prompt. Net visual effect: status reads as + # a footer to the most-recent turn. This is "Option B" from the v0.7.1 + # spec — chosen over cursor-manipulation Option A because `read -e` + # (readline) takes exclusive control of the cursor and inserting a + # repositioned footer below an active prompt is fragile on MobaXterm / + # Cygwin (readline redisplay clobbers manual cursor moves). printf '%syou[%s]>%s ' "$C_GREEN" "$_short" "$C_RESET" if ! read_user_input; then echo ""; break @@ -3387,6 +3858,33 @@ main_loop() { ;; esac continue ;; + # v0.7.1: auto-PHI runtime control. + /auto-phi|/auto-phi\ *) + local _arg; _arg=$(_slash_args "/auto-phi" "$input") + case "${_arg:-status}" in + on|confirm) + AUTO_PHI_MODE="confirm" + larry_say "auto-phi: confirm (default — prompts on loose name-like matches)" + ;; + aggressive) + AUTO_PHI_MODE="aggressive" + larry_say "auto-phi: aggressive (tokenizes all candidates silently)" + ;; + off) + AUTO_PHI_MODE="off" + larry_say "auto-phi: off (explicit markers @@VALUE / {{phi:VALUE}} still work)" + ;; + status) + larry_say "auto-phi mode: $AUTO_PHI_MODE (tokenized this session: $AUTO_PHI_SESSION_COUNT)" + ;; + *) + err "usage: /auto-phi on|off|aggressive|confirm (no arg → status)" + ;; + esac + continue ;; + /auto-phi-status) + larry_say "auto-phi mode: $AUTO_PHI_MODE (tokenized this session: $AUTO_PHI_SESSION_COUNT)" + continue ;; /show-last-tool) if [ -z "$_LARRY_LAST_TOOL_NAME" ]; then err "no tool calls yet this session" @@ -3608,6 +4106,11 @@ EOF ;; esac + # v0.7.1: auto-PHI detection runs BEFORE explicit markers, but the function + # itself defers to existing markers (it leaves anything inside @@...@@ or + # {{phi:...}} alone). Manual markers still win. + input=$(auto_detect_phi "$input") + # PHI preprocessing: replace any {{phi:VALUE}} markers with local tokens # BEFORE the input enters conversation history and gets sent to Anthropic. if [[ "$input" == *"{{phi:"* ]] || [[ "$input" == *"@@"* ]]; then @@ -3618,10 +4121,14 @@ EOF add_user_text "$input" agent_turn "$system_prompt" || warn "turn ended with error" echo "" + # v0.7.1: status line below the just-completed prompt cycle. Lives between + # turns, immediately above the next prompt. /status forces a re-render. + render_status_line done log_section "session-end" log_append "- end: $(date -Iseconds 2>/dev/null || date)" + upload_session_artifacts || true larry_say "session log: $LOG_FILE" } diff --git a/lib/hl7-sanitize.sh b/lib/hl7-sanitize.sh index 347b095..a10a4c4 100755 --- a/lib/hl7-sanitize.sh +++ b/lib/hl7-sanitize.sh @@ -202,6 +202,17 @@ normalize_value() { # it as an option flag. Same caveat applies for any future tr -d call. printf '%s' "$value" | tr -d '[:space:]-' ;; + PHONE) + # Strip all non-digits so "(555) 123-4567" and "5551234567" share one + # token. Keep digits only. + printf '%s' "$value" | tr -cd '[:digit:]' + ;; + EMAIL) + # Lowercase + trim. Emails are case-insensitive in the local part per + # most providers' practice (RFC technically allows local-part case + # sensitivity, but tokenizing as one value is fine for PHI). + printf '%s' "$value" | tr '[:upper:]' '[:lower:]' | awk '{$1=$1; print}' + ;; *) printf '%s' "$value" | awk '{$1=$1; print}' ;; @@ -437,6 +448,17 @@ case "$SUB" in count) shift; cmd_count "$@" ;; tokenize-value) shift; cmd_tokenize_value "$@" ;; detokenize-value) shift; cmd_detokenize_value "$@" ;; + normalize-value) + # normalize-value VALUE [CATEGORY] — emit canonical form without + # tokenizing or touching the table. Used by larry.sh's auto-PHI to + # build per-session memory keys. + shift + nv_val="${1:-}"; nv_cat="${2:-}" + [ -n "$nv_val" ] || die "normalize-value needs a VALUE" + [ -n "$nv_cat" ] || nv_cat=$(detect_category "$nv_val") + normalize_value "$nv_val" "$nv_cat" + printf '\n' + ;; -h|--help) sed -n '2,30p' "$NC_SELF"; exit 0 ;; *) # Default = sanitize mode