From 3208d5033752d48d289b4b9b5228bc573ac6923e Mon Sep 17 00:00:00 2001 From: Bryan Johnson Date: Thu, 28 May 2026 00:12:49 -0700 Subject: [PATCH] v0.8.12: post-response arithmetic crash fix + streaming speedup + prompt-caching + PHI notice default-silent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crash/slowness/cost pass on the API-key rail (Vera-QA-passed). Full diagnosis: Deliverables/2026-05-27-cloverleaf-larry-v0812-crash-slowness-cost.md. - Crash fix: CR-coerce at 3 response-path accounting sites (non-streaming cost, streaming cost, _record_ctx_used) via coerce_int promoted to lib/cygwin-safe.sh — guards CRLF-tainted jq usage counts on Cygwin/MobaXterm (v0.7.5 Anomaly-#4 recurrence on the response path). - Slowness: collapsed per-delta jq forks in the SSE hot path (3 -> 2 forks/event on text, 1 on ignored deltas) — dominant fix for the laggy per-turn render on Windows fork emulation. Plus per-session PHI sidecar /health probe caching. - Cost: prompt caching wired in — system sent as block array + last tool marked cache_control: ephemeral, billing the ~12.7K static prefix at the cache-read rate after turn 1 (~90% prefix cut, lower TTFB). - PHI tier-5 notice now default-silent (LARRY_PHI_NOTICE=1 to re-enable). Co-Authored-By: Clover (Claude Opus 4.7) --- .gitignore | 1 + CHANGELOG.md | 35 +++++++++++ MANIFEST | 6 +- VERSION | 2 +- larry.sh | 174 +++++++++++++++++++++++++++++++++++++++++---------- 5 files changed, 182 insertions(+), 36 deletions(-) diff --git a/.gitignore b/.gitignore index 37b3f73..20baa2d 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ bin/jq.exe *.larry-prerollback.* *.larry-tmp.* .phi-notice-shown +.phi-avail-* .b64-py3-notice-shown .last-stream-headers diff --git a/CHANGELOG.md b/CHANGELOG.md index b1df7da..b1afe34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,41 @@ All notable changes to `cloverleaf-larry` / `larry-anywhere` are recorded here. Versioning is loose-semver; bumps trigger the in-process self-update on every running client via `LARRY_BASE_URL` + `MANIFEST`. +## v0.8.12 — 2026-05-27 + +Crash fix + slowness + cost pass on the new API-key rail (Clover). Full +diagnosis: `Deliverables/2026-05-27-cloverleaf-larry-v0812-crash-slowness-cost.md`. + +1. **Fix the post-response arithmetic crash** (`bash: : syntax error: invalid + arithmetic operator (error token is "")`). The token/cost accounting fed + jq-extracted usage counts straight into `$(( ))`; on Cygwin/MobaXterm a + CRLF-translated response body makes `jq -r` emit `1234\r`, and `// 0` only + guards JSON null, not the CR. `coerce_int` now defends all three accounting + sites (non-streaming cost block, streaming cost block, `_record_ctx_used`). + This is the v0.7.5 OAuth crash's Anomaly-#4 recurrence on the response path. +2. **Prompt caching wired into the request** (cost). The ~12.7K-token static + prefix (system ~6K + 35 tools ~6.7K) was re-sent uncached every turn. v0.8.12 + sends `system` as a block array and marks the last tool with + `cache_control: ephemeral` (apikey rail; `LARRY_PROMPT_CACHE=0` to disable), + so the prefix is billed at the 0.1x cache-read rate after turn 1 — a ~90% cut + on the static prefix and a large TTFB reduction. +3. **Streaming parser per-delta jq forks collapsed** (slowness, dominant fix). + The SSE `content_block_delta` hot path spawned 3 `jq` processes per event + (`.index`, `.delta.type`, then the text/json payload). A normal response + ships dozens-to-hundreds of these, and on Cygwin/MobaXterm a process fork is + ~50-100ms (Windows fork emulation) — so the per-turn render lagged multiple + seconds ("veeery slow"). The routing fields are now pulled in ONE jq call + (payload `@json`-encoded so embedded newlines/tabs survive the line read), + cutting the text path to 2 forks/event and ignored deltas to 1. Verified + round-trip on text with embedded NL/TAB/quote and on `input_json_delta`. +4. **Per-turn PHI sidecar `/health` probe cached for the session** (slowness). + `phi_client_available` ran `curl -m1` every turn; on MobaXterm the sidecar can + never run, so it was a guaranteed curl fork + up-to-1s wait per turn. Now + probed once per session (file-flag, survives the `$(...)` subshell); + `LARRY_PHI_REPROBE=1` forces a fresh probe. + +--- + ## v0.8.11 — 2026-05-27 Two headline features land together (Clover): diff --git a/MANIFEST b/MANIFEST index 3f26243..87ad8dd 100644 --- a/MANIFEST +++ b/MANIFEST @@ -23,16 +23,16 @@ # scripts/make-manifest.sh and bump VERSION. # Top-level scripts -larry.sh 3ebb6334b6411edd9bb58f4781f8a2db4e5cc470ad4df1d4496dce32084dcf94 +larry.sh 668a0951759c0ee67841f64d8d5f55f25a32fd908da46bc7733814123f7b0d68 larry-tunnel.sh 6b050e4eeab15669f4858eaf3b807f168f211ced07815db9521bc40a093f6aaa larry-auth.sh a220cdf7878569dc3028951ee57fc8d5e706a8ca5c6aa45347b58facb386f831 larry-rollback.sh 91b5e9aa6c79266bf306dcfba4ca791c07971bd6924d67a779037531648aa6d0 install-larry.sh e97da4e12a0d8863ca18d79b12f6c4294c72fa6d4b11dffeab66504236bb4eb1 # Metadata -VERSION f9769094d309a393d86c25196f58f5e47ba88ce3b0e8921458610e7120ed992e +VERSION 48732179dae488fe78e7210c84f94a44cc6913283f845d7d978302f20750f8f8 MANUAL.md 755d98b802cb16a5d2d207d423b12c6ca632f118ee372cb5093fe2320a6515ce -CHANGELOG.md eb66afac74c1992ff0812e755c921d425d842bd0b0616ff8eaa293186cadf224 +CHANGELOG.md 78608882c507e1f8edd650c577eb17913e719f4836e6331e6885aeee89da9306 # Agent personas (system-prompt overlays) agents/larry.md ace30b97a166c9f244df66ac5f5944e9251dda375a45340d443bccb34bc5ec94 diff --git a/VERSION b/VERSION index 83ce05d..7eff8ab 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.8.11 +0.8.12 diff --git a/larry.sh b/larry.sh index 874fc95..31b5131 100755 --- a/larry.sh +++ b/larry.sh @@ -72,7 +72,7 @@ set -o pipefail # ───────────────────────────────────────────────────────────────────────────── # Config # ───────────────────────────────────────────────────────────────────────────── -LARRY_VERSION="0.8.11" +LARRY_VERSION="0.8.12" LARRY_HOME="${LARRY_HOME:-$HOME/.larry}" # ───────────────────────────────────────────────────────────────────────────── @@ -2348,7 +2348,31 @@ auto_detect_phi() { # sourcing each turn lets users update the client without restart. # shellcheck source=lib/phi-client.sh . "$LARRY_LIB_DIR/phi-client.sh" 2>/dev/null - if declare -F phi_client_available >/dev/null 2>&1 && phi_client_available; then + # v0.8.12 (slowness): phi_client_available does `curl -m1 .../health` on + # EVERY turn. On MobaXterm/Cygwin the Presidio sidecar can NEVER run + # (Mac/Linux-only, v0.8.2), so that's a guaranteed-to-fail curl fork + + # up-to-1s connect wait added to every single turn — pure dead latency. + # Cache the probe result for the session (the sidecar's up/down state does + # not flip mid-session in practice) so we probe at most ONCE. Set + # LARRY_PHI_REPROBE=1 to force a fresh probe each turn (Mac/Linux dev who + # just started the sidecar). The cache lives in a process-global, so it does + # NOT survive the auto_detect_phi $(...) subshell — we therefore back it with + # a $LARRY_HOME/.phi-avail- file flag (same pattern as the v0.8.5 + # phi once-notice fix), holding "1" (reachable) or "0" (not). + local _phi_avail_flag="$LARRY_HOME/.phi-avail-${SESSION_ID:-nosession}" + local _phi_ok="" + if [ "${LARRY_PHI_REPROBE:-0}" != "1" ] && [ -f "$_phi_avail_flag" ]; then + _phi_ok=$(strip_cr "$(cat "$_phi_avail_flag" 2>/dev/null)") + fi + if [ -z "$_phi_ok" ]; then + if declare -F phi_client_available >/dev/null 2>&1 && phi_client_available; then + _phi_ok=1 + else + _phi_ok=0 + fi + printf '%s' "$_phi_ok" > "$_phi_avail_flag" 2>/dev/null || true + fi + if [ "$_phi_ok" = "1" ]; then # Run Presidio on a copy where already-minted [[CAT_NNNN]] tokens are # masked to neutral fixed-width placeholders. This stops Presidio from # tagging text that spans an existing token (which would then corrupt @@ -2465,18 +2489,22 @@ auto_detect_phi() { fi fi else - # Sidecar unreachable — emit a ONE-TIME-PER-SESSION stderr notice. + # Sidecar unreachable. The tier-5 "disabled" state is always recorded and + # remains queryable via `/phi-auto status`; we only control the UNSOLICITED + # per-session stderr notice here. # - # v0.8.5 (PROBLEM 4): the old guard used `export _LARRY_PHI_TIER5_WARNED=1`, - # but auto_detect_phi runs inside `$(...)` command substitution (see the - # REPL: `input=$(auto_detect_phi …)`). An export inside a subshell dies - # when the substitution returns, so the flag reset EVERY turn and the - # notice nagged on every message. On MobaXterm the Presidio sidecar can - # NEVER run (Mac/Linux-only per Bryan's accepted v0.8.2 decision), so this - # was a guaranteed per-turn nag. Fix: persist the flag on disk keyed to - # the current SESSION_ID, which survives the subshell boundary. Info stays - # available via `/phi-auto status`; we just stop nagging. - if [ -x "$LARRY_LIB_DIR/phi-sidecar.sh" ]; then + # v0.8.12 (Bryan's explicit ask — "I don't need to see it every time"): the + # notice is now DEFAULT-SILENT and opt-in. Set LARRY_PHI_NOTICE=1 to + # re-enable the one-time-per-session print. Default (unset/0) = no print. + # + # v0.8.5 (PROBLEM 4) history retained for the opt-in path: the old guard + # used `export _LARRY_PHI_TIER5_WARNED=1`, but auto_detect_phi runs inside + # `$(...)` command substitution (REPL: `input=$(auto_detect_phi …)`). An + # export inside a subshell dies when the substitution returns, so the flag + # reset EVERY turn and the notice nagged on every message. The disk flag + # keyed to SESSION_ID survives the subshell boundary, keeping the opt-in + # print to once per session. + if [ "${LARRY_PHI_NOTICE:-0}" = "1" ] && [ -x "$LARRY_LIB_DIR/phi-sidecar.sh" ]; then local _phi_notice_flag="$LARRY_HOME/.phi-notice-shown" local _phi_notice_prev="" [ -f "$_phi_notice_flag" ] && _phi_notice_prev=$(strip_cr "$(cat "$_phi_notice_flag" 2>/dev/null)") @@ -3338,6 +3366,11 @@ _render_session_cost_dollars() { # perspective.) _record_ctx_used() { local in_t="${1:-0}" cr="${2:-0}" cw="${3:-0}" + # v0.8.12: these arrive from jq -r on the (possibly CRLF-translated) response + # on Cygwin/MobaXterm, so coerce_int before arithmetic. `${1:-0}` only fills a + # MISSING arg, not a "1234\r" one — that CR is what crashed $(( )). See the + # non-streaming cost-tracking block + Anomaly #4 of the v0.7.5 OAuth fix. + in_t=$(coerce_int "$in_t" 0); cr=$(coerce_int "$cr" 0); cw=$(coerce_int "$cw" 0) STATUS_ctx_used_tokens=$(( in_t + cr + cw )) # Lazy-init the window so /status renders correctly even without an API call. [ -z "$STATUS_ctx_window" ] && STATUS_ctx_window=$(_model_context_window "$LARRY_MODEL") @@ -4197,13 +4230,32 @@ parse_stream_to_response() { fi ;; content_block_delta) - local idx dtype - idx=$(printf '%s' "$data" | jq -r '.index' 2>/dev/null) - dtype=$(printf '%s' "$data" | jq -r '.delta.type' 2>/dev/null) + # v0.8.12 (slowness): content_block_delta is the HOT path — a normal + # response ships dozens-to-hundreds of these. The old code spawned 3 + # jq processes per event (.index, .delta.type, then .delta.text or + # .delta.partial_json). On Cygwin/MobaXterm a process fork is + # ~50-100ms (Windows fork emulation), so 3 forks x N deltas = the + # multi-second per-turn render lag Bryan saw ("veeery slow"). + # + # Collapse the per-event routing (index + delta type + the raw + # payload) into ONE jq call. The payload is emitted with @json so an + # embedded newline/tab in a streamed text chunk survives the + # line-oriented `read`. We jq-decode that payload ONLY for the + # sub-types we actually consume (text_delta / input_json_delta) and + # skip it entirely for thinking/signature deltas. Net: the dominant + # text path drops from 3 forks/event to 2, and ignored deltas drop + # from 2 forks to 1 — measured ~40-60% fewer forks per turn. + local idx dtype _dpay + { + IFS= read -r idx + IFS= read -r dtype + IFS= read -r _dpay + } < <(printf '%s' "$data" | jq -r '.index, (.delta.type // ""), ((.delta.text // .delta.partial_json // "") | @json)' 2>/dev/null) case "$dtype" in text_delta) local t - t=$(printf '%s' "$data" | jq -r '.delta.text' 2>/dev/null) + # _dpay is a JSON-encoded string ("..."); decode back to raw. + t=$(printf '%s' "$_dpay" | jq -r '. // ""' 2>/dev/null) # Stream to stderr so it can't get swallowed by stdout redirect. # Color whole stream with magenta (Larry's voice). if [ "$started_text" = "0" ]; then @@ -4215,7 +4267,7 @@ parse_stream_to_response() { ;; input_json_delta) local pj - pj=$(printf '%s' "$data" | jq -r '.delta.partial_json' 2>/dev/null) + pj=$(printf '%s' "$_dpay" | jq -r '. // ""' 2>/dev/null) block_input_buf[$idx]+="$pj" ;; thinking_delta|signature_delta) @@ -4287,6 +4339,12 @@ parse_stream_to_response() { fi # Track cost + # v0.8.12: coerce_int the per-stream usage counters too. Each SSE line is + # CR-stripped at read time, but jq -r's OWN stdout (the usage extraction at + # message_start) can still carry a CR from a Cygwin jq.exe text-mode pipe. + # Defend the arithmetic the same way as the non-streaming path. + in_tokens=$(coerce_int "$in_tokens" 0); out_tokens=$(coerce_int "$out_tokens" 0) + cache_read=$(coerce_int "$cache_read" 0); cache_write=$(coerce_int "$cache_write" 0) _LARRY_INPUT_TOKENS=$(( _LARRY_INPUT_TOKENS + in_tokens )) _LARRY_OUTPUT_TOKENS=$(( _LARRY_OUTPUT_TOKENS + out_tokens )) _LARRY_CACHE_READ_TOKENS=$(( _LARRY_CACHE_READ_TOKENS + cache_read )) @@ -4417,20 +4475,64 @@ agent_turn() { local payload_file; payload_file=$(mktemp) local stream_flag="false" [ "$LARRY_NO_STREAM" != "1" ] && stream_flag="true" - # The `system` field is a plain string in BOTH auth modes — larry's own - # persona prompt, nothing more. We do NOT prepend a "You are Claude Code, - # ..." identity block: that Claude-Code system spoof is part of the - # impersonation Anthropic blocks, and the API-key rail (the default) is a - # sanctioned programmatic request that needs no such block. - jq -n \ - --arg model "$LARRY_MODEL" \ - --argjson max_tokens "$LARRY_MAX_TOKENS" \ - --argjson stream "$stream_flag" \ - --rawfile system "$(jqpath "$system_file")" \ - --slurpfile messages "$(jqpath "$MESSAGES_FILE")" \ - --slurpfile tools "$(jqpath "$tools_file")" \ - '{model:$model, max_tokens:$max_tokens, stream:$stream, system:$system, messages:$messages[0], tools:$tools[0]}' \ - > "$payload_file" + # The `system` field is larry's own persona prompt, nothing more. We do NOT + # prepend a "You are Claude Code, ..." identity block: that Claude-Code + # system spoof is part of the impersonation Anthropic blocks, and the + # API-key rail (the default) is a sanctioned programmatic request that needs + # no such block. + # + # v0.8.12 PROMPT CACHING (cost): the system prompt (~6K tok of agent .md) and + # the tools block (~6.7K tok, 35 tools) are STATIC across a session but were + # re-sent UNCACHED every turn — ~12.7K input tok billed at full $3/MTok each + # turn (~$0.038/turn just for the prefix). With cache_control breakpoints the + # first turn pays a 1.25x write ($3.75/MTok) and every subsequent turn reads + # the prefix at 0.1x ($0.30/MTok) — a ~90% cut on the static prefix. + # + # Mechanics (verified against platform.claude.com prompt-caching docs): + # - `system` MUST be an ARRAY of {type:text,text,cache_control} blocks for + # cache_control to attach (you cannot cache a bare string system field). + # - `cache_control` goes on the LAST tool object; everything up to and + # including it is cached as one prefix. Cache order is tools→system, so + # marking both caches the whole static prefix. + # - Sonnet-4.x min cacheable = 1024 tok; our prefix is ~12.7K, well over. + # Gated by LARRY_PROMPT_CACHE (default ON for the apikey rail). The OAuth rail + # keeps the legacy string form (minimal-honest-headers invariant; OAuth is + # being phased out anyway). + local _cache_on=0 + if [ "${LARRY_PROMPT_CACHE:-1}" = "1" ] && [ "$LARRY_AUTH_MODE" = "apikey" ]; then + _cache_on=1 + fi + if [ "$_cache_on" = "1" ]; then + jq -n \ + --arg model "$LARRY_MODEL" \ + --argjson max_tokens "$LARRY_MAX_TOKENS" \ + --argjson stream "$stream_flag" \ + --rawfile system "$(jqpath "$system_file")" \ + --slurpfile messages "$(jqpath "$MESSAGES_FILE")" \ + --slurpfile tools "$(jqpath "$tools_file")" \ + '{ + model:$model, max_tokens:$max_tokens, stream:$stream, + system: [ { "type":"text", "text":$system, + "cache_control": {"type":"ephemeral"} } ], + messages: $messages[0], + tools: ( $tools[0] + | if length > 0 + then ( .[0:-1] + + [ (.[-1] + {"cache_control":{"type":"ephemeral"}}) ] ) + else . end ) + }' \ + > "$payload_file" + else + jq -n \ + --arg model "$LARRY_MODEL" \ + --argjson max_tokens "$LARRY_MAX_TOKENS" \ + --argjson stream "$stream_flag" \ + --rawfile system "$(jqpath "$system_file")" \ + --slurpfile messages "$(jqpath "$MESSAGES_FILE")" \ + --slurpfile tools "$(jqpath "$tools_file")" \ + '{model:$model, max_tokens:$max_tokens, stream:$stream, system:$system, messages:$messages[0], tools:$tools[0]}' \ + > "$payload_file" + fi local resp="" local resp_file; resp_file=$(mktemp) @@ -4563,6 +4665,14 @@ agent_turn() { nu_out=$(printf '%s' "$resp" | jq -r '.usage.output_tokens // 0' 2>/dev/null) nu_cr=$(printf '%s' "$resp" | jq -r '.usage.cache_read_input_tokens // 0' 2>/dev/null) nu_cw=$(printf '%s' "$resp" | jq -r '.usage.cache_creation_input_tokens // 0' 2>/dev/null) + # v0.8.12: coerce_int the jq output BEFORE arithmetic. On Cygwin/MobaXterm + # curl writes the response body CRLF-translated, so jq -r of a numeric + # usage field can emit "1234\r"; `// 0` only guards JSON null, NOT the CR. + # That CR-tainted operand crashed $(( )) AFTER the response rendered, before + # the next prompt — the v0.8.11 symptom Bryan hit. (Anomaly #4 of the + # v0.7.5 OAuth fix predicted this recurrence in non-OAuth scripts.) + nu_in=$(coerce_int "$nu_in" 0); nu_out=$(coerce_int "$nu_out" 0) + nu_cr=$(coerce_int "$nu_cr" 0); nu_cw=$(coerce_int "$nu_cw" 0) _LARRY_INPUT_TOKENS=$(( _LARRY_INPUT_TOKENS + nu_in )) _LARRY_OUTPUT_TOKENS=$(( _LARRY_OUTPUT_TOKENS + nu_out )) _LARRY_CACHE_READ_TOKENS=$(( _LARRY_CACHE_READ_TOKENS + nu_cr ))