From 4f1ea86051ab805e44c9938341f7aee18ba02182 Mon Sep 17 00:00:00 2001 From: Bryan Johnson Date: Wed, 27 May 2026 20:54:10 -0700 Subject: [PATCH] v0.8.5: rate-limit backoff + actionable message, streaming single-send, ErrorPI CR fix, phi once-notice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diagnose-don't-assume rate-limit cluster (Clover #8). The rate_limit_error on a work-box with 90% of the 5h Max quota free was a short-window BURST rail, not 5h exhaustion — tripped by a stream->non-stream double-send per turn with no backoff. - Rate-limit backoff honoring retry-after (else exp 2/4/8 cap 30) + actionable header-parsed message naming the tripped rail; headers.log now captures every 429 (was OAuth+unified-* only), tagged with retry-after + rail. - parse_stream_to_response detects a non-SSE JSON error body (429/overload) and returns a distinct code so agent_turn surfaces it WITH backoff instead of re-sending the whole prompt (single-send invariant). Auto LARRY_NO_STREAM=1 on MobaXterm/Cygwin/MSYS; explicit LARRY_NO_STREAM=0 still forces streaming on. - ErrorPI fix: strip_cr on err_type/err_msg in _humanize_api_error (a trailing CR broke the case match AND carriage-overprinted "API error"); err/warn/log now strip embedded CRs defensively. (v0.7.5 sweep missed the error-display path.) - phi tier-5 notice once-per-session via $LARRY_HOME/.phi-notice-shown SESSION_ID flag (old export flag died in the $(...) subshell -> per-turn nag). Same-pattern sweep fixed the identical subshell-flag bug in _auto_phi_b64_roundtrip. Deliverable: Deliverables/2026-05-27-cloverleaf-larry-v085-ratelimit-streaming-fixes.md Co-Authored-By: Clover (Claude Opus 4.7) --- .gitignore | 3 + CHANGELOG.md | 46 +++++++ VERSION | 2 +- larry.sh | 346 +++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 369 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index d234dcb..0bca839 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ bin/jq bin/jq.exe *.larry-prerollback.* *.larry-tmp.* +.phi-notice-shown +.b64-py3-notice-shown +.last-stream-headers # Editor / OS noise .DS_Store diff --git a/CHANGELOG.md b/CHANGELOG.md index b7da1c7..a01acc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,52 @@ All notable changes to `cloverleaf-larry` / `larry-anywhere` are recorded here. Versioning is loose-semver; bumps trigger the in-process self-update on every running client via `LARRY_BASE_URL` + `MANIFEST`. +## v0.8.5 — 2026-05-27 + +Diagnose-don't-assume rate-limit cluster fix (Clover #8). Symptom: a `hello` +turn threw `rate_limit_error` on a work-box with 90% of the Claude Max 5h quota +free — so NOT 5h-window exhaustion. Root cause = a short-window BURST rail +tripped by a stream→non-stream **double-send** per turn, with no backoff. + +- **Rate-limit backoff + actionable message (ROOT).** A 429 no longer fails the + turn or fires an immediate re-send. `agent_turn` now retries with backoff that + HONORS the `retry-after` header (else exponential 2/4/8s capped at 30s; + `LARRY_RL_MAX_RETRIES`/`LARRY_RL_BACKOFF_MAX` tunable). The error message is now + ACTIONABLE: `_parse_response_headers` captures `retry-after` + which rail + tripped (`anthropic-ratelimit-{requests,input-tokens,output-tokens}-remaining:0` + or `unified-{5h,7d}` for OAuth) and `_humanize_rate_limit` renders e.g. + `rate limit: requests-per-minute exhausted (short-window burst, NOT your 5h + quota) — resets in 38s; retrying with backoff`. `headers.log` now captures the + full header block on ANY 429 (was: OAuth-mode + unified-* header only), tagged + `*** 429 retry-after=Ns rail=… ***`, so the next rate-limit is always + diagnosable. +- **Streaming parse failure no longer double-sends (burst trigger).** A + streaming 429/overload returns a plain JSON error body (not SSE); + `parse_stream_to_response` previously dropped those non-`data:` lines, produced + zero blocks, returned 1, and `agent_turn` blindly re-SENT the whole prompt + non-streaming — a SECOND full API call within the same second (the per-minute + burst). The parser now buffers the non-SSE body and, if it parses as a JSON + error, returns a distinct code so the caller surfaces it WITH backoff instead + of re-sending (single-send invariant: one logical attempt per turn). Also + auto-defaults `LARRY_NO_STREAM=1` on MobaXterm/Cygwin/MSYS (`_is_cygwin_like`) + where SSE parsing is fragile; an explicit `LARRY_NO_STREAM=0` still forces it on. +- **`ErrorPI` mangled error string fixed (CR-taint).** `— ErrorPI error: + rate_limit_error` was a carriage-return overprint: on MobaXterm the response + field `jq -r '.error.type'` carried a trailing `\r`, which (a) broke the + `case "$err_type" in rate_limit_error)` match → fell to the `%s — %s` default + (the stray ` — `), and (b) CR-returned the cursor so the terminal overprinted + "API error" → "ErrorPI". Fix: `strip_cr` on `err_type`/`err_msg` in + `_humanize_api_error`, and `err()`/`warn()`/`log()` now strip embedded CRs + defensively. (The v0.7.5 CR sweep missed the error-DISPLAY construction path.) +- **phi tier-5 notice fires once per session (was per-turn nag).** The + `tier-5 (presidio NER) disabled — sidecar not running` notice printed every + turn because `auto_detect_phi` runs inside `$(...)` command substitution and + the old `export _LARRY_PHI_TIER5_WARNED=1` flag died in the subshell. Now keyed + to a `$LARRY_HOME/.phi-notice-shown` file holding `SESSION_ID` — fires once per + session, survives the subshell, resets for a genuinely new session. Same-pattern + sweep caught the identical subshell-flag bug in `_auto_phi_b64_roundtrip`'s + python3-missing notice (`_LARRY_B64_PY3_WARNED`) — fixed the same way. + ## v0.8.4 — 2026-05-27 - **Installer/updater now detects HTML-sign-in-page responses and fails loud diff --git a/VERSION b/VERSION index b60d719..7ada0d3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.8.4 +0.8.5 diff --git a/larry.sh b/larry.sh index 22bb8bb..0cc5c88 100755 --- a/larry.sh +++ b/larry.sh @@ -65,7 +65,7 @@ set -o pipefail # ───────────────────────────────────────────────────────────────────────────── # Config # ───────────────────────────────────────────────────────────────────────────── -LARRY_VERSION="0.8.4" +LARRY_VERSION="0.8.5" LARRY_HOME="${LARRY_HOME:-$HOME/.larry}" # ───────────────────────────────────────────────────────────────────────────── @@ -143,9 +143,16 @@ else C_YELLOW=''; C_BLUE=''; C_MAGENTA=''; C_CYAN='' fi -log() { printf '%s[%s]%s %s\n' "$C_DIM" "$(date +%H:%M:%S)" "$C_RESET" "$*" >&2; } -err() { printf '%serror:%s %s\n' "$C_RED" "$C_RESET" "$*" >&2; } -warn() { printf '%swarn:%s %s\n' "$C_YELLOW" "$C_RESET" "$*" >&2; } +# v0.8.5: err()/warn()/log() strip embedded CRs from the assembled message +# before printing. Defense-in-depth for the "ErrorPI" class of bug — a CR that +# survives into a user-facing diagnostic carriage-returns the cursor and the +# terminal overprints the line (e.g. "API error" → "ErrorPI"). These run before +# lib/cygwin-safe.sh is sourced, so we strip inline via parameter expansion +# rather than calling strip_cr (not yet defined here). The primary fixes are at +# each message's construction site; this is the last line of defence. +log() { local _m="$*"; printf '%s[%s]%s %s\n' "$C_DIM" "$(date +%H:%M:%S)" "$C_RESET" "${_m//$'\r'/}" >&2; } +err() { local _m="$*"; printf '%serror:%s %s\n' "$C_RED" "$C_RESET" "${_m//$'\r'/}" >&2; } +warn() { local _m="$*"; printf '%swarn:%s %s\n' "$C_YELLOW" "$C_RESET" "${_m//$'\r'/}" >&2; } larry_say() { printf '%s%slarry>%s %s\n' "$C_MAGENTA" "$C_BOLD" "$C_RESET" "$*"; } # >>> fetch-safe inline (keep in sync with lib/fetch-safe.sh) >>> @@ -1718,10 +1725,18 @@ _auto_phi_b64_roundtrip() { # session for visibility. Strict-mode escape is handled upstream; # default mode falls back to the plain HL7-shape branch which catches # cleartext MSH/PID anyway. - if [ -z "${_LARRY_B64_PY3_WARNED:-}" ]; then + # v0.8.5 (same-pattern sweep, PROBLEM 4 class): _auto_phi_b64_roundtrip is + # called inside `$(...)` (see the tool-result path), so the old in-process + # `_LARRY_B64_PY3_WARNED` flag died in the subshell and this notice nagged + # on every tool-result turn lacking python3 — identical to the tier-5 bug. + # Persist the flag on disk keyed to SESSION_ID so it fires once per session. + local _b64_flag="$LARRY_HOME/.b64-py3-notice-shown" + local _b64_prev="" + [ -f "$_b64_flag" ] && _b64_prev=$(strip_cr "$(cat "$_b64_flag" 2>/dev/null)") + if [ "$_b64_prev" != "$SESSION_ID" ]; then printf '%sphi>%s base64 unwrap pass skipped: python3 not on PATH (install python3 to enable v0.8.1-c)\n' \ "$C_DIM" "$C_RESET" >&2 - _LARRY_B64_PY3_WARNED=1 + printf '%s' "$SESSION_ID" > "$_b64_flag" 2>/dev/null || true fi printf '' return 1 @@ -2077,13 +2092,26 @@ auto_detect_phi() { fi fi else - # Sidecar unreachable — emit a one-time per-session stderr warning. - if [ -z "${_LARRY_PHI_TIER5_WARNED:-}" ]; then - if [ -x "$LARRY_LIB_DIR/phi-sidecar.sh" ]; then - printf '%sphi>%s tier-5 (presidio NER) disabled — sidecar not running. Start with: %s/phi-sidecar.sh ensure\n' \ + # Sidecar unreachable — emit a ONE-TIME-PER-SESSION stderr notice. + # + # v0.8.5 (PROBLEM 4): the old guard used `export _LARRY_PHI_TIER5_WARNED=1`, + # but auto_detect_phi runs inside `$(...)` command substitution (see the + # REPL: `input=$(auto_detect_phi …)`). An export inside a subshell dies + # when the substitution returns, so the flag reset EVERY turn and the + # notice nagged on every message. On MobaXterm the Presidio sidecar can + # NEVER run (Mac/Linux-only per Bryan's accepted v0.8.2 decision), so this + # was a guaranteed per-turn nag. Fix: persist the flag on disk keyed to + # the current SESSION_ID, which survives the subshell boundary. Info stays + # available via `/phi-auto status`; we just stop nagging. + if [ -x "$LARRY_LIB_DIR/phi-sidecar.sh" ]; then + local _phi_notice_flag="$LARRY_HOME/.phi-notice-shown" + local _phi_notice_prev="" + [ -f "$_phi_notice_flag" ] && _phi_notice_prev=$(strip_cr "$(cat "$_phi_notice_flag" 2>/dev/null)") + if [ "$_phi_notice_prev" != "$SESSION_ID" ]; then + printf '%sphi>%s tier-5 (presidio NER) disabled — sidecar not running (expected on MobaXterm/Windows). Mac/Linux: %s/phi-sidecar.sh ensure. /phi-auto status for state.\n' \ "$C_DIM" "$C_RESET" "$LARRY_LIB_DIR" >&2 + printf '%s' "$SESSION_ID" > "$_phi_notice_flag" 2>/dev/null || true fi - export _LARRY_PHI_TIER5_WARNED=1 fi fi fi @@ -2379,6 +2407,13 @@ STATUS_oauth_7d_reset_epoch="" STATUS_oauth_representative="" # five_hour | seven_day | seven_day_opus | seven_day_sonnet STATUS_oauth_status="" # allowed | warning | rate_limited STATUS_api_reset_epoch="" # earliest of the *-reset RFC3339 timestamps, as epoch +# v0.8.5: rate-limit diagnosis state. Populated by _parse_response_headers on +# EVERY response (200 or 429). _humanize_api_error reads these to render an +# ACTIONABLE rate-limit message (which limit tripped + when it resets) instead +# of a bare "rate_limit_error". See PROBLEM 1 in the v0.8.5 deliverable. +STATUS_retry_after_secs="" # raw `retry-after` header value (seconds), if present +STATUS_rl_tripped_rail="" # which bucket is at/over limit: requests|input-tokens|output-tokens|tokens|unified-5h|unified-7d +STATUS_rl_reset_epoch="" # epoch when the tripped rail resets (best-effort) # session_cost is reused from _LARRY_INPUT/OUTPUT/CACHE_*_TOKENS via # _render_session_cost_dollars (no new state needed). # Session turns counter == _LARRY_TURNS (no new state needed). @@ -2514,22 +2549,67 @@ _parse_response_headers() { done [ -n "$earliest" ] && STATUS_api_reset_epoch="$earliest" + # ── v0.8.5: rate-limit diagnosis — retry-after + which rail tripped ────── + # On a 429 the response carries `retry-after` (seconds) and the relevant + # `*-remaining: 0` bucket. We capture both so _humanize_api_error can tell + # Bryan EXACTLY which limit fired and when it resets, instead of echoing a + # bare "rate_limit_error". strip_cr on every captured value — these flow + # into case/printf paths and a CRLF response (MobaXterm) would taint them. + local _ra + _ra=$(_header_value "$f" "retry-after") + _ra=$(strip_cr "$_ra") + [ -n "$_ra" ] && STATUS_retry_after_secs="$_ra" + + # Identify the tripped rail. Prefer the OAuth unified status when present; + # otherwise inspect the API-key *-remaining buckets for the one at 0. + STATUS_rl_tripped_rail="" + STATUS_rl_reset_epoch="" + if [ "$(strip_cr "$STATUS_oauth_status")" = "rate_limited" ]; then + # OAuth: representative-claim tells us which window is the binding one. + case "$(strip_cr "$STATUS_oauth_representative")" in + *7d*|*seven*) STATUS_rl_tripped_rail="unified-7d"; STATUS_rl_reset_epoch="$(strip_cr "$STATUS_oauth_7d_reset_epoch")" ;; + *) STATUS_rl_tripped_rail="unified-5h"; STATUS_rl_reset_epoch="$(strip_cr "$STATUS_oauth_5h_reset_epoch")" ;; + esac + else + # API-key family: find the bucket whose -remaining is 0 (or lowest). + local _rail _rem _reset_rfc _reset_epoch + for _rail in requests input-tokens output-tokens tokens; do + _rem=$(strip_cr "$(_header_value "$f" "anthropic-ratelimit-${_rail}-remaining")") + [ -z "$_rem" ] && continue + _rem=$(coerce_int "$_rem" -1) + if [ "$_rem" = "0" ]; then + STATUS_rl_tripped_rail="$_rail" + _reset_rfc=$(strip_cr "$(_header_value "$f" "anthropic-ratelimit-${_rail}-reset")") + _reset_epoch=$(_rfc3339_to_epoch "$_reset_rfc") + [ -n "$_reset_epoch" ] && STATUS_rl_reset_epoch="$_reset_epoch" + break + fi + done + fi + # ── Safety net: log raw OAuth headers for first 50 calls ───────────────── - # Only relevant in OAuth mode and only if we saw at least one unified-* - # header (no point logging API-key responses). - if [ "$LARRY_AUTH_MODE" = "oauth" ] \ - && [ -n "$STATUS_oauth_status$STATUS_oauth_5h_utilization$STATUS_oauth_7d_utilization" ] \ + # v0.8.5: ALSO log unconditionally whenever a `retry-after` header is present + # (i.e. a 429) so the NEXT rate-limit is always diagnosable from headers.log, + # regardless of auth mode or whether the unified-* family was emitted. The + # original v0.6.9 gate only fired in OAuth mode with a unified-* header — an + # API-key 429, or an OAuth 429 that omitted unified-*, would not be captured. + if { { [ "$LARRY_AUTH_MODE" = "oauth" ] \ + && [ -n "$STATUS_oauth_status$STATUS_oauth_5h_utilization$STATUS_oauth_7d_utilization" ]; } \ + || [ -n "$_ra" ]; } \ && [ "$STATUS_oauth_headers_logged" -lt "$STATUS_OAUTH_HEADER_LOG_LIMIT" ]; then local log_dir="$LARRY_HOME/log" mkdir -p "$log_dir" 2>/dev/null || true if [ -d "$log_dir" ]; then { - printf '── %s call #%d model=%s ──\n' \ + local _tag="" + [ -n "$_ra" ] && _tag=" *** 429 retry-after=${_ra}s rail=${STATUS_rl_tripped_rail:-unknown} ***" + printf '── %s call #%d model=%s%s ──\n' \ "$(date -Iseconds 2>/dev/null || date)" \ "$((STATUS_oauth_headers_logged + 1))" \ - "$LARRY_MODEL" + "$LARRY_MODEL" "$_tag" grep -i '^anthropic-' "$f" 2>/dev/null || true grep -i '^retry-after:' "$f" 2>/dev/null || true + grep -iE '^(http/|HTTP/)' "$f" 2>/dev/null || true printf '\n' } >> "$log_dir/headers.log" 2>/dev/null || true STATUS_oauth_headers_logged=$((STATUS_oauth_headers_logged + 1)) @@ -3357,6 +3437,18 @@ _humanize_api_error() { local err_type err_msg err_type=$(printf '%s' "$body" | jq -r '.error.type // empty' 2>/dev/null) err_msg=$(printf '%s' "$body" | jq -r '.error.message // empty' 2>/dev/null) + # v0.8.5 (PROBLEM 3 — "ErrorPI" fix): on MobaXterm/Cygwin the response body + # can arrive CRLF-translated, so `jq -r` emits a TRAILING \r on these fields. + # That \r (a) breaks the `case "$err_type" in rate_limit_error)` match below + # — the pattern compares against the literal "rate_limit_error\r" and FALLS + # THROUGH to the default `%s — %s` arm — and (b) when the resulting string + # reaches the terminal, the bare CR carriage-returns the cursor and the next + # write overprints "API error", rendering the mangled "ErrorPI". strip_cr on + # BOTH fields fixes the match AND removes the overprint source. (The v0.7.5 + # CR-sweep covered OAuth/prompt/path surfaces but missed this error-DISPLAY + # construction path.) + err_type=$(strip_cr "$err_type") + err_msg=$(strip_cr "$err_msg") case "$err_type" in authentication_error|invalid_request_error) case "$err_msg" in @@ -3370,28 +3462,115 @@ _humanize_api_error() { printf '%s — %s' "$err_type" "$err_msg" ;; rate_limit_error|overloaded_error) - printf 'Rate limited by Anthropic (%s). Wait a few seconds and retry. (%s)' "$err_type" "$err_msg" + _humanize_rate_limit "$err_type" "$err_msg" ;; not_found_error) printf 'API said not found — usually a bad model name. Current LARRY_MODEL=%s. (%s)' "$LARRY_MODEL" "$err_msg" ;; *) - [ -n "$err_type" ] && printf '%s — %s' "$err_type" "$err_msg" || printf '%s' "$body" + [ -n "$err_type" ] && printf '%s — %s' "$err_type" "$err_msg" || printf '%s' "$(strip_cr "$body")" ;; esac } +# _humanize_rate_limit TYPE MSG — render an ACTIONABLE rate-limit message using +# the rate-limit STATUS_* globals captured by _parse_response_headers from the +# 429's anthropic-ratelimit-* + retry-after headers. Tells Bryan WHICH limit +# tripped and WHEN it resets, vs the old bare "rate_limit_error". +# +# This is PROBLEM 1's user-facing half: with 90% of the 5h quota free, the most +# likely culprit is a short-window BURST rail (requests-per-minute / +# input-tokens-per-minute / output-tokens-per-minute), NOT the unified 5h/7d +# quota. Naming the rail makes that visible at a glance. +_humanize_rate_limit() { + local err_type="$1" err_msg="$2" + local rail reset_epoch retry_secs + rail=$(strip_cr "$STATUS_rl_tripped_rail") + reset_epoch=$(strip_cr "$STATUS_rl_reset_epoch") + retry_secs=$(strip_cr "$STATUS_retry_after_secs") + + # Friendly rail name + whether it's a burst rail or the unified quota. + local rail_label="" rail_kind="" + case "$rail" in + requests) rail_label="requests-per-minute" ; rail_kind="burst" ;; + input-tokens) rail_label="input-tokens-per-minute" ; rail_kind="burst" ;; + output-tokens) rail_label="output-tokens-per-minute" ; rail_kind="burst" ;; + tokens) rail_label="tokens-per-minute" ; rail_kind="burst" ;; + unified-5h) rail_label="unified 5-hour quota" ; rail_kind="quota" ;; + unified-7d) rail_label="unified 7-day quota" ; rail_kind="quota" ;; + *) rail_label="" ; rail_kind="" ;; + esac + + # Compute a "resets in Ns" hint. Prefer retry-after (authoritative); else + # derive from the rail's reset epoch. + local resets_in="" + if [ -n "$retry_secs" ]; then + resets_in="${retry_secs}s" + elif [ -n "$reset_epoch" ]; then + local now delta + now=$(coerce_int "$(date +%s 2>/dev/null)" 0) + reset_epoch=$(coerce_int "$reset_epoch" 0) + if [ "$now" -gt 0 ] && [ "$reset_epoch" -gt "$now" ]; then + delta=$(( reset_epoch - now )) + resets_in="${delta}s" + fi + fi + + if [ -n "$rail_label" ]; then + if [ "$rail_kind" = "burst" ]; then + # The headline diagnosis Bryan needs: a per-minute burst rail, not quota. + if [ -n "$resets_in" ]; then + printf 'rate limit: %s exhausted (short-window burst, NOT your 5h quota) — resets in %s; retrying with backoff' "$rail_label" "$resets_in" + else + printf 'rate limit: %s exhausted (short-window burst, NOT your 5h quota) — retrying with backoff' "$rail_label" + fi + else + if [ -n "$resets_in" ]; then + printf 'rate limit: %s reached — resets in %s; retrying with backoff' "$rail_label" "$resets_in" + else + printf 'rate limit: %s reached — retrying with backoff' "$rail_label" + fi + fi + return + fi + + # Headers did not name a rail (overloaded_error, or a 429 whose headers we + # could not parse). Fall back to a clear-but-generic message. + if [ "$err_type" = "overloaded_error" ]; then + printf 'Anthropic is overloaded (overloaded_error) — retrying with backoff.' + elif [ -n "$resets_in" ]; then + printf 'rate limited by Anthropic (%s) — resets in %s; retrying with backoff. (%s)' "$err_type" "$resets_in" "$err_msg" + else + printf 'rate limited by Anthropic (%s) — retrying with backoff. (%s)' "$err_type" "$err_msg" + fi +} + # parse_stream_to_response — read SSE from stdin, write events to stdout as # they arrive (for text deltas) AND assemble the equivalent non-streaming # response JSON to the file named in $1. Returns 0 on clean stream, 1 on # parse failure (caller falls back to non-streaming). # +# v0.8.5 (PROBLEM 2): a SECOND optional arg names an "error-body" file. When the +# streaming request gets an HTTP-error status (429/500/overloaded), the server +# responds with a PLAIN JSON error body — NOT an SSE event stream. Those lines +# don't start with "event: "/"data: ", so the old parser silently dropped them, +# produced zero content blocks, and returned 1. agent_turn then re-SENT the +# whole prompt non-streaming — a SECOND full API call within the same second. +# That stream→non-stream double-send is the BURST amplifier behind the +# rate_limit_error (PROBLEM 1): two back-to-back calls per turn trip a +# per-minute rail even with the 5h quota wide open. We now accumulate the raw +# non-SSE body; if the stream yields no blocks AND the body parses as a JSON +# error, we write it to the error-body file so the caller can surface it +# DIRECTLY (with backoff) instead of blindly re-sending. +# # Side effects: # - prints text deltas to stderr (the visible terminal output) as they arrive # - writes a JSON file with {content:[...], stop_reason, usage} on success +# - on a non-SSE JSON error body, writes that body to $2 (if given) # - updates _LARRY_LAST_ASSISTANT_TEXT parse_stream_to_response() { local out_file="$1" + local err_body_file="${2:-}" # State: ordered content blocks. We use parallel arrays keyed by block index. # block_type[i]: "text" | "tool_use" # block_text[i]: accumulated text (for text blocks) @@ -3400,10 +3579,23 @@ parse_stream_to_response() { local stop_reason="" out_tokens=0 in_tokens=0 cache_read=0 cache_write=0 local started_text=0 local line data event_type + # v0.8.5: accumulate any non-SSE lines (an HTTP-error JSON body arrives here). + local raw_nonsse="" while IFS= read -r line; do # Strip CR (curl on Windows / SSE servers often emit CRLF). line="${line%$'\r'}" + # v0.8.5: capture lines that are NOT SSE framing or blank — a JSON error + # body (429/overloaded) lands here. Bounded to avoid unbounded growth on a + # genuinely malformed long stream. + case "$line" in + 'event: '*|'data: '*|'') : ;; + *) + if [ "${#raw_nonsse}" -lt 65536 ]; then + raw_nonsse+="$line"$'\n' + fi + ;; + esac case "$line" in 'event: '*) event_type="${line#event: }"; continue ;; 'data: '*) @@ -3513,6 +3705,19 @@ parse_stream_to_response() { # If we never got any blocks, treat as failure. if [ "${#block_type[@]}" -eq 0 ]; then + # v0.8.5: was this a non-SSE HTTP-error JSON body (429/overloaded/500) + # rather than a mid-parse glitch? If the accumulated non-SSE text parses as + # JSON with an .error.type, hand it to the caller so it can surface the + # error WITH BACKOFF instead of re-sending the whole prompt (double-send = + # burst). Return 2 = "server already errored, body captured; do NOT re-send". + if [ -n "$err_body_file" ] && [ -n "$raw_nonsse" ]; then + local _et + _et=$(printf '%s' "$raw_nonsse" | jq -r '.error.type // empty' 2>/dev/null) + if [ -n "$_et" ]; then + printf '%s' "$raw_nonsse" > "$err_body_file" + return 2 + fi + fi return 1 fi @@ -3576,9 +3781,58 @@ parse_stream_to_response() { return 0 } +# _is_cygwin_like — true on MobaXterm / Cygwin / MSYS / Git-Bash-for-Windows, +# where the SSE stream carries CRLF line endings and the bundled jq is a +# Windows-native binary — the environment where streaming is most fragile. +# Detection: $OSTYPE (cygwin*/msys*), $MSYSTEM (set by MSYS2/Git-Bash), or the +# MobaXterm-specific $TERM_PROGRAM / a /usr/bin/cygpath presence check. +_is_cygwin_like() { + case "${OSTYPE:-}" in cygwin*|msys*) return 0 ;; esac + [ -n "${MSYSTEM:-}" ] && return 0 + case "$(uname -s 2>/dev/null)" in CYGWIN*|MINGW*|MSYS*) return 0 ;; esac + command -v cygpath >/dev/null 2>&1 && return 0 + return 1 +} + # Try streaming first; if anything goes wrong, fall back to non-streaming. # LARRY_NO_STREAM=1 disables streaming entirely. -LARRY_NO_STREAM="${LARRY_NO_STREAM:-0}" +# +# v0.8.5 (PROBLEM 2): default NO_STREAM=1 on Cygwin-like terminals. Streaming +# SSE parsing is fragile there (CRLF in the event stream, Windows-native jq), +# and even with the v0.8.5 single-send fix a clean non-streaming call is the +# safer default on those hosts. An EXPLICIT `LARRY_NO_STREAM=0` from the user +# still forces streaming on (opt back in); we only change the *default*. +if [ -n "${LARRY_NO_STREAM:-}" ]; then + LARRY_NO_STREAM="$LARRY_NO_STREAM" # user set it explicitly — honor it +elif _is_cygwin_like; then + LARRY_NO_STREAM=1 # safer default on MobaXterm/Cygwin +else + LARRY_NO_STREAM=0 +fi + +# _rate_limit_backoff_secs ATTEMPT — how long to sleep before retry ATTEMPT +# (1-based). Honors the `retry-after` header (authoritative) when present; +# otherwise exponential backoff 2,4,8,… capped at LARRY_RL_BACKOFF_MAX (30s). +# This replaces the old behaviour where a 429 either errored out immediately or +# (via the stream→non-stream double-send) fired a SECOND call with no spacing — +# the per-minute burst that tripped the rate limit even with quota free. +LARRY_RL_MAX_RETRIES="${LARRY_RL_MAX_RETRIES:-3}" +LARRY_RL_BACKOFF_MAX="${LARRY_RL_BACKOFF_MAX:-30}" +_rate_limit_backoff_secs() { + local attempt; attempt=$(coerce_int "${1:-1}" 1) + local ra; ra=$(coerce_int "$STATUS_retry_after_secs" 0) + if [ "$ra" -gt 0 ]; then + # Cap an absurd retry-after so we never hang the REPL for minutes. + [ "$ra" -gt 120 ] && ra=120 + printf '%s' "$ra" + return 0 + fi + # Exponential: 2^attempt, capped. + local secs=$(( 1 << attempt )) # attempt=1→2, 2→4, 3→8 + local cap; cap=$(coerce_int "$LARRY_RL_BACKOFF_MAX" 30) + [ "$secs" -gt "$cap" ] && secs="$cap" + printf '%s' "$secs" +} agent_turn() { local system_prompt="$1" @@ -3591,6 +3845,8 @@ agent_turn() { printf '%s' "$system_prompt" > "$system_file" _LARRY_TURNS=$(( _LARRY_TURNS + 1 )) + # v0.8.5: per-turn rate-limit retry budget (shared across the tool-use loop). + local _rl_attempts=0 while true; do local payload_file; payload_file=$(mktemp) @@ -3611,19 +3867,33 @@ agent_turn() { local used_stream=0 if [ "$stream_flag" = "true" ]; then - # Stream; parse_stream_to_response writes the synthetic response into $resp_file. - if call_api_stream "$payload_file" | parse_stream_to_response "$resp_file"; then + # Stream; parse_stream_to_response writes the synthetic response into + # $resp_file, and ANY non-SSE HTTP-error JSON body into $err_body_file. + # v0.8.5: capture the parser's exit code so we can tell apart: + # rc=0 clean stream → use it + # rc=2 server returned a JSON error body (429/overloaded/500) → surface + # it WITHOUT re-sending the prompt (re-sending = burst) + # rc=1 genuine mid-parse glitch → fall back to ONE non-streaming send + local err_body_file; err_body_file=$(mktemp) + call_api_stream "$payload_file" | parse_stream_to_response "$resp_file" "$err_body_file" + local _ps_rc=${PIPESTATUS[1]} + # Drain rate-limit headers BEFORE we decide what to do (so the 429 + # diagnosis + backoff sees this call's retry-after / rail). + _drain_pending_stream_headers + if [ "$_ps_rc" = "0" ]; then used_stream=1 resp=$(cat "$resp_file") + elif [ "$_ps_rc" = "2" ]; then + # Server already errored — use the captured body directly, do NOT + # re-send. The single-send invariant: one logical attempt per turn. + resp=$(cat "$err_body_file") else warn "streaming parse failed — falling back to non-streaming for this turn" - # Re-build payload without stream:true and call non-streaming. + # Re-build payload without stream:true and call non-streaming ONCE. jq 'del(.stream)' < "$payload_file" > "$payload_file.ns" && mv "$payload_file.ns" "$payload_file" resp=$(call_api "$payload_file") fi - # v0.6.9: drain rate-limit headers from the streaming curl (subshell - # could not update STATUS_* vars directly). - _drain_pending_stream_headers + rm -f "$err_body_file" else resp=$(call_api "$payload_file") fi @@ -3635,8 +3905,30 @@ agent_turn() { return 1 fi - local err_type; err_type=$(printf '%s' "$resp" | jq -r '.error.type // empty' 2>/dev/null) + local err_type; err_type=$(strip_cr "$(printf '%s' "$resp" | jq -r '.error.type // empty' 2>/dev/null)") if [ -n "$err_type" ]; then + # v0.8.5: on a rate_limit/overloaded error, retry with backoff (honoring + # retry-after) instead of failing the turn outright. This is the fix for + # PROBLEM 1: a tripped per-minute burst rail clears in seconds, so a + # single backed-off retry usually succeeds — and crucially does NOT add + # to the burst the way the old immediate stream→non-stream re-send did. + case "$err_type" in + rate_limit_error|overloaded_error) + _rl_attempts=$(( _rl_attempts + 1 )) + local _max; _max=$(coerce_int "$LARRY_RL_MAX_RETRIES" 3) + if [ "$_rl_attempts" -le "$_max" ]; then + local _wait; _wait=$(_rate_limit_backoff_secs "$_rl_attempts") + # Surface the actionable, header-parsed diagnosis (which rail + reset). + warn "$(_humanize_api_error "$resp")" + warn "backing off ${_wait}s before retry $_rl_attempts/$_max…" + sleep "$_wait" 2>/dev/null || true + continue # rebuild payload and re-attempt (the ONLY retry path) + fi + err "API error: $(_humanize_api_error "$resp") (gave up after $_max retries)" + rm -f "$tools_file" "$system_file" + return 1 + ;; + esac err "API error: $(_humanize_api_error "$resp")" rm -f "$tools_file" "$system_file" return 1