#!/usr/bin/env python3
"""
larry-anywhere v0.8.2: Microsoft Presidio sidecar for free-text NER.

Closes V1 from Vera's PHI-leak audit (the dominant real-world failure mode —
patient names / addresses / un-keyworded dates in prose chat). Free-text PHI
flows past the v0.7.3 tier-1/2/3/4 classifier because that classifier is
HL7-segment-aware and keyword-driven, not a general entity recognizer.

This sidecar runs Microsoft Presidio (spaCy backend + custom recognizers) as a
persistent FastAPI service on 127.0.0.1:$LARRY_PHI_PORT (default 41189).
Larry's main loop hits it via curl as the LAST tier of auto-PHI detection.

Wire-up:
  - lib/phi-sidecar.sh — bash launcher / health-check / lifecycle
  - lib/phi-client.sh — bash client (phi_redact_text wrapper)
  - larry.sh:auto_detect_phi — calls phi_redact_text as tier-5
    (post-explicit-marker, post-tier-1-to-4, before sending input to model)
  - install-larry.sh — offers to pip-install presidio + spacy + en_core_web_sm

Benchmarks (Bryan's Mac, Apple Silicon, en_core_web_sm):
  cold start (model load): ~9 seconds
  warm latency (P50/P95): 20ms / 22ms (analyzer only)
  HTTP round-trip warm: ~57ms (curl --unix-socket via TCP fallback)
  First request post-startup pays a ~150ms tokenizer-warmup tax; thereafter
  within the 200ms-per-turn REPL budget Bryan specified.

Failure mode: if Presidio fails to load (model missing, package broken), the
process exits non-zero. The bash launcher detects this and tells the user.
Larry's tier-5 silently no-ops when the sidecar is unreachable, preserving
v0.8.1 behavior on hosts where Presidio isn't installed.

Compatibility: requires Python 3.9+ (3.14 tested). MobaXterm/Cygwin
compatibility is gated by spaCy's C-extension wheels; if pip install
presidio_analyzer fails on Cygwin, this host stays on v0.8.1 + nudges per
Bryan's accepted tradeoff.
"""
from __future__ import annotations

import os
import sys
import time
import logging

logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger("phi-sidecar")

# Third-party imports are guarded so a host without Presidio gets an actionable
# install hint and a distinct exit code (3) instead of a bare traceback.
try:
    from fastapi import FastAPI, Body
    from pydantic import BaseModel
    from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
    from presidio_analyzer.nlp_engine import NlpEngineProvider
    from presidio_anonymizer import AnonymizerEngine
    import uvicorn
except ImportError as e:
    sys.stderr.write(f"phi-sidecar: missing dependency ({e}); install with:\n")
    sys.stderr.write(" pip install presidio_analyzer presidio_anonymizer fastapi uvicorn\n")
    sys.stderr.write(" python -m spacy download en_core_web_sm\n")
    sys.exit(3)

# Runtime configuration, overridable through the environment.
LARRY_PHI_PORT = int(os.environ.get("LARRY_PHI_PORT", "41189"))
LARRY_PHI_HOST = os.environ.get("LARRY_PHI_HOST", "127.0.0.1")
LARRY_PHI_MODEL = os.environ.get("LARRY_PHI_MODEL", "en_core_web_sm")


# Module-scope request model. MUST be module-level, not function-local —
# pydantic v2 + FastAPI introspection treats a closure-defined model as
# query params (the symptom: 'Field required' on a "query" location for
# the body param), which breaks the /redact endpoint silently.
class RedactReq(BaseModel):
    # Free text to scan and redact.
    text: str
    # Minimum recognizer confidence; below this confidence we ignore.
    score_threshold: float = 0.3


def build_analyzer() -> AnalyzerEngine:
    """Build the Presidio analyzer used by the /redact endpoint.

    Creates a spaCy-backed NLP engine for English using the model named by
    LARRY_PHI_MODEL (default ``en_core_web_sm``) and registers three
    HL7-specific pattern recognizers on top of whatever the AnalyzerEngine
    registers by default.

    Returns:
        The configured ``AnalyzerEngine``.
    """
    config = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": LARRY_PHI_MODEL}],
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=config).create_engine()
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])

    # Custom recognizers tuned for HL7/Cloverleaf operator chat. These run
    # IN ADDITION TO Presidio's built-in PII recognizers (PERSON, LOCATION,
    # DATE_TIME, PHONE_NUMBER, US_SSN, EMAIL_ADDRESS, etc.).
    #
    # HL7_MRN: 6-12 digit numeric, looser than NPI's strict 10-digit rule.
    # Catches "check 623000286" prose where the keyword-based tier-2 missed.
    analyzer.registry.add_recognizer(
        PatternRecognizer(
            supported_entity="HL7_MRN",
            patterns=[Pattern("hl7_mrn_6_12", r"\b\d{6,12}\b", 0.30)],
            context=["mrn", "patient", "record", "account", "acct", "visit", "encounter", "csn"],
        )
    )

    # HL7_CARET_NAME: "SMITH^JOHN" / "SMITH^JOHN^Q" pattern outside Tier-3
    # context. The v0.7.3 Tier-3 only fires when PID.3/PID.5/etc. is in the
    # same line; this recognizer catches the caret-name itself.
    analyzer.registry.add_recognizer(
        PatternRecognizer(
            supported_entity="HL7_CARET_NAME",
            patterns=[Pattern("caret_name", r"\b[A-Z][A-Z\-']+\^[A-Z][A-Z\-']+(\^[A-Z][A-Z\-']+)?\b", 0.85)],
        )
    )

    # HL7_PHONE_BARE: plain "5551234567" (no dashes/parens) — Tier 1
    # requires formatting. Limit confidence so plain numbers in code stay safe.
    analyzer.registry.add_recognizer(
        PatternRecognizer(
            supported_entity="HL7_PHONE_BARE",
            patterns=[Pattern("phone_10_bare", r"\b[2-9]\d{2}[2-9]\d{6}\b", 0.20)],
            context=["phone", "tel", "telephone", "contact", "cell", "mobile"],
        )
    )
    return analyzer


def main() -> None:
    """Load Presidio, build the FastAPI app, and serve it with uvicorn.

    Blocks inside ``uvicorn.run`` until the server stops. Exceptions raised
    while loading Presidio are not caught here, so they terminate the process.
    """
    log.warning("loading presidio (this takes ~5-10 seconds the first time)...")
    # perf_counter is monotonic, so the measured durations cannot be skewed
    # (or go negative) if the wall clock is adjusted while we are timing.
    load_started = time.perf_counter()
    analyzer = build_analyzer()
    anonymizer = AnonymizerEngine()
    log.warning(
        "presidio ready in %.0f ms; listening on %s:%s",
        (time.perf_counter() - load_started) * 1000,
        LARRY_PHI_HOST,
        LARRY_PHI_PORT,
    )

    app = FastAPI(title="larry-phi-sidecar", version="0.8.2")

    # POST /redact: analyze the text, anonymize the detected spans, and report
    # the redacted text, the detected entities, and the handler latency.
    @app.post("/redact")
    def redact(req: RedactReq = Body(...)):
        started = time.perf_counter()
        results = analyzer.analyze(
            text=req.text,
            language="en",
            score_threshold=req.score_threshold,
        )
        anon = anonymizer.anonymize(text=req.text, analyzer_results=results)
        return {
            "redacted": anon.text,
            "entities": [
                {"type": r.entity_type, "start": r.start, "end": r.end, "score": r.score}
                for r in results
            ],
            "latency_ms": (time.perf_counter() - started) * 1000,
        }

    # GET /health: static liveness payload echoing the configured model/port.
    @app.get("/health")
    def health():
        return {"status": "ok", "model": LARRY_PHI_MODEL, "port": LARRY_PHI_PORT}

    uvicorn.run(app, host=LARRY_PHI_HOST, port=LARRY_PHI_PORT, log_level="warning")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop the sidecar; exit cleanly.
        sys.exit(0)