lsdefine · shenhao-stu · May 17, 2026
diff --git a/.gitignore b/.gitignore
@@ -101,6 +101,9 @@ memory/L4_raw_sessions/*
 # Code Review Principles
 !memory/code_review_principles.md
 
+# Review Mode SOP
+!memory/review_sop.md
+
 # Visual Studio
 .vs/
 restore_commit.txt

diff --git a/frontends/cost_tracker.py b/frontends/cost_tracker.py
@@ -1,13 +1,14 @@
-"""Per-thread LLM token usage, captured via llmcore monkey-patches.
-
-`install()` wraps `llmcore._record_usage` (covers all three API modes) and
-`llmcore.print` (the `messages` SSE path emits the final `output_tokens`
-only via `[Output] tokens=N`, never through `_record_usage`). Tracking is
-keyed by `threading.current_thread().name`; each TUI session runs the
-agent on a uniquely named thread (`ga-tui-agent-<id>`), so `/cost` is a
-thread lookup.
+"""Per-thread LLM token usage via llmcore monkey-patches.
+
+`install()` wraps `llmcore._record_usage` + `llmcore.print` (the SSE
+`messages` path only emits final `output_tokens` through `[Output] tokens=N`).
+Trackers are keyed by `threading.current_thread().name`; each TUI session
+runs its agent on `ga-tui-agent-<id>`, so `/cost` is a thread lookup.
+
+Subagent processes are out-of-process, so `scan_subagent_logs` parses the
+same `[Cache]` / `[Output]` print lines from `temp/*/stdout.log`.
 """
-import re, threading, time
+import glob, os, re, threading, time
 from dataclasses import dataclass, field
 
 
@@ -18,8 +19,9 @@ class TokenStats:
     output: int = 0
     cache_create: int = 0
     cache_read: int = 0
-    # Latest request's effective prompt size — used for the % context-left line.
+    # Latest single-LLM-call sizes — drive the spinner's `↑ N · ↓ M`.
     last_input: int = 0
+    last_output: int = 0
     started_at: float = field(default_factory=time.time)
 
     def total_input_side(self) -> int:
@@ -36,50 +38,69 @@ def elapsed_seconds(self) -> float:
         return max(0.0, time.time() - self.started_at)
 
 
-# Best-effort model → context window. `startswith` match; None hides the line.
-_CTX_LIMITS: list[tuple[str, int]] = [
-    ("claude-sonnet-4-5", 1_000_000),
-    ("claude-opus-4",       200_000),
-    ("claude-haiku-4",      200_000),
-    ("claude-sonnet-4",     200_000),
-    ("claude-3-5-sonnet",   200_000),
-    ("claude-3-5-haiku",    200_000),
-    ("claude-3-7-sonnet",   200_000),
-    ("claude-3-opus",       200_000),
-    ("claude-3-haiku",      200_000),
-    ("claude-3-sonnet",     200_000),
-    ("gpt-5-pro",           400_000),
-    ("gpt-5",               256_000),
-    ("gpt-4o",              128_000),
-    ("gpt-4-turbo",         128_000),
-    ("gpt-4",                 8_192),
-    ("o1",                  200_000),
-    ("o3",                  200_000),
-    ("o4",                  200_000),
-    ("gemini-2.5",        2_000_000),
-    ("gemini-2",          1_000_000),
-    ("gemini-1.5",        1_000_000),
-    ("glm-5",               256_000),
-    ("glm-4",               128_000),
-    ("qwen",                128_000),
-    ("deepseek",             64_000),
-    ("kimi",                200_000),
-    ("moonshot",            200_000),
-]
-
-
-def context_limit_for(model: str | None) -> int | None:
-    if not model: return None
-    m = model.lower()
-    for prefix, limit in _CTX_LIMITS:
-        if m.startswith(prefix): return limit
-    return None
+# GA's real context budget lives on `BaseSession.context_win` (chars). The
+# trim trigger is `context_win * 3` (see llmcore.trim_messages_history), so
+# `/cost` compares actual-history chars against that cap for consistent units.
+def context_window_chars(backend) -> int:
+    """Return `backend.context_win * 3` — the char cap before
+    `trim_messages_history` kicks in. Read on each call so a `mykey.py`
+    override propagates; 0 on bad/missing backend so the caller can hide the row."""
+    try:
+        return int(getattr(backend, 'context_win', 0)) * 3  # attribute absent → 0
+    except (TypeError, ValueError):  # e.g. `context_win` is None or a non-numeric string
+        return 0
+
+
+def current_input_chars(backend) -> int:
+    """Total chars of `backend.history`, each entry JSON-dumped (same unit as `trim_messages_history`)."""
+    try:
+        import json as _json
+        history = getattr(backend, 'history', None) or []  # absent / None / empty → nothing to count
+        return sum(len(_json.dumps(m, ensure_ascii=False)) for m in history)  # non-ASCII kept unescaped
+    except Exception:  # best-effort: e.g. an entry that is not JSON-serializable → report 0
+        return 0
 
 
 _trackers: dict[str, TokenStats] = {}
 _lock = threading.Lock()
 _OUT_RE = re.compile(r'\[Output\]\s+tokens=(\d+)')
+_CACHE_RE_NEW = re.compile(r'\[Cache\]\s+input=(\d+)\s+creation=(\d+)\s+read=(\d+)')
+_CACHE_RE_OLD = re.compile(r'\[Cache\]\s+input=(\d+)\s+cached=(\d+)')
 _INSTALLED = False
+_SUBAGENT_GLOB = os.path.join("temp", "*", "stdout.log")
+
+
+def scan_subagent_logs(since: float = 0.0, root: str | None = None) -> TokenStats:
+    """Sum the `[Output]` / `[Cache]` lines of `temp/<task>/stdout.log` files (under `root`
+    if given); `since=tui_start_time` skips logs last modified earlier. Bad logs skipped."""
+    out = TokenStats()
+    if since > 0: out.started_at = since  # report elapsed time from `since`, not from now
+    pattern = os.path.join(root, _SUBAGENT_GLOB) if root else _SUBAGENT_GLOB  # no root → relative to cwd
+    for p in glob.glob(pattern):
+        try:
+            if since and os.path.getmtime(p) < since: continue  # per-file filter; a newer file is counted in full
+            with open(p, encoding="utf-8", errors="ignore") as f:
+                for line in f:
+                    if line.startswith("[Output]"):
+                        m = _OUT_RE.match(line)
+                        if m:
+                            out.output += int(m.group(1)); out.requests += 1  # one [Output] line = one request
+                    elif line.startswith("[Cache]"):
+                        # messages → `input=N creation=C read=R` (input excl. cache);
+                        # chat_completions / responses → `input=N cached=R` (input incl. cached).
+                        m = _CACHE_RE_NEW.match(line)
+                        if m:
+                            i, c, r = int(m.group(1)), int(m.group(2)), int(m.group(3))
+                            out.input += i
+                            out.cache_create += c; out.cache_read += r
+                            continue
+                        m = _CACHE_RE_OLD.match(line)
+                        if m:
+                            i, r = int(m.group(1)), int(m.group(2))
+                            out.input += max(0, i - r); out.cache_read += r  # store only the uncached part as input
+        except OSError:  # unreadable or vanished file
+            continue
+    return out
 
 
 def get(thread_name: str) -> TokenStats:
@@ -107,31 +128,32 @@ def install() -> None:
     orig_record, orig_print = llmcore._record_usage, print
 
     def record_patched(usage, api_mode):
+        # Handles INPUT / CACHE; OUTPUT normally arrives via the `[Output]` print hook
+        # (print_patched; counting it here too double-counted) — except non-stream `messages`, below.
         try:
             if usage:
                 t = get(threading.current_thread().name)
                 t.requests += 1
                 if api_mode == 'messages':
-                    # SSE delivers final output via [Output] print; non-stream
-                    # delivers it here. `output_tokens` in stream message_start
-                    # is a 0–1 placeholder, acceptable noise.
                     inp = int(usage.get('input_tokens', 0) or 0)
                     cc = int(usage.get('cache_creation_input_tokens', 0) or 0)
                     cr = int(usage.get('cache_read_input_tokens', 0) or 0)
                     t.input += inp; t.cache_create += cc; t.cache_read += cr
-                    t.output += int(usage.get('output_tokens', 0) or 0)
+                    # Non-stream `messages` skips the [Output] print, so count
+                    # output_tokens here; SSE message_start carries a 1-token
+                    # placeholder to skip.
+                    out = int(usage.get('output_tokens', 0) or 0)
+                    if out > 1: t.output += out; t.last_output = out
                     t.last_input = inp + cc + cr
                 elif api_mode == 'chat_completions':
                     cached = int((usage.get('prompt_tokens_details') or {}).get('cached_tokens', 0) or 0)
                     inp = int(usage.get('prompt_tokens', 0) or 0) - cached
                     t.input += inp; t.cache_read += cached
-                    t.output += int(usage.get('completion_tokens', 0) or 0)
                     t.last_input = inp + cached
                 elif api_mode == 'responses':
                     cached = int((usage.get('input_tokens_details') or {}).get('cached_tokens', 0) or 0)
                     inp = int(usage.get('input_tokens', 0) or 0) - cached
                     t.input += inp; t.cache_read += cached
-                    t.output += int(usage.get('output_tokens', 0) or 0)
                     t.last_input = inp + cached
         except Exception: pass
         return orig_record(usage, api_mode)
@@ -141,7 +163,10 @@ def print_patched(*args, **kwargs):
         try:
             if args and isinstance(args[0], str):
                 m = _OUT_RE.match(args[0])
-                if m: get(threading.current_thread().name).output += int(m.group(1))
+                if m:
+                    t = get(threading.current_thread().name)
+                    n = int(m.group(1))
+                    t.output += n; t.last_output = n
         except Exception: pass
         return orig_print(*args, **kwargs)
     llmcore.print = print_patched

diff --git a/frontends/plan_state.py b/frontends/plan_state.py
@@ -0,0 +1,180 @@
+"""Plan / todo state — pure stdlib, no UI framework dependency.
+
+API:
+  extract(text)                   → [(content, "open"|"done"), …]
+  is_active(agent, messages=None) → plan mode on (stash OR per-session msg ref)
+  resolve_path(agent, messages=None) → live plan.md path (or None)
+  find_path_in_messages(messages) → most recent plan.md path mentioned
+  current_step(messages)          → latest `当前步骤：…` snippet (or "")
+  summary(items)                  → (n_done, n_total)
+  is_complete(items)              → all done (or empty)
+
+Supported task-line shapes (all matched by `extract`):
+  - [ ] foo              ← bullet + open
+  - [x] foo              ← bullet + done
+  1. [✓] foo             ← numbered + done
+  2. [✓ 2026-05-16] foo  ← numbered + timestamped done, content after bracket
+  3. [✓ 已生成: foo]      ← numbered + done with description *inside* bracket
+  4. [D][P] foo          ← two marker groups (delegate + parallel), still open
+  5. [D] foo             ← non-standard marker "D" → open (not done)
+"""
+from __future__ import annotations
+import os, re
+from typing import Optional
+
+_DONE_CHARS = set("xX✓✔√☑")  # glyphs that mark a bracket group as done
+# Newline-insert before a bullet stuck to JSON debris (`{"content": "- [ ] …`).
+_GLUE_RE = re.compile(r"(?<!\n)((?:[-*+]|\d+\s*[.)、:）]) \[)")
+_BULLET_RE = re.compile(r"^\s*(?:[-*+]|\d+\s*[.)、:）])\s+")  # list marker: -, *, + or N followed by . ) 、 : ）
+_BRACKET_RE = re.compile(r"\[([^\]]*)\]")  # one `[...]` group; group 1 is its content
+# Strip a leading done glyph (+ optional `YYYY-MM-DD H:MM[:SS]` stamp) when bracket content is used as title.
+_INLINE_STRIP_RE = re.compile(
+    r"^[" + re.escape("".join(_DONE_CHARS)) + r"]\s*(?:\d{4}-\d{2}-\d{2}\s+\d{1,2}:\d{2}(?::\d{2})?\s*)?"
+)
+_DEBRIS_RE = re.compile(r'["\\<].*$')  # from the first double quote, backslash or `<` to end of line
+# Strip markdown emphasis since planbar renders rich.Text, not Markdown.
+_MD_EMPHASIS_RE = re.compile(
+    r"\*\*([^*\n]+)\*\*|\*([^*\n]+)\*|__([^_\n]+)__|_([^_\n]+)_|`([^`\n]+)`"
+)
+def _strip_md(s: str) -> str:  # drop **bold**, *em*, __bold__, _em_ and `code` wrappers, keeping the inner text
+    return _MD_EMPHASIS_RE.sub(lambda m: next(g for g in m.groups() if g is not None), s)
+
+
+def _has_done_glyph(marker: str) -> bool:  # `marker` is the text inside one `[...]` group
+    return any(c in _DONE_CHARS for c in marker)  # NOTE(review): matches anywhere, so "[fix]" counts as done — confirm intended
+
+
+def extract(text: str) -> list[tuple[str, str]]:  # → [(title, "open" | "done"), …] in first-seen order
+    if not text: return []
+    norm = text.replace("\\n", "\n") if "\\n" in text else text  # literal backslash-n (JSON-escaped) → newline
+    norm = _GLUE_RE.sub(r"\n\1", norm)  # start a new line before each glued bullet
+    found: dict[str, str] = {}  # title → status
+    for line in norm.splitlines():
+        head = _BULLET_RE.match(line)
+        if not head: continue  # not a bullet / numbered line
+        rest = line[head.end():]
+        groups: list[str] = []
+        # Consume any number of consecutive `[...]` groups — covers `[D][P]`
+        # task-type chains as well as the plain `[ ]` / `[x]` single form.
+        while True:
+            b = _BRACKET_RE.match(rest)
+            if not b: break
+            groups.append(b.group(1))
+            rest = rest[b.end():]
+        if not groups: continue  # bullet without a leading bracket group is not a task
+        is_done = any(_has_done_glyph(g) for g in groups)
+        inline = rest.strip()
+        if inline:
+            content = inline
+        elif is_done:
+            # `[✓ description]` shape — description lives inside the bracket
+            # next to the glyph. Strip the glyph + optional timestamp.
+            done_g = next(g for g in groups if _has_done_glyph(g))
+            content = _INLINE_STRIP_RE.sub("", done_g).strip()
+        else:
+            continue  # open item with no text after the brackets — no title to show
+        k = _strip_md(_DEBRIS_RE.sub("", content).strip())  # cut trailing debris, then markdown emphasis
+        if not k: continue
+        status = "done" if is_done else "open"
+        # Same content seen twice — done wins over open.
+        if k not in found or status == "done":
+            found[k] = status
+    return list(found.items())
+
+
+def _stashed_plan_path(agent) -> str:
+    # First non-empty `working['in_plan_mode']` from (handler, agent).
+    for src in (getattr(agent, "handler", None), agent):
+        p = ((getattr(src, "working", None) or {}).get("in_plan_mode") or "").strip()  # presumably a str path — TODO confirm
+        if p: return p
+    return ""  # neither holder has a plan path stashed
+
+
+def _resolve_stashed(p: str) -> Optional[str]:  # first existing, non-empty candidate file for `p`, else None
+    if not p: return None
+    rel = p.lstrip("./\\")  # strips every leading '.', '/' and backslash char (so "../x" becomes "x")
+    cwd = os.getcwd()
+    for c in (p, os.path.join(cwd, "temp", rel), os.path.join(cwd, rel)):  # as given, then <cwd>/temp/, then <cwd>/
+        if os.path.isfile(c) and os.path.getsize(c) > 0: return c  # empty files are ignored
+    return None
+
+
+# Strict per-session discovery — scan this session's own messages only.
+_PATH_RE = re.compile(r"""((?:\.\/)?(?:temp\/)?plan_[A-Za-z0-9_\-]+\/plan\.md)""")  # [./][temp/]plan_<id>/plan.md
+
+
+def _slice(messages, start_idx: int):  # new list holding `messages` from `start_idx` on
+    if not messages: return []
+    if start_idx <= 0: return list(messages)  # zero / negative index → everything
+    return list(messages)[start_idx:]
+
+
+def find_path_in_messages(messages, start_idx: int = 0) -> Optional[str]:
+    """Latest existing `plan_XXX/plan.md` referenced at/after index `start_idx`.
+    Items can be `ChatMessage`-like (`.content`) or plain strings;
+    only paths that resolve to a non-empty file on disk are returned."""
+    sliced = _slice(messages, start_idx)
+    if not sliced: return None
+    for m in reversed(sliced):  # newest message first
+        text = getattr(m, "content", None)
+        if text is None: text = m if isinstance(m, str) else ""  # no `.content`: use a plain str, else skip
+        if not text or "plan.md" not in text: continue  # cheap substring gate before the regex
+        for hit in reversed(_PATH_RE.findall(text)):  # last mention in the message wins
+            p = _resolve_stashed(hit.strip().strip("\"'"))
+            if p: return p
+    return None
+
+
+# Prefer concise `<summary>` narrative over the long plan-item echo;
+# treat `❌ 当前步骤:` as "step done", not "current step".
+_SUMMARY_STEP_RE = re.compile(
+    r"<summary>[^<]*?当前步骤[:：]\s*([^<\n]{1,160})</summary>", re.DOTALL)  # pattern has no ".", so DOTALL is inert
+_STEP_RE = re.compile(r"📌\s*当前步骤[:：]\s*([^\n。！!？?]{1,160})")  # needs the 📌 prefix; group 1 ends at newline / sentence punctuation
+_DONE_STEP_RE = re.compile(r"❌\s*当前步骤[:：]")  # the ❌-prefixed (finished) form
+
+
+def current_step(messages, start_idx: int = 0, max_len: int = 60) -> str:
+    """Latest `当前步骤：…` snippet; `<summary>` form preferred, `❌`-prefixed
+    skipped. Trimmed to `max_len` chars so it fits the 5-row plan card."""
+    sliced = _slice(messages, start_idx)
+    if not sliced: return ""
+
+    def _clean(s: str) -> str:
+        return _strip_md(re.sub(r"\s+", " ", s).strip().rstrip(" ：:—-"))
+
+    def _cap(s: str) -> str:
+        s = _clean(s)
+        if len(s) <= max_len: return s
+        return s[:max_len - 1].rstrip() + "…"
+
+    for m in reversed(sliced):
+        text = getattr(m, "content", None)
+        if text is None: text = m if isinstance(m, str) else ""
+        if not text or "当前步骤" not in text: continue
+        hits = _SUMMARY_STEP_RE.findall(text)
+        if hits: return _cap(hits[-1])
+        for raw in reversed(_STEP_RE.findall(text)):
+            if _DONE_STEP_RE.search(raw): continue
+            return _cap(raw)
+    return ""
+
+
+def is_active(agent, messages=None, start_idx: int = 0) -> bool:
+    """Plan mode is on. Primary: `working['in_plan_mode']`. Fallback:
+    a `plan_*/plan.md` referenced in this session's messages (no global scan)."""
+    if _stashed_plan_path(agent): return True  # stash value is trusted as-is (no on-disk check)
+    return find_path_in_messages(messages, start_idx) is not None  # fallback requires the file to exist
+
+
+def resolve_path(agent, messages=None, start_idx: int = 0) -> Optional[str]:  # live plan.md path, or None
+    p = _resolve_stashed(_stashed_plan_path(agent))  # stashed path wins when it resolves to a non-empty file
+    if p: return p
+    return find_path_in_messages(messages, start_idx)  # else the latest path mentioned in the messages
+
+
+def summary(items: list[tuple[str, str]]) -> tuple[int, int]:  # → (n_done, n_total)
+    return sum(1 for _, st in items if st == "done"), len(items)
+
+
+def is_complete(items: list[tuple[str, str]]) -> bool:  # True when every item is "done"
+    return not items or all(st == "done" for _, st in items)  # an empty plan counts as complete
diff --git a/frontends/tuiapp.py b/frontends/tuiapp.py
@@ -86,8 +86,11 @@ def stash(match: re.Match[str]) -> str:
         placeholders.append(match.group(0))
         return f"\x00PH{len(placeholders) - 1}\x00"
 
-    safe = re.sub(r"`{4,}.*?`{4,}", stash, text, flags=re.DOTALL)
-    safe = re.sub(r"`{4,}[^`].*$", stash, safe, flags=re.DOTALL)
+    # Line-anchored fence matcher — see tuiapp_v2.fold_turns for rationale.
+    # Unanchored variant mis-paired backticks embedded in file_read output
+    # with later real fences, swallowing turn markers and ballooning the
+    # final "text" segment to MBs (1.85s markdown render on /continue).
+    safe = re.sub(r"^`{4,}.*?^`{4,}\n?", stash, text, flags=re.DOTALL | re.MULTILINE)
     parts = re.split(r"(\**LLM Running \(Turn \d+\) \.\.\.\**)", safe)
 
     def restore(part: str) -> str: