diff --git a/.gitignore b/.gitignore
index 1bbf72e4..6aab88e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -101,6 +101,9 @@ memory/L4_raw_sessions/*
 # Code Review Principles
 !memory/code_review_principles.md
 
+# Review Mode SOP
+!memory/review_sop.md
+
 # Visual Studio
 .vs/
 restore_commit.txt
diff --git a/frontends/cost_tracker.py b/frontends/cost_tracker.py
index 86d900b0..75c7396a 100644
--- a/frontends/cost_tracker.py
+++ b/frontends/cost_tracker.py
@@ -1,13 +1,14 @@
-"""Per-thread LLM token usage, captured via llmcore monkey-patches.
-
-`install()` wraps `llmcore._record_usage` (covers all three API modes) and
-`llmcore.print` (the `messages` SSE path emits the final `output_tokens`
-only via `[Output] tokens=N`, never through `_record_usage`). Tracking is
-keyed by `threading.current_thread().name`; each TUI session runs the
-agent on a uniquely named thread (`ga-tui-agent-<id>`), so `/cost` is a
-thread lookup.
+"""Per-thread LLM token usage via llmcore monkey-patches.
+
+`install()` wraps `llmcore._record_usage` + `llmcore.print` (the SSE
+`messages` path only emits final `output_tokens` through `[Output] tokens=N`).
+Trackers are keyed by `threading.current_thread().name`; each TUI session
+runs its agent on `ga-tui-agent-<id>`, so `/cost` is a thread lookup.
+
+Subagent processes are out-of-process, so `scan_subagent_logs` parses the
+same `[Cache]` / `[Output]` print lines from `temp/*/stdout.log`.
 """
-import re, threading, time
+import glob, os, re, threading, time
 from dataclasses import dataclass, field
 
 
@@ -18,8 +19,9 @@ class TokenStats:
     output: int = 0
     cache_create: int = 0
     cache_read: int = 0
-    # Latest request's effective prompt size — used for the % context-left line.
+    # Latest single-LLM-call sizes — drive the spinner's `↑ N · ↓ M`.
     last_input: int = 0
+    last_output: int = 0
     started_at: float = field(default_factory=time.time)
 
     def total_input_side(self) -> int:
@@ -36,50 +38,69 @@ def elapsed_seconds(self) -> float:
         return max(0.0, time.time() - self.started_at)
 
 
-# Best-effort model → context window. `startswith` match; None hides the line.
-_CTX_LIMITS: list[tuple[str, int]] = [
-    ("claude-sonnet-4-5", 1_000_000),
-    ("claude-opus-4",       200_000),
-    ("claude-haiku-4",      200_000),
-    ("claude-sonnet-4",     200_000),
-    ("claude-3-5-sonnet",   200_000),
-    ("claude-3-5-haiku",    200_000),
-    ("claude-3-7-sonnet",   200_000),
-    ("claude-3-opus",       200_000),
-    ("claude-3-haiku",      200_000),
-    ("claude-3-sonnet",     200_000),
-    ("gpt-5-pro",           400_000),
-    ("gpt-5",               256_000),
-    ("gpt-4o",              128_000),
-    ("gpt-4-turbo",         128_000),
-    ("gpt-4",                 8_192),
-    ("o1",                  200_000),
-    ("o3",                  200_000),
-    ("o4",                  200_000),
-    ("gemini-2.5",        2_000_000),
-    ("gemini-2",          1_000_000),
-    ("gemini-1.5",        1_000_000),
-    ("glm-5",               256_000),
-    ("glm-4",               128_000),
-    ("qwen",                128_000),
-    ("deepseek",             64_000),
-    ("kimi",                200_000),
-    ("moonshot",            200_000),
-]
-
-
-def context_limit_for(model: str | None) -> int | None:
-    if not model: return None
-    m = model.lower()
-    for prefix, limit in _CTX_LIMITS:
-        if m.startswith(prefix): return limit
-    return None
+# GA's real context budget lives on `BaseSession.context_win` (chars). The
+# trim trigger is `context_win * 3` (see llmcore.trim_messages_history), so
+# `/cost` compares actual-history chars against that cap for consistent units.
+def context_window_chars(backend) -> int:
+    """Return `backend.context_win * 3` as an int; 0 when the attribute is
+    missing or `int()` rejects its value."""
+    try:
+        window = int(getattr(backend, 'context_win', 0))
+    except (TypeError, ValueError):
+        return 0
+    return window * 3
+
+
+def current_input_chars(backend) -> int:
+    """Sum of JSON-encoded lengths of `backend.history` entries; 0 on error."""
+    import json as _json
+    try:
+        entries = getattr(backend, 'history', None) or ()
+        return sum(map(len, (_json.dumps(e, ensure_ascii=False) for e in entries)))
+    except Exception:
+        return 0
 
 
 _trackers: dict[str, TokenStats] = {}
 _lock = threading.Lock()
 _OUT_RE = re.compile(r'\[Output\]\s+tokens=(\d+)')
+_CACHE_RE_NEW = re.compile(r'\[Cache\]\s+input=(\d+)\s+creation=(\d+)\s+read=(\d+)')
+_CACHE_RE_OLD = re.compile(r'\[Cache\]\s+input=(\d+)\s+cached=(\d+)')
 _INSTALLED = False
+_SUBAGENT_GLOB = os.path.join("temp", "*", "stdout.log")
+
+
+def scan_subagent_logs(since: float = 0.0, root: str | None = None) -> TokenStats:
+    """Fold `[Output]` / `[Cache]` lines of each `temp/*/stdout.log` (below `root` if given)
+    into one `TokenStats`; truthy `since` skips older-mtime files, `OSError` files are skipped."""
+    totals = TokenStats()
+    if since > 0:
+        totals.started_at = since
+    log_glob = _SUBAGENT_GLOB if not root else os.path.join(root, _SUBAGENT_GLOB)
+    for log_path in glob.glob(log_glob):
+        try:
+            if since and os.path.getmtime(log_path) < since:
+                continue
+            with open(log_path, encoding="utf-8", errors="ignore") as fh:
+                for row in fh:
+                    if row.startswith("[Output]"):
+                        if hit := _OUT_RE.match(row):
+                            totals.output += int(hit.group(1))
+                            totals.requests += 1
+                    elif row.startswith("[Cache]"):
+                        # `creation=`/`read=` form is added as-is; `cached=` form subtracts cached from input.
+                        if new := _CACHE_RE_NEW.match(row):
+                            fresh, created, read = map(int, new.groups())
+                            totals.input += fresh
+                            totals.cache_create += created
+                            totals.cache_read += read
+                        elif old := _CACHE_RE_OLD.match(row):
+                            gross, read = map(int, old.groups())
+                            totals.input += max(0, gross - read)
+                            totals.cache_read += read
+        except OSError:
+            continue
+    return totals
 
 
 def get(thread_name: str) -> TokenStats:
@@ -107,31 +128,32 @@ def install() -> None:
     orig_record, orig_print = llmcore._record_usage, print
 
     def record_patched(usage, api_mode):
+        # INPUT / CACHE for every mode; OUTPUT comes via `[Output]` print_patched below (the SSE
+        # path emits it that way; double-counting was the prior bug) — except `messages`, see below.
         try:
             if usage:
                 t = get(threading.current_thread().name)
                 t.requests += 1
                 if api_mode == 'messages':
-                    # SSE delivers final output via [Output] print; non-stream
-                    # delivers it here. `output_tokens` in stream message_start
-                    # is a 0–1 placeholder, acceptable noise.
                     inp = int(usage.get('input_tokens', 0) or 0)
                     cc = int(usage.get('cache_creation_input_tokens', 0) or 0)
                     cr = int(usage.get('cache_read_input_tokens', 0) or 0)
                     t.input += inp; t.cache_create += cc; t.cache_read += cr
-                    t.output += int(usage.get('output_tokens', 0) or 0)
+                    # Non-stream `messages` skips the [Output] print, so count
+                    # output_tokens here; SSE message_start carries a 1-token
+                    # placeholder to skip.
+                    out = int(usage.get('output_tokens', 0) or 0)
+                    if out > 1: t.output += out; t.last_output = out
                     t.last_input = inp + cc + cr
                 elif api_mode == 'chat_completions':
                     cached = int((usage.get('prompt_tokens_details') or {}).get('cached_tokens', 0) or 0)
                     inp = int(usage.get('prompt_tokens', 0) or 0) - cached
                     t.input += inp; t.cache_read += cached
-                    t.output += int(usage.get('completion_tokens', 0) or 0)
                     t.last_input = inp + cached
                 elif api_mode == 'responses':
                     cached = int((usage.get('input_tokens_details') or {}).get('cached_tokens', 0) or 0)
                     inp = int(usage.get('input_tokens', 0) or 0) - cached
                     t.input += inp; t.cache_read += cached
-                    t.output += int(usage.get('output_tokens', 0) or 0)
                     t.last_input = inp + cached
         except Exception: pass
         return orig_record(usage, api_mode)
@@ -141,7 +163,10 @@ def print_patched(*args, **kwargs):
         try:
             if args and isinstance(args[0], str):
                 m = _OUT_RE.match(args[0])
-                if m: get(threading.current_thread().name).output += int(m.group(1))
+                if m:
+                    t = get(threading.current_thread().name)
+                    n = int(m.group(1))
+                    t.output += n; t.last_output = n
         except Exception: pass
         return orig_print(*args, **kwargs)
     llmcore.print = print_patched
diff --git a/frontends/plan_state.py b/frontends/plan_state.py
new file mode 100644
index 00000000..c93fb42e
--- /dev/null
+++ b/frontends/plan_state.py
@@ -0,0 +1,180 @@
+"""Plan / todo state — pure stdlib, no UI framework dependency.
+
+API:
+  extract(text)                   → [(content, "open"|"done"), …]
+  is_active(agent, messages=None) → plan mode on (stash OR per-session msg ref)
+  resolve_path(agent, messages=None) → live plan.md path (or None)
+  find_path_in_messages(messages) → most recent plan.md path mentioned
+  current_step(messages)          → latest `当前步骤：…` snippet (or "")
+  summary(items)                  → (n_done, n_total)
+  is_complete(items)              → all done (or empty)
+
+Supported task-line shapes (all matched by `extract`):
+  - [ ] foo              ← bullet + open
+  - [x] foo              ← bullet + done
+  1. [✓] foo             ← numbered + done
+  2. [✓ 2026-05-16] foo  ← numbered + timestamped done, content after bracket
+  3. [✓ 已生成: foo]      ← numbered + done with description *inside* bracket
+  4. [D][P] foo          ← two marker groups (delegate + parallel), still open
+  5. [D] foo             ← non-standard marker "D" → open (not done)
+"""
+from __future__ import annotations
+import os, re
+from typing import Optional
+
+_DONE_CHARS = set("xX✓✔√☑")
+# Newline-insert before a bullet stuck to JSON debris (`{"content": "- [ ] …`).
+_GLUE_RE = re.compile(r"(?<!\n)((?:[-*+]|\d+\s*[.)、:）]) \[)")
+_BULLET_RE = re.compile(r"^\s*(?:[-*+]|\d+\s*[.)、:）])\s+")
+_BRACKET_RE = re.compile(r"\[([^\]]*)\]")
+# Strip `✓ ` / `x ` / timestamp prefix when bracket content is used as title.
+_INLINE_STRIP_RE = re.compile(
+    r"^[" + re.escape("".join(_DONE_CHARS)) + r"]\s*(?:\d{4}-\d{2}-\d{2}\s+\d{1,2}:\d{2}(?::\d{2})?\s*)?"
+)
+_DEBRIS_RE = re.compile(r'["\\<].*$')
+# Markdown emphasis to unwrap (planbar renders rich.Text); `_` only at word edges so snake_case survives.
+_MD_EMPHASIS_RE = re.compile(r"\*\*([^*\n]+)\*\*|\*([^*\n]+)\*|(?<!\w)__([^_\n]+)__(?!\w)"
+                             r"|(?<!\w)_([^_\n]+)_(?!\w)|`([^`\n]+)`")
+def _strip_md(s: str) -> str:
+    """Replace each emphasis/code span in `s` with its inner text."""
+    return _MD_EMPHASIS_RE.sub(lambda m: next(g for g in m.groups() if g is not None), s)
+
+
+def _has_done_glyph(marker: str) -> bool:  # glyph must lead the marker: `[fix]`/`[next]` are tags, not done
+    return marker.strip()[:1] in _DONE_CHARS
+
+
+def extract(text: str) -> list[tuple[str, str]]:
+    """Parse checklist lines from `text` into `(title, "open"|"done")` pairs.
+
+    Titles are de-duplicated in first-seen order, and a "done" sighting of a
+    title overrides an earlier "open" one."""
+    if not text:
+        return []
+    # Literal `\n` escapes become real newlines; glued bullets then get their own line.
+    unglued = _GLUE_RE.sub(r"\n\1", text.replace("\\n", "\n"))
+    statuses: dict[str, str] = {}
+    for raw_line in unglued.splitlines():
+        bullet = _BULLET_RE.match(raw_line)
+        if bullet is None:
+            continue
+        tail = raw_line[bullet.end():]
+        markers: list[str] = []
+        # Peel every leading `[...]` group off the remainder (`[D][P]`, `[ ]`, `[x]`).
+        while (box := _BRACKET_RE.match(tail)) is not None:
+            markers.append(box.group(1))
+            tail = tail[box.end():]
+        if not markers:
+            continue
+        done_markers = [g for g in markers if _has_done_glyph(g)]
+        title = tail.strip()
+        if not title:
+            if not done_markers:
+                continue
+            # Nothing after the brackets: use the text inside the first done marker.
+            title = _INLINE_STRIP_RE.sub("", done_markers[0]).strip()
+        key = _strip_md(_DEBRIS_RE.sub("", title).strip())
+        if not key:
+            continue
+        state = "done" if done_markers else "open"
+        if state == "done" or key not in statuses:
+            statuses[key] = state
+    return list(statuses.items())
+
+
+def _stashed_plan_path(agent) -> str:
+    """Stripped `working['in_plan_mode']` of `agent.handler`, else of `agent`; "" if neither is set."""
+    owners = (getattr(agent, "handler", None), agent)
+    stashes = (getattr(o, "working", None) or {} for o in owners)
+    paths = ((w.get("in_plan_mode") or "").strip() for w in stashes)
+    return next((p for p in paths if p), "")
+
+
+def _resolve_stashed(p: str) -> Optional[str]:
+    """First non-empty regular file among `p`, `<cwd>/temp/<rel>`, `<cwd>/<rel>`; else None."""
+    if not p:
+        return None
+    here, rel = os.getcwd(), p.lstrip("./\\")
+    tries = (p, os.path.join(here, "temp", rel), os.path.join(here, rel))
+    return next((t for t in tries if os.path.isfile(t) and os.path.getsize(t) > 0), None)
+
+
+# Strict per-session discovery — scan this session's own messages only.
+_PATH_RE = re.compile(r"""((?:\.\/)?(?:temp\/)?plan_[A-Za-z0-9_\-]+\/plan\.md)""")
+
+
+def _slice(messages, start_idx: int):
+    """Copy of `messages` from `start_idx` onward as a list (whole list when `start_idx <= 0`)."""
+    items = list(messages) if messages else []
+    return items[start_idx:] if messages and start_idx > 0 else items
+
+
+def find_path_in_messages(messages, start_idx: int = 0) -> Optional[str]:
+    """Newest on-disk `plan_*/plan.md` path mentioned at or after `start_idx`; items
+    are read via `.content`, or used as-is when they are plain strings."""
+    for msg in reversed(_slice(messages, start_idx)):
+        body = getattr(msg, "content", None)
+        if body is None:
+            body = msg if isinstance(msg, str) else ""
+        if not body or "plan.md" not in body:
+            continue
+        candidates = (c.strip().strip("\"'") for c in reversed(_PATH_RE.findall(body)))
+        resolved = next(filter(None, map(_resolve_stashed, candidates)), None)
+        if resolved:
+            return resolved
+    return None
+
+
+# Prefer concise `<summary>` narrative over the long plan-item echo;
+# treat `❌ 当前步骤:` as "step done", not "current step".
+_SUMMARY_STEP_RE = re.compile(
+    r"<summary>[^<]*?当前步骤[:：]\s*([^<\n]{1,160})</summary>", re.DOTALL)
+_STEP_RE = re.compile(r"📌\s*当前步骤[:：]\s*([^\n。！!？?]{1,160})")
+_DONE_STEP_RE = re.compile(r"❌\s*当前步骤[:：]")
+
+
+def current_step(messages, start_idx: int = 0, max_len: int = 60) -> str:
+    """Most recent `当前步骤` snippet from `messages[start_idx:]`, newest first.
+    Within a message a `<summary>` match wins over `📌` matches; the result
+    is whitespace-collapsed and cut to `max_len` chars with a trailing `…`."""
+
+    def _shorten(snippet: str) -> str:
+        flat = re.sub(r"\s+", " ", snippet).strip().rstrip(" ：:—-")
+        flat = _strip_md(flat)
+        return flat if len(flat) <= max_len else flat[:max_len - 1].rstrip() + "…"
+
+    for msg in reversed(_slice(messages, start_idx)):
+        body = getattr(msg, "content", None)
+        if body is None:
+            body = msg if isinstance(msg, str) else ""
+        if not body or "当前步骤" not in body:
+            continue
+        in_summary = _SUMMARY_STEP_RE.findall(body)
+        if in_summary:
+            return _shorten(in_summary[-1])
+        # Last `📌` capture that does not itself contain a `❌ 当前步骤:` marker.
+        pinned = [s for s in _STEP_RE.findall(body) if not _DONE_STEP_RE.search(s)]
+        if pinned:
+            return _shorten(pinned[-1])
+    return ""
+
+
+def is_active(agent, messages=None, start_idx: int = 0) -> bool:
+    """True when `agent` has a stashed `in_plan_mode` value, or when
+    `messages[start_idx:]` mention a resolvable `plan_*/plan.md`."""
+    stashed = bool(_stashed_plan_path(agent))
+    return stashed or find_path_in_messages(messages, start_idx) is not None
+
+
+def resolve_path(agent, messages=None, start_idx: int = 0) -> Optional[str]:
+    """Stashed plan path if it resolves to a file, else the latest path found in `messages`."""
+    stashed = _resolve_stashed(_stashed_plan_path(agent))
+    return stashed or find_path_in_messages(messages, start_idx)
+
+
+def summary(items: list[tuple[str, str]]) -> tuple[int, int]:  # -> (n_done, n_total)
+    return [st for _, st in items].count("done"), len(items)
+
+
+def is_complete(items: list[tuple[str, str]]) -> bool:  # vacuously True for an empty list
+    return not any(st != "done" for _, st in items)
diff --git a/frontends/tuiapp.py b/frontends/tuiapp.py
index 07345d2c..235134bc 100644
--- a/frontends/tuiapp.py
+++ b/frontends/tuiapp.py
@@ -86,8 +86,11 @@ def stash(match: re.Match[str]) -> str:
         placeholders.append(match.group(0))
         return f"\x00PH{len(placeholders) - 1}\x00"
 
-    safe = re.sub(r"`{4,}.*?`{4,}", stash, text, flags=re.DOTALL)
-    safe = re.sub(r"`{4,}[^`].*$", stash, safe, flags=re.DOTALL)
+    # Line-anchored fence matcher — see tuiapp_v2.fold_turns for rationale.
+    # Unanchored variant mis-paired backticks embedded in file_read output
+    # with later real fences, swallowing turn markers and ballooning the
+    # final "text" segment to MBs (1.85s markdown render on /continue).
+    safe = re.sub(r"^`{4,}.*?^`{4,}\n?", stash, text, flags=re.DOTALL | re.MULTILINE)
     parts = re.split(r"(\**LLM Running \(Turn \d+\) \.\.\.\**)", safe)
 
     def restore(part: str) -> str:
diff --git a/frontends/tuiapp_v2.py b/frontends/tuiapp_v2.py
index bfe072c9..2a8a78be 100644
--- a/frontends/tuiapp_v2.py
+++ b/frontends/tuiapp_v2.py
@@ -142,23 +142,26 @@ def _hint_terminal_capabilities() -> None:
     "Tip: /export clip 把上一条回复复制到剪贴板；/export all 给出完整日志路径。",
     "Tip: /branch [name] 从当前历史分裂出新会话，互不污染。",
     "Tip: ask_user 题目里写 [多选] 自动切到 SelectionList；任何 picker 都有 \"Type something\" 走自由输入。",
-    "Tip: plan 模式下的 todo 会自动渲染到顶部的 📋 Plan 面板，全部完成后自动消失。",
+    "Tip: plan 模式下的 todo 会渲染在消息区与输入框之间的 📋 Plan 卡片，完成后自动消失。",
 )
 
 
-def _random_tip() -> str:
+def _random_tip(exclude: str = "") -> str:
+    """Pick a tip distinct from `exclude` so rotation doesn't repeat."""
     import random
-    return random.choice(_TIPS)
+    pool = [t for t in _TIPS if t != exclude] or list(_TIPS)
+    return random.choice(pool)
 
 
-def _tip_line():
-    """Render `└ Tip: …` as a styled Rich Text. Used directly in compose()
-    so the first paint already includes the line — no post-mount race."""
+def _tip_line(text: str = ""):
+    """`└ Tip: …` as styled Rich Text; empty `text` → blank pulse line."""
     from rich.text import Text as _T
     t = _T()
+    if not text:
+        return t
     t.append("└ ", style="#6e7681")
     t.append("Tip: ", style="bold #6e7681")
-    t.append(_random_tip().removeprefix("Tip: "), style="#6e7681")
+    t.append(text.removeprefix("Tip: "), style="#6e7681")
     return t
 
 # Defensive cleaners for ask_user candidates. The model occasionally smuggles
@@ -239,8 +242,9 @@ def fold_turns(text: str) -> list[dict]:
     def stash(m):
         placeholders.append(m.group(0))
         return f"\x00PH{len(placeholders) - 1}\x00"
-    safe = re.sub(r"`{4,}.*?`{4,}", stash, text, flags=re.DOTALL)
-    safe = re.sub(r"`{4,}[^`].*$", stash, safe, flags=re.DOTALL)
+    # Line-anchored so backticks embedded in tool output (e.g. `N|\`\`\`\``
+    # gutter from file_read) don't pair with later real fences.
+    safe = re.sub(r"^`{4,}.*?^`{4,}\n?", stash, text, flags=re.DOTALL | re.MULTILINE)
     parts = re.split(r"(\**LLM Running \(Turn \d+\) \.\.\.\**)", safe)
     parts = [re.sub(r"\x00PH(\d+)\x00", lambda m: placeholders[int(m.group(1))], p) for p in parts]
     if len(parts) < 4:
@@ -592,6 +596,7 @@ def _align_md_renders(narrow_raw: str, wide_raw: str):
 import chatapp_common  # noqa: F401
 from chatapp_common import format_restore
 from btw_cmd import handle_frontend_command as btw_handle
+from review_cmd import handle as review_handle
 from continue_cmd import list_sessions as continue_list, extract_ui_messages as continue_extract
 from export_cmd import last_assistant_text, export_to_temp, wrap_for_clipboard
 
@@ -801,6 +806,20 @@ def _palette_from_resolved_vars(v: dict[str, str], dark: bool) -> dict[str, str]
     scrollbar-color-active: $ga-dim;
 }
 
+/* Plan/todo panel — fixed 5-row card between messages and composer.
+   `display: none` default so the empty post-compose frame doesn't flash;
+   renderer flips it on once items materialize. Fixed height (no scroll)
+   keeps layout stable; body truncates to 4 items + "+N more" footer. */
+#planbar {
+    display: none;
+    height: 5;
+    max-height: 5;
+    background: $ga-sel-bg;
+    padding: 0 1;
+    margin: 0 0 1 0;
+    border-left: thick $ga-green;
+}
+
 /* `└ Tip:` footer — one dim row, never grows. */
 #tipbar {
     height: 1;
@@ -924,16 +943,14 @@ class ChatMessage:
     _segment_widgets: list = field(default_factory=list, repr=False)
     _segment_sig: tuple = field(default=(), repr=False)
     _spinner_widget: Any = field(default=None, repr=False)
-    # Wall-clock start of streaming for this assistant turn — drives the spinner's
-    # `(Xm Ys · ↑ N.Nk · gerund...)` annotation. Set on first stream chunk.
+    # Stream start + token baselines so the spinner shows *this turn's* deltas.
     _stream_started_at: Optional[float] = field(default=None, repr=False)
-    # Token snapshot captured at stream start so the spinner can show *this turn's*
-    # input cost rather than the lifetime cumulative.
     _stream_baseline_input: int = field(default=0, repr=False)
-    # Per-segment rendered-Text cache keyed by (seg_content_hash, width). Survives
-    # fold-toggle because toggling visibility doesn't mutate any segment's content,
-    # so re-rendering the same Markdown twice is wasted work — this turns a ~60ms
-    # remount into a <5ms widget-rebuild even on long multi-turn messages.
+    _stream_baseline_output: int = field(default=0, repr=False)
+    # Frozen `(elapsed, last_in, last_out)` at done→True; keeps the post-turn
+    # card from ticking when the next turn shifts cost_tracker deltas.
+    _done_summary: Optional[tuple] = field(default=None, repr=False)
+    # Per-(seg_hash, width) Text cache; survives fold-toggle re-mounts.
     _seg_render_cache: dict = field(default_factory=dict, repr=False)
 
 
@@ -954,19 +971,22 @@ class AgentSession:
     input_pastes: dict[int, str] = field(default_factory=dict)
     input_paste_counter: int = 0
     buffer: str = ""
-    # Lazy-initialized in `_refresh_topbar` the first tick `status == "running"`
-    # is observed. Drives the topbar dot's heat-color ramp and the elapsed label.
+    # Drives topbar heat-color ramp + elapsed label; set on first running tick.
     _busy_since: Optional[float] = None
-    # When a run transitions running→idle we briefly flash the dot green; this
-    # holds the timestamp of that transition so the flash decays after ~5s.
+    # Stamps running→idle; topbar dot flashes green for ~5s after.
     _done_at: Optional[float] = None
-    # ask_user INTERRUPT events captured by the per-agent turn_end hook.
-    # Drained by the display thread when the assistant turn marks done.
+    # ask_user INTERRUPT events; drained by display thread on turn done.
     ask_user_events: Any = field(default_factory=lambda: queue.Queue())
-    # Set to {question: str} after user picks the free-text option in an
-    # ask_user picker. The next user submission gets intercepted into a
-    # 2-step `Ready to submit your answer?` confirmation.
+    # Pending `{question:str}` after the user picks free-text in an ask_user
+    # picker; next submission becomes a 2-step "Ready to submit?" confirm.
     free_text_pending: Optional[dict] = None
+    # Plan state: items + grace-period timers (3s farewell, 1.5s lost-grace).
+    plan_items: list = field(default_factory=list)
+    plan_complete_since: Optional[float] = None
+    plan_lost_since: Optional[float] = None
+    # Boundary between restored history (≤ idx) and this run (> idx);
+    # `/continue` bumps to `len(messages)` so old plan cards don't resurrect.
+    plan_scan_baseline: int = 0
 
 
 def default_agent_factory() -> Any:
@@ -991,6 +1011,7 @@ def default_agent_factory() -> Any:
     ("/stop",     "",                 "中止当前任务"),
     ("/llm",      "[n]",              "查看 / 切换模型"),
     ("/btw",      "<question>",       "side question — 不打断主 agent"),
+    ("/review",   "[request]",         "in-session 代码审查（直接输出报告）"),
     ("/continue", "[n|name]",         "列出 / 恢复历史会话"),
     ("/cost",     "[all]",            "显示当前会话 token 用量（all = 所有会话）"),
     ("/export",   "clip|<file>|all",  "导出最后回复"),
@@ -1362,9 +1383,29 @@ async def _on_paste(self, event: events.Paste) -> None:
             return
         if self._paste_file_from_clipboard():
             event.stop(); event.prevent_default(); return
+        # Git-bash / mintty fallback: PIL.ImageGrab can't return Image objects
+        # in that TTY env, but the OS clipboard does hold the file path the
+        # screenshot tool wrote. Treat a single-line, on-disk path as if the
+        # file grab had succeeded — same placeholder + `_pastes` entry.
+        if self._paste_file_from_text(event.text):
+            event.stop(); event.prevent_default(); return
         self._insert_paste_text(event.text)
         event.stop(); event.prevent_default()
 
+    def _paste_file_from_text(self, raw: str) -> bool:
+        """Insert an `[Image #n]` / `[File #n]` marker when `raw` is one existing file path."""
+        path = raw.strip().strip('"').strip("'") if raw else ""
+        if "\n" in path or "\r" in path or len(path) > 1024 or not os.path.isfile(path):
+            return False
+        self._paste_counter += 1
+        slot = self._paste_counter
+        self._pastes[slot] = path
+        if os.path.splitext(path)[1].lower() in _IMAGE_EXTS:
+            self._insert_via_keyboard(f"[Image #{slot}]")
+        else:
+            self._insert_via_keyboard(f"[File #{slot}]")
+        return True
+
     async def _on_key(self, event: events.Key) -> None:
         # 1) command palette routing
         try:
@@ -1799,6 +1840,10 @@ def __init__(self, agent_factory: Optional[AgentFactory] = None) -> None:
         self.agent_factory: AgentFactory = agent_factory or default_agent_factory
         self.sessions: dict[int, AgentSession] = {}
         self.current_id: Optional[int] = None
+        # Wall-clock marker used by `/cost` to scope subagent log scans to
+        # logs touched after the TUI started — pre-launch leftovers shouldn't
+        # bleed into "this run's" total.
+        self._started_at: float = time.time()
         self._ids = count(1)
         self._suppress_palette_open = False
         self.fold_mode: bool = True
@@ -1834,17 +1879,21 @@ def __init__(self, agent_factory: Optional[AgentFactory] = None) -> None:
             "rename": self._cmd_rename,
             "branch": self._cmd_branch, "rewind": self._cmd_rewind, "clear": self._cmd_clear,
             "stop": self._cmd_stop, "llm": self._cmd_llm, "export": self._cmd_export,
-            "restore": self._cmd_restore, "btw": self._cmd_btw, "continue": self._cmd_continue,
-            "cost": self._cmd_cost,
+            "restore": self._cmd_restore, "btw": self._cmd_btw, "review": self._cmd_review,
+            "continue": self._cmd_continue, "cost": self._cmd_cost,
             "quit": self._cmd_quit, "exit": self._cmd_quit,
         }
         try:
             import cost_tracker; cost_tracker.install()
         except Exception:
             pass
-        # Best-effort: drop session_names entries whose log was rotated away
-        # (e.g. month-old logs the user deleted). Keeps the registry tidy so
-        # `/continue <name>` never resolves to a vanished file.
+        # Patch GenericAgent for /review in case chatapp_common didn't wire it.
+        try:
+            from agentmain import GenericAgent as _GA
+            import review_cmd; review_cmd.install(_GA)
+        except Exception:
+            pass
+        # Drop session_names entries pointing at rotated-away logs.
         try:
             import session_names; session_names.gc()
         except Exception:
@@ -1856,6 +1905,7 @@ def compose(self) -> ComposeResult:
             yield Static("", id="sidebar")
             with Vertical(id="main"):
                 yield VerticalScroll(id="messages")
+                yield Static("", id="planbar")
                 yield OptionList(id="palette")
                 yield InputArea(
                     "",
@@ -1869,15 +1919,19 @@ def compose(self) -> ComposeResult:
                 # Tip line sits inside #main so it doesn't compete for height
                 # with #body's 1fr. Content set at compose so the first frame
                 # already shows it.
-                yield Static(_tip_line(), id="tipbar")
+                yield Static(_tip_line(_random_tip()), id="tipbar")
         yield Static(render_bottombar(), id="bottombar")
 
     def on_mount(self) -> None:
         self.add_session("main")
         self._system("Welcome to GenericAgent TUI. 按 / 唤起命令面板，Ctrl+N 新建会话。")
+        # CSS `#planbar { display: none }` keeps it hidden by default —
+        # the renderer flips it on once items materialize.
         self.query_one("#input", InputArea).focus()
         self.set_interval(0.5, self._tick)
         self._patch_auto_scroll_for_selection()
+        self._start_plan_watcher()
+        self._start_tip_rotator()
         self._apply_responsive_layout()
         # Disable alternate scroll mode (?1007). Textual enables ?1006 SGR mouse but doesn't
         # turn off ?1007, which on macOS Terminal / iTerm2 makes the wheel emit both mouse
@@ -2837,6 +2891,46 @@ def worker():
 
         threading.Thread(target=worker, daemon=True, name="ga-tui-btw").start()
 
+    def _cmd_review(self, args, raw):
+        """Handle `/review [request]`: build the prompt with `review_handle` and
+        submit it through `submit_user_message`, showing the typed command as
+        the user message. `help`/`?`/`-h`/`--help` print the queued reply's
+        `done` field (or the returned text); a running session is refused."""
+        typed = (raw or "").strip()
+        if typed.startswith(("/review ", "/review\t")):
+            request = typed[len("/review"):].strip()
+        elif typed == "/review":
+            request = ""
+        else:
+            request = " ".join(args).strip()
+        sess = self.current
+        if request in ("help", "?", "-h", "--help"):
+            try:
+                outbox = queue.Queue()
+                rendered = review_handle(sess.agent, request, outbox)
+                try:
+                    reply = outbox.get_nowait()
+                    self._system(str(reply.get("done") or ""))
+                except queue.Empty:
+                    if rendered:
+                        self._system(rendered)
+            except Exception as err:
+                self._system(f"❌ /review help 失败: {type(err).__name__}: {err}")
+            return
+        if sess.status == "running":
+            self._system(f"#{sess.agent_id} 正在跑，/stop 后再发。")
+            return
+        try:
+            prompt = review_handle(sess.agent, request, queue.Queue())
+        except Exception as err:
+            self._system(f"❌ /review 初始化失败: {type(err).__name__}: {err}")
+            return
+        if not prompt:
+            self._system("❌ /review 未生成审查提示。")
+            return
+        shown = typed or "/review"
+        self.submit_user_message(prompt, display_text=shown)
+
     def _cmd_continue(self, args, raw):
         sess = self.current
         m = re.match(r"/continue\s+(\S.*?)\s*$", (raw or "").strip())
@@ -2908,8 +3002,18 @@ def _do_continue_restore(self, path: str) -> str:
                 pass
         def _finish():
             sess.messages.clear()
+            # Plan state belongs to the *previous* conversation. Clearing it
+            # along with messages stops the planbar from leaking stale items
+            # (`Plan (3/7)` from #4 qxs) into the freshly-restored session.
+            sess.plan_items = []
+            sess.plan_complete_since = None
+            sess.plan_lost_since = None
+            self._plan_mtime.pop(sess.agent_id, None)
             for h in continue_extract(path):
                 sess.messages.append(ChatMessage(role=h["role"], content=h["content"]))
+            # Baseline past restored history so the scanner ignores the prior
+            # session's plan.md; only re-shows on a fresh enter_plan_mode.
+            sess.plan_scan_baseline = len(sess.messages)
             try:
                 import session_names
                 nm = session_names.name_for(path)
@@ -2966,21 +3070,43 @@ def _section(sid: int, sess, t) -> list[str]:
                     f"{_k(t.cache_create)} created  ·  "
                     f"{t.cache_hit_rate():.1f}% hit"
                 )
-            ctx = cost_tracker.context_limit_for(model)
-            if ctx and t.last_input > 0:
-                used = t.last_input
-                pct_left = max(0.0, (ctx - used) / ctx * 100.0)
+            try: backend = sess.agent.llmclient.backend
+            except Exception: backend = None
+            cap = cost_tracker.context_window_chars(backend) if backend else 0
+            used = cost_tracker.current_input_chars(backend) if backend else 0
+            if cap > 0:
+                pct_left = max(0.0, (cap - used) / cap * 100.0)
                 ls.append(
                     f"  Context window:  {pct_left:>5.0f}% left  "
-                    f"({_k(used)} used / {_k(ctx)})"
+                    f"({_k(used)} chars used / {_k(cap)} cap)"
                 )
             ls.append(f"  Requests:        {t.requests:>7}")
             return ls
 
+        # Scope subagent logs to this TUI run so prior-session logs don't bleed in.
+        try: sub = cost_tracker.scan_subagent_logs(since=getattr(self, "_started_at", 0.0))
+        except Exception: sub = None
+
+        def _sub_section() -> list[str]:
+            if not sub or sub.total_tokens() == 0: return []
+            ls = ["", f"subagents (扫描 temp/*/stdout.log)"]
+            ls.append(
+                f"  Token usage:     {_k(sub.total_tokens()):>7} total  "
+                f"({_k(sub.total_input_side())} input + {_k(sub.output)} output)"
+            )
+            if sub.cache_read or sub.cache_create:
+                ls.append(
+                    f"  Cache:           {_k(sub.cache_read):>7} read  ·  "
+                    f"{_k(sub.cache_create)} created  ·  "
+                    f"{sub.cache_hit_rate():.1f}% hit"
+                )
+            ls.append(f"  Requests:        {sub.requests:>7}")
+            return ls
+
         lines: list[str] = []
         if show_all:
             trackers = cost_tracker.all_trackers()
-            if not trackers:
+            if not trackers and not (sub and sub.total_tokens()):
                 lines = ["✦ Token usage", "  (尚无任何 LLM 调用记录)"]
             else:
                 # Resolve each thread back to a session if we still know it; otherwise
@@ -3004,12 +3130,14 @@ def _section(sid: int, sess, t) -> list[str]:
                             f"({_k(t.total_input_side())} input + {_k(t.output)} output)"
                         )
                         lines.append(f"  Requests:        {t.requests:>7}")
+                lines += _sub_section()
         else:
             sess = self.current
             tname = sess.thread.name if sess.thread else f"ga-tui-agent-{sess.agent_id}"
             t = cost_tracker.get(tname)
             lines.append("✦ Token usage")
             lines += _section(sess.agent_id, sess, t)
+            lines += _sub_section()
         self._system("\n".join(lines))
 
     def _cmd_export(self, args, raw):
@@ -3122,12 +3250,9 @@ def on_unmount(self) -> None:
         self._reset_terminal_title()
 
     # ---------------- agent task + stream ----------------
-    def submit_user_message(self, text: str, images: Optional[list[str]] = None) -> int:
+    def submit_user_message(self, text: str, images: Optional[list[str]] = None, display_text: Optional[str] = None) -> int:
         sess = self.current
-        # Free-text ask_user interception: route through the 2-step
-        # `Ready to submit your answer?` confirmation card before letting
-        # the agent see the answer. Only triggers when the picker armed
-        # `sess.free_text_pending`; the rest of the submit path is unchanged.
+        # Free-text ask_user answers go through a 2-step submit-confirm card.
         if self._maybe_intercept_free_text(sess, text):
             return -1
         if sess.status == "running":
@@ -3139,7 +3264,8 @@ def submit_user_message(self, text: str, images: Optional[list[str]] = None) ->
         sess.buffer = ""
         sess.status = "running"
         image_paths = list(images or [])
-        sess.messages.append(ChatMessage("user", text, image_paths=image_paths))
+        visible_text = text if display_text is None else display_text
+        sess.messages.append(ChatMessage("user", visible_text, image_paths=image_paths))
         sess.messages.append(ChatMessage("assistant", "", task_id=tid, done=False))
         self._refresh_all()
         try:
@@ -3183,12 +3309,12 @@ def _on_stream(self, agent_id, task_id, text, done):
             s.status = "idle"
             s.current_display_queue = None
         self._update_assistant(agent_id, text, task_id=task_id, done=done, refresh_chrome=True)
+        # End-of-turn re-parse only; mid-stream `[...]` fragments would flash.
         if done:
+            self._update_plan_state(s, text)
             self._drain_ask_user_events(s)
 
-    # `[多选]` / `[multi]` / `select all` in the question switches the picker to
-    # a multi-select widget. The flag is intentionally heuristic so existing
-    # ask_user calls (no schema change in core) can opt in by phrasing alone.
+    # Phrasing-based opt-in for multi-select picker (no core schema change).
     _MULTI_RE = re.compile(r"\[?(?:多选|multi(?:[-_ ]?select)?|select all)\]?", re.IGNORECASE)
 
     def _drain_ask_user_events(self, sess: AgentSession) -> None:
@@ -3379,6 +3505,172 @@ def _update_assistant(self, agent_id, text, *, task_id=None, done=True, refresh_
             self._refresh_topbar()
         self._ensure_spinner()
 
+    # ---------------- Plan/todo panel ----------------
+    # State machine (graces absorb mid-stream parse misses / let final tally read):
+    #   hidden → active(n_done/n_total) → complete(n/n) → [3s grace] → hidden
+    #   active/complete → empty → [1.5s grace] → hidden
+    _PLAN_GRACE_SEC = 3.0
+    _PLAN_LOST_GRACE_SEC = 1.5
+
+    def _update_plan_state(self, sess: AgentSession, _stream_text: str = "") -> None:
+        import plan_state
+        prev = sess.plan_items
+        # Detect plan mode: `working['in_plan_mode']` first, fallback to per-
+        # session message scan for a `plan_*/plan.md` reference. Strictly
+        # per-session via `plan_scan_baseline` to avoid /continue bleed.
+        new_items: list = []
+        msgs = sess.messages
+        base = sess.plan_scan_baseline
+        if plan_state.is_active(sess.agent, messages=msgs, start_idx=base):
+            path = plan_state.resolve_path(sess.agent, messages=msgs, start_idx=base)
+            if path:
+                try:
+                    with open(path, encoding="utf-8", errors="replace") as f:
+                        new_items = plan_state.extract(f.read())
+                except OSError:
+                    new_items = []
+        now_c = plan_state.is_complete(new_items) and new_items
+        was_c = plan_state.is_complete(prev) and prev
+        if now_c and not was_c: sess.plan_complete_since = time.time()
+        elif not now_c:         sess.plan_complete_since = None
+        if not new_items and prev:
+            sess.plan_lost_since = time.time()
+        elif new_items:
+            sess.plan_lost_since = None
+            sess.plan_items = new_items
+        if sess.agent_id == self.current_id:
+            self._refresh_planbar()
+
+    def _refresh_planbar(self) -> None:
+        try: bar = self.query_one("#planbar", Static)
+        except Exception: return
+        sess = self.sessions.get(self.current_id) if self.current_id is not None else None
+        items = sess.plan_items if sess else []
+        if sess and sess.plan_lost_since is not None:
+            if time.time() - sess.plan_lost_since >= self._PLAN_LOST_GRACE_SEC:
+                sess.plan_items = []; sess.plan_lost_since = None; items = []
+        import plan_state
+        msgs = sess.messages if sess else None
+        base = sess.plan_scan_baseline if sess else 0
+        # Plan-mode armed but no items yet → placeholder (covers the
+        # enter_plan_mode → first plan.md write gap).
+        if not items:
+            if sess and plan_state.is_active(sess.agent, messages=msgs, start_idx=base):
+                self._render_planbar_placeholder(bar, sess)
+                return
+            self._set_planbar_visible(bar, False); return
+        n_done, n_total = plan_state.summary(items)
+        complete = plan_state.is_complete(items)
+        if complete and sess and sess.plan_complete_since is not None:
+            if time.time() - sess.plan_complete_since >= self._PLAN_GRACE_SEC:
+                self._set_planbar_visible(bar, False); return
+        # 5-row budget: header(1) + step(0/1) + tasks(N) + overflow(0/1).
+        step = plan_state.current_step(msgs, start_idx=base)
+        budget = 4 - (1 if step else 0)
+        ordered = [(c, st) for c, st in items if st != "done"] + \
+                  [(c, st) for c, st in items if st == "done"]
+        body_lines = budget - 1 if len(ordered) > budget else budget
+        shown = ordered[:body_lines]
+        overflow = max(0, len(ordered) - body_lines)
+        sig = (tuple(shown), overflow, step, bool(complete and sess and sess.plan_complete_since))
+        if getattr(bar, "_plan_sig", None) == sig and bar.display: return
+        bar._plan_sig = sig
+        body = Text()
+        head = f"✓ Plan complete ({n_total}/{n_total})\n" if complete else f"📋 Plan ({n_done}/{n_total})\n"
+        body.append(head, style=f"bold {C_GREEN}")
+        if step:
+            body.append("  ▸ ", style=C_GREEN)
+            body.append(step[:120] + "\n", style=C_MUTED)
+        for c, st in shown:
+            if st == "done": body.append("  ✔ ", style=C_GREEN); body.append(c + "\n", style=C_DIM)
+            else:            body.append("  ☐ ", style=C_DIM);  body.append(c + "\n", style=C_FG)
+        if overflow:
+            body.append(f"  ⋮ +{overflow} more", style=C_DIM)
+        bar.update(body)
+        self._set_planbar_visible(bar, True)
+
+    def _render_planbar_placeholder(self, bar: Static, sess: AgentSession) -> None:
+        # Placeholder for armed-but-empty plan mode (pre-first plan.md write).
+        import plan_state
+        base = sess.plan_scan_baseline
+        path = (plan_state._stashed_plan_path(sess.agent)
+                or plan_state.find_path_in_messages(sess.messages, start_idx=base)
+                or "")
+        hint = "/".join(path.replace("\\", "/").rstrip("/").split("/")[-2:]) if path else "plan.md"
+        step = plan_state.current_step(sess.messages, start_idx=base)
+        sig = ("__placeholder__", hint, step)
+        if getattr(bar, "_plan_sig", None) == sig and bar.display: return
+        bar._plan_sig = sig
+        body = Text()
+        body.append("📋 Plan 模式已激活\n", style=f"bold {C_GREEN}")
+        if step:
+            body.append("  ▸ ", style=C_GREEN)
+            body.append(step[:120] + "\n", style=C_MUTED)
+        body.append(f"  等待写入 {hint} …", style=C_DIM)
+        bar.update(body)
+        self._set_planbar_visible(bar, True)
+
+    def _set_planbar_visible(self, bar: Static, visible: bool) -> None:
+        # Repaint only on show→hide transition; idle ticks no-op.
+        if not visible:
+            if not bar.display: return
+            bar.display = False
+            bar.update(Text())
+            bar._plan_sig = None
+            return
+        if not bar.display: bar.display = True
+
+    def _start_plan_watcher(self) -> None:
+        if getattr(self, "_plan_timer", None) is not None: return  # timer already installed
+        self._plan_mtime: dict = {}  # agent_id → last plan-file mtime seen by the poller
+        try: self._plan_timer = self.set_interval(1.0, self._poll_plan_files)  # poll once per second
+        except Exception: pass  # best effort: `_plan_timer` stays unset, so a later call retries
+
+    def _poll_plan_files(self) -> None:
+        # Timer tick for the visible session only: reload its plan file when the mtime changes, else just repaint.
+        import plan_state
+        sess = self.sessions.get(self.current_id) if self.current_id is not None else None
+        if sess is None: return
+        msgs = sess.messages
+        base = sess.plan_scan_baseline
+        if not plan_state.is_active(sess.agent, messages=msgs, start_idx=base):
+            self._refresh_planbar(); return  # plan mode not active → repaint only
+        path = plan_state.resolve_path(sess.agent, messages=msgs, start_idx=base)
+        if not path:
+            self._refresh_planbar(); return  # no plan file resolved yet
+        try: mtime = os.path.getmtime(path)
+        except OSError:
+            self._refresh_planbar(); return  # file cannot be stat'ed → repaint only
+        if self._plan_mtime.get(sess.agent_id) != mtime:
+            self._plan_mtime[sess.agent_id] = mtime  # remember so an unchanged file is not re-read
+            self._update_plan_state(sess); return  # that call repaints when `sess` is the visible session
+        self._refresh_planbar()  # tick grace timers
+
+    # ---------------- Tip rotation ----------------
+    # A rotation fires every 12s; each one blanks the bar for 1s, then shows the next tip.
+    _TIP_SHOW_SEC = 12.0
+    _TIP_BLANK_SEC = 1.0
+
+    def _start_tip_rotator(self) -> None:
+        if getattr(self, "_tip_timer", None) is not None: return  # timer already installed
+        self._tip_current: str = ""  # tip most recently chosen by `_rotate_tip`
+        try: self._tip_timer = self.set_interval(self._TIP_SHOW_SEC, self._rotate_tip)  # one rotation per interval
+        except Exception: pass  # best effort: `_tip_timer` stays unset, so a later call retries
+
+    def _rotate_tip(self) -> None:
+        try: bar = self.query_one("#tipbar", Static)
+        except Exception: return  # `#tipbar` lookup failed → skip this rotation
+        bar.update(_tip_line(""))  # blank pulse
+        nxt = _random_tip(exclude=self._tip_current)  # current tip is passed as `exclude`
+        self._tip_current = nxt
+        try: self.set_timer(self._TIP_BLANK_SEC, lambda: self._show_tip(nxt))  # reveal after the blank gap
+        except Exception: self._show_tip(nxt)  # scheduling failed → show immediately
+
+    def _show_tip(self, tip: str) -> None:
+        try: bar = self.query_one("#tipbar", Static)
+        except Exception: return  # `#tipbar` lookup failed → nothing to paint
+        bar.update(_tip_line(tip))  # render `tip` through the shared `_tip_line` formatter
+
     # ---------------- UI refresh ----------------
     def _system(self, text: str) -> None:
         if self.current_id is None: return
@@ -3391,6 +3683,7 @@ def _refresh_all(self):
         self._refresh_topbar()
         self._refresh_sidebar()
         self._refresh_messages()
+        self._refresh_planbar()
         self._ensure_spinner()
 
     def _swap_input_for_session(self) -> None:
@@ -3655,23 +3948,23 @@ def cached_render(content: str) -> "_MdRender":
 
     _SPINNER_FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
 
-    # Easter-egg gerunds rotated through the spinner annotation — keeps the
-    # streaming wait feeling alive rather than dead-frozen.
+    # Spinner gerund pool (stable per-message via id-hash; separate from _DONE_GERUNDS).
     _SPINNER_GERUNDS = (
         "Pondering", "Reticulating", "Sleuthing", "Hatching", "Pouncing",
         "Brewing", "Sharpening", "Untangling", "Compiling", "Unraveling",
         "Distilling", "Calibrating", "Marinating", "Conjuring", "Foraging",
         "Spelunking", "Synthesizing", "Refactoring thoughts", "Tracing breadcrumbs",
         "Following the rabbit hole",
+        "Routing", "Threading", "Polling", "Spinning", "Hooking",
+        "Patching", "Caching", "Yielding", "Hydrating", "Folding",
+        "Streaming", "Resolving", "Reaping", "Tuning",
     )
 
     def _spinner_glyph(self) -> str:
         return self._SPINNER_FRAMES[self._spinner_frame % len(self._SPINNER_FRAMES)]
 
     def _spinner_gerund(self, m) -> str:
-        # Stable per-message: rotate by message identity hash so the gerund
-        # doesn't strobe with every spinner frame. ID-keyed avoids ChatMessage
-        # __hash__ requirements and survives content mutation.
+        # ID-hashed → stable per-message; survives content mutation.
         idx = (id(m) // 16) % len(self._SPINNER_GERUNDS)
         return self._SPINNER_GERUNDS[idx]
 
@@ -3684,33 +3977,94 @@ def _humanize_tokens(n: int) -> str:
         return f"{n / 1_000_000.0:.2f}M"
 
     def _spinner_annotation(self, m) -> Text:
-        """Render `⠋ Gerund... (Xm Ys · ↑ N.Nk tokens)` for a streaming message.
-        The gerund hue shifts with elapsed + token deltas (see _gerund_color)."""
+        """Render `⠋ Gerund… (Xm Ys · ↑ N · ↓ M)` for a streaming message.
+        ↑/↓ are the latest LLM call's prompt / completion sizes, gated on
+        cumulative counters moving past the baselines captured at stream start
+        (otherwise the prior turn's tail values leak in on prompt submit).
+        """
         out = Text()
         elapsed = int(time.time() - m._stream_started_at) if m._stream_started_at else 0
-        delta_in = 0
-        try:
-            import cost_tracker
-            sess = self.sessions.get(self.current_id)
-            tname = sess.thread.name if sess and sess.thread else f"ga-tui-agent-{self.current_id}"
-            t = cost_tracker.get(tname)
-            delta_in = max(0, t.input + t.cache_create + t.cache_read - m._stream_baseline_input)
-        except Exception:
-            pass
-        gerund_style = _gerund_color(elapsed, delta_in)
+        last_in, last_out = self._live_call_tokens(m)
+        gerund_style = _gerund_color(elapsed, last_in)
         out.append(self._spinner_glyph(), style=gerund_style)
         out.append(f" {self._spinner_gerund(m)}…", style=gerund_style)
         bits = []
         if m._stream_started_at:
             bits.append(_fmt_elapsed(elapsed))
-        if delta_in > 0:
-            bits.append(f"↑ {self._humanize_tokens(delta_in)} tokens")
+        if last_in > 0 or last_out > 0:
+            bits.append(f"↑ {self._humanize_tokens(last_in)} · ↓ {self._humanize_tokens(last_out)}")
         if bits:
             out.append("  (", style=C_DIM)
             out.append(" · ".join(bits), style=C_DIM)
             out.append(")", style=C_DIM)
         return out
 
+    def _live_call_tokens(self, m) -> tuple[int, int]:
+        """Return `(last_in, last_out)`: the tracker's latest-call sizes, each
+        reported only once its cumulative counter exceeds the baseline stored
+        on `m`, else 0. Any lookup error yields `(0, 0)`. Used by spinner + done-card."""
+        last_in = last_out = 0
+        try:
+            import cost_tracker
+            sess = self.sessions.get(self.current_id)  # NOTE(review): visible session, not necessarily m's owner — confirm
+            tname = sess.thread.name if sess and sess.thread else f"ga-tui-agent-{self.current_id}"
+            t = cost_tracker.get(tname)
+            cum_in = t.input + t.cache_create + t.cache_read  # same sum `_mark_stream_start` snapshots as the input baseline
+            cum_out = t.output
+            if cum_in > m._stream_baseline_input: last_in = t.last_input
+            if cum_out > m._stream_baseline_output: last_out = t.last_output
+        except Exception:
+            pass  # best effort: keep the zeros rather than break rendering
+        return last_in, last_out
+
+    # Settled-state braille pairs with the spinner frames (⠋…⠏ → ⠿).
+    _DONE_GLYPH = "⠿"
+
+    # Past-tense pool for the post-turn card; reads "{Verb} for Xm Ys".
+    _DONE_GERUNDS = (
+        "Churned", "Ruminated", "Brewed", "Cooked", "Marinated", "Percolated",
+        "Distilled", "Crystallized", "Synthesized", "Sharpened", "Conjured",
+        "Pondered", "Spelunked", "Untangled", "Foraged", "Hatched", "Pounced",
+        "Sleuthed", "Unraveled", "Calibrated", "Mused", "Schemed", "Tinkered",
+        "Forged", "Simmered", "Steeped",
+        "Threaded", "Folded", "Patched", "Streamed", "Cached", "Hooked",
+        "Routed", "Resolved", "Yielded", "Hydrated", "Reaped", "Tuned",
+        "Plotted", "Reviewed", "Audited", "Verified", "Adjudicated",
+        "Conducted", "Orchestrated",
+        "Mapped", "Reduced", "Dispatched",
+        "Recalled", "Stashed", "Indexed",
+    )
+
+    def _done_gerund(self, m) -> str:
+        # Stable per-message — id-hash so re-mount (theme / resize / fold) keeps
+        # the verb; spinner uses a separate pool so live/settled never collide.
+        idx = (id(m) // 16) % len(self._DONE_GERUNDS)
+        return self._DONE_GERUNDS[idx]
+
+    def _done_annotation(self, m) -> Text:
+        """Render `⠿ {Verb} for Xm Ys · ↑ N · ↓ M` after a turn finishes.
+        Numbers frozen via `_done_summary` so re-mounts / next turn don't
+        shift the line."""
+        elapsed, last_in, last_out = m._done_summary or (0, 0, 0)
+        verb = self._done_gerund(m)
+        out = Text()
+        out.append(self._DONE_GLYPH + " ", style=C_GREEN)
+        out.append(f"{verb} for {_fmt_elapsed(int(elapsed))}", style=C_DIM)
+        if last_in > 0 or last_out > 0:
+            out.append("  · ", style=C_DIM)
+            out.append(f"↑ {self._humanize_tokens(last_in)} · ↓ {self._humanize_tokens(last_out)}",
+                       style=C_DIM)
+        return out
+
+    def _capture_done_summary(self, m) -> None:
+        """Freeze `(elapsed, last_in, last_out)` once when an assistant message
+        transitions done→True. Idempotent — repeat calls are no-ops so re-mounts
+        and stream-update passes won't overwrite the snapshot."""
+        if m._done_summary is not None or not m.done: return
+        elapsed = (time.time() - m._stream_started_at) if m._stream_started_at else 0.0
+        last_in, last_out = self._live_call_tokens(m)
+        m._done_summary = (elapsed, last_in, last_out)
+
     def _has_streaming(self) -> bool:
         if self.current_id is None:
             return False
@@ -3742,8 +4096,8 @@ def _spinner_tick(self) -> None:
 
     def _mark_stream_start(self, m) -> None:
         """Lazily timestamp a streaming message so the spinner can show elapsed/tokens.
-        Snapshots the current input-side token total as a baseline so the displayed
-        delta reflects *this* turn only."""
+        Snapshots both input-side and output-side token totals as baselines so
+        the spinner's `↑ N · ↓ M` reflects *this* turn only."""
         m._stream_started_at = time.time()
         try:
             import cost_tracker
@@ -3751,8 +4105,10 @@ def _mark_stream_start(self, m) -> None:
             tname = sess.thread.name if sess and sess.thread else f"ga-tui-agent-{self.current_id}"
             t = cost_tracker.get(tname)
             m._stream_baseline_input = t.input + t.cache_create + t.cache_read
+            m._stream_baseline_output = t.output
         except Exception:
             m._stream_baseline_input = 0
+            m._stream_baseline_output = 0
 
     @staticmethod
     def _segment_sig(segs: list[tuple]) -> tuple:
@@ -3833,13 +4189,31 @@ def _mount_assistant_segments(self, container, m: ChatMessage, segs: list[tuple]
         self._sync_spinner_widget(container, m, anchor)
 
     def _sync_spinner_widget(self, container, m: ChatMessage, anchor) -> None:
-        """Spinner is a tiny dedicated Static after segment widgets — outside Markdown
-        so unclosed code fences / paragraph trimming can't eat it. Mounted iff streaming."""
+        """Tiny dedicated Static after segment widgets — outside Markdown so
+        unclosed code fences / paragraph trimming can't eat it. While streaming
+        shows the spinner annotation; once `m.done` flips True, the same widget
+        becomes the post-turn `⠿ Churned for Xm Ys` card (frozen via
+        `_capture_done_summary`)."""
         if m.done:
-            if m._spinner_widget is not None:
-                try: m._spinner_widget.remove()
+            # `_stream_started_at` is the marker that this message was actually
+            # streamed in this TUI session. Restored /continue history flips
+            # done=True without ever streaming, so skip the card there — a
+            # "⠿ Churned for 0s" badge under every archived turn is just noise.
+            if m._stream_started_at is None:
+                if m._spinner_widget is not None:
+                    try: m._spinner_widget.remove()
+                    except Exception: pass
+                    m._spinner_widget = None
+                return
+            self._capture_done_summary(m)
+            if m._spinner_widget is None:
+                w = Static(self._done_annotation(m), classes="msg spinner")
+                if anchor is None: container.mount(w)
+                else:               container.mount(w, after=anchor)
+                m._spinner_widget = w
+            else:
+                try: m._spinner_widget.update(self._done_annotation(m))
                 except Exception: pass
-                m._spinner_widget = None
             return
         if m._spinner_widget is None:
             if m._stream_started_at is None:
@@ -3878,9 +4252,10 @@ def _stream_update_assistant(self, m: ChatMessage) -> None:
                 last_widget._ga_render = None
                 last_widget.update(Text(last_text, style=C_FG))
             if m.done and m._spinner_widget is not None:
-                try: m._spinner_widget.remove()
+                # Convert the live spinner into the post-turn ⠿ card in place.
+                self._capture_done_summary(m)
+                try: m._spinner_widget.update(self._done_annotation(m))
                 except Exception: pass
-                m._spinner_widget = None
             return
         self._remount_assistant_message(m)
 
@@ -3925,8 +4300,37 @@ def build_arg_parser() -> argparse.ArgumentParser:
     return argparse.ArgumentParser(description="GenericAgent TUI v2 (refined visual style)")
 
 
+def _warn_mintty():
+    """Warn only for direct Git Bash/mintty, not Git Bash inside Windows Terminal."""
+    if sys.platform != 'win32':
+        return
+    # Direct Git Bash uses mintty. Git Bash hosted by Windows Terminal still sets
+    # MSYSTEM, but has WT_SESSION and renders Textual correctly, so do not block it.
+    term_prog = os.environ.get('TERM_PROGRAM', '').lower()
+    wt_session = os.environ.get('WT_SESSION', '')
+    direct_mintty = term_prog == 'mintty' and not wt_session
+    if direct_mintty:
+        print(
+            "\033[33m[ga-tui] WARNING: direct Git Bash/mintty detected.\033[0m\n"
+            "  Textual TUI requires a modern terminal with full VT/xterm support.\n"
+            "  Direct mintty can cause rendering issues (blank screen, garbled output).\n"
+            "\n"
+            "  Recommended alternatives:\n"
+            "    - Windows Terminal Git Bash: wt -p \"Git Bash\" python frontends/tuiapp_v2.py\n"
+            "    - Windows Terminal:          wt python frontends/tuiapp_v2.py\n"
+            "    - CMD:                       python frontends\\tuiapp_v2.py\n"
+            "    - PowerShell:                python frontends/tuiapp_v2.py\n"
+            "\n"
+            "  To continue anyway, set GA_TUI_FORCE=1",
+            file=sys.stderr,
+        )
+        if not os.environ.get('GA_TUI_FORCE'):
+            raise SystemExit(1)
+
+
 def main(argv: Optional[list[str]] = None) -> int:
     build_arg_parser().parse_args(argv)
+    _warn_mintty()
     GenericAgentTUI().run()
     return 0
 
diff --git a/llmcore.py b/llmcore.py
index 3ceaa0c1..2e239d32 100644
--- a/llmcore.py
+++ b/llmcore.py
@@ -297,12 +297,14 @@ def _record_usage(usage, api_mode):
     if not usage: return
     if api_mode == 'responses':
         cached = (usage.get("input_tokens_details") or {}).get("cached_tokens", 0)
-        inp = usage.get("input_tokens", 0)
+        inp = usage.get("input_tokens", 0); out = usage.get("output_tokens", 0)
         print(f"[Cache] input={inp} cached={cached}")
+        if out: print(f"[Output] tokens={out}")
     elif api_mode == 'chat_completions':
         cached = (usage.get("prompt_tokens_details") or {}).get("cached_tokens", 0)
-        inp = usage.get("prompt_tokens", 0)
+        inp = usage.get("prompt_tokens", 0); out = usage.get("completion_tokens", 0)
         print(f"[Cache] input={inp} cached={cached}")
+        if out: print(f"[Output] tokens={out}")
     elif api_mode == 'messages':
         ci, cr, inp = usage.get("cache_creation_input_tokens", 0), usage.get("cache_read_input_tokens", 0), usage.get("input_tokens", 0)
         print(f"[Cache] input={inp} creation={ci} read={cr}")
diff --git a/memory/review_sop.md b/memory/review_sop.md
new file mode 100644
index 00000000..f3d719d0
--- /dev/null
+++ b/memory/review_sop.md
@@ -0,0 +1,169 @@
+# Review Mode SOP
+
+> In-session adversarial code reviewer。用 `/review` 触发,主 agent 在当前对话内
+> 拉起评审,报告直接 echo 到对话,**不开 subagent / 不落盘 / 不打 sentinel**。
+
+---
+
+## 一、何时使用
+
+用户输入 `/review` 命令,或自然语言要求"code review"时启用。
+典型用例:作者刚写完一段代码 → `/review` 对自己的改动做对抗性 review。
+
+---
+
+## 二、快速启动
+
+| 命令 | 行为 |
+|---|---|
+| `/review` | 默认审本次 uncommitted 改动(主 agent 跑 `git diff --stat HEAD` + `git diff HEAD`) |
+| `/review <自然语言请求>` | 按描述的范围去审(可指定文件 / 目录 / 任务) |
+| `/review help` | 显示用法 |
+
+**非 git 仓库**:主 agent 提示用户在下一句 `/review` 塞入具体路径或范围,本轮结束。
+
+---
+
+## 三、入口文件
+
+```
+任意前端 (TUI / Streamlit / wechat / desktop)
+   └─ frontends/review_cmd.py     ← 命令分发,剥 "/review" 前缀,注入 user_request
+       └─ memory/review_sop/review_inline_prompt.txt   ← 完整 in-session 协议
+           └─ memory/code_review_principles.md         ← 15 条好代码原则
+```
+
+- `review_cmd.py:install()` —— monkey-patch `GenericAgent._handle_slash_cmd`,统一接管 `/review`
+- `review_cmd.py:_render_prompt()` —— 加载 prompt 模板,注入 `{user_request}` + `{ga_root}`
+
+---
+
+## 四、三条铁律(reviewer 顶部硬约束,不可违反)
+
+1. **Review-only 只读评审** —— 评审与报告而已。**禁止**修改源文件、调
+   file_write / file_patch / code_run 改业务代码、在产出里写"我接下来去修一下"
+   或暗示要动手。
+2. **Challenge the approach, 不仅找 bug** —— 先问"这条路本身对不对?"再问
+   "实现有没有 bug?":挖隐含假设、评估真实环境故障模式(Windows 路径 / 代理失活 /
+   并发写 / UTF-8 边界 / token 预算耗尽)。
+3. **报告输出完即结束** —— 不复述用户目标、不做 meta 评论、不承诺 follow-up;
+   报告 markdown 直接 echo 到对话,**不落盘 review.md、不打 `[ROUND END]`**。
+
+---
+
+## 五、工作流(5 步,顺序走)
+
+### 步骤 1:必读底料
+
+`file_read("memory/code_review_principles.md")` —— 15 条好代码原则,**每条 finding 必须
+能映射到其中一条**。
+
+### 步骤 2:锁定审阅范围
+
+| 用户输入 | 范围 |
+|---|---|
+| 点名了文件 / 目录 | 审那些 |
+| 描述了任务范围 | `code_run` 跑 `git status -s` + `git diff --stat HEAD` + `git diff HEAD` |
+| 空 / 模糊 | 默认审本次 uncommitted 改动 |
+| 非 git 仓库 | 提示用户塞路径,本轮结束 |
+
+**先把范围列出来发给用户确认**,再开始 `file_read`。
+
+### 步骤 3:逐文件 file_read
+
+超过 800 行分段读。优先看 diff 涉及的行,再看上下文与接口调用方。
+
+### 步骤 4:回答 Q1-Q4 对抗性 framing
+
+- **Q1: Is this the right approach?** — 有没有更简单 / 更标准 / 更安全的实现路径?
+- **Q2: What hidden dependencies could fail?** — OS / shell / 网络 / 并发 / 第三方 API 任一失效?
+- **Q3: What edge / hostile input breaks it?** — 空值、UTF-8、Windows 路径、超长输入、过期 token。
+- **Q4: Is the failure mode observable & recoverable?** — 仅看日志能不能定位?能不能不动手就恢复?
+
+### 步骤 5:列 P0~P3 findings
+
+遵守 §七 防误报八规则 + §八 措辞八规范。提交前对照 §九 输出协议逐项自检。
+
+---
+
+## 六、Severity / Verdict 速查
+
+| Level | 定义 | 例子 |
+|---|---|---|
+| **P0** | 阻塞:破坏正确性 / 丢数据 / 安全漏洞 / 不可逆故障 | 路径穿越、SQL 注入、密钥落日志、并发竞态破坏数据 |
+| **P1** | 高危:契约破坏 / 用户可见错误,但不会立即崩 | 错误只 print 不抛、超时未设、API schema 不一致 |
+| **P2** | 维护性:可读性 / 命名 / 测试空缺 | 函数 > 80 行、duplicate logic、注释与代码不符 |
+| **P3** | 风格 / 微优化 / 可选改进 | 命名小调整、常量提取、import 顺序 |
+
+**Verdict 决议**:任一 P0 → `FAIL`;无 P0 但 ≥ 1 P1 → `CONDITIONAL`;仅 P2/P3 或 0 finding → `PASS`。
+
+---
+
+## 七、防误报八规则(成本低到高,任一答 No → 删 finding)
+
+1. **Discrete & actionable** — 有具体可写的修复?
+2. **Introduced or exposed by this change** — 本次改动引入或放大?
+3. **Not an intentional design choice** — 不是作者刻意取舍?
+4. **Provably affected, not speculated** — 跨文件影响能指出调用栈?
+5. **Evidence-anchored** — 行号 / 代码片段 / 复现至少一项?
+6. **No unstated assumptions** — 不依赖未明说的"应该这样"?
+7. **Author would likely fix if made aware** — 作者会同意修?
+8. **Impact meaningful + proportionate rigor** — 影响足够 + 严谨度匹配代码库?
+
+> 每条规则的展开详见 `memory/review_sop/review_inline_prompt.txt` §5。
+
+---
+
+## 八、措辞八规范
+
+1. **Why-first** — 第一句给原因。
+2. **严重度准确** — 不要把 P2 写得像 P0。
+3. **简洁** — `evidence` / `impact` / `fix` 各 ≤ 1 段。
+4. **少贴大段代码** — `evidence` 代码 ≤ 5 行,超过用 `file:line-line` 引用。
+5. **触发条件显式** — `impact` 首句必带场景 / 输入 / 环境。
+6. **不卑不亢** — 直陈事实,无情绪 / 无开场白。
+7. **即读即懂** — 核心结论放第一句。
+8. **零奉承** — 不写 "Great work, but...", "Thanks for the changes, however..."。
+
+> 展开详见 `memory/review_sop/review_inline_prompt.txt` §6。
+
+---
+
+## 九、输出协议(整段 echo,不落盘)
+
+```
+## Scope
+<一行一个文件,绝对路径或仓库相对路径>
+
+## Verdict
+PASS / CONDITIONAL / FAIL
+
+## Summary
+3-6 行散文:整体印象 + 最重要的 1-2 个风险。
+
+## Design Challenge (Q1-Q4)
+- **Q1 是不是对的方法**: <证据>
+- **Q2 隐藏依赖**: <证据>
+- **Q3 边缘 / 敌意输入**: <证据>
+- **Q4 故障可观测**: <证据>
+
+## Findings (P0 → P3 顺序)
+- **[P0, conf=0.9] file:line-line** 标题(动词开头,≤ 80 字,第一句给原因)
+  - **Evidence**: 代码片段 ≤ 5 行 或 file:N-M 引用
+  - **Impact**: 触发场景 + 后果(第一句必带场景)
+  - **Fix**: 可直接照做的修复思路,≤ 1 段
+  - **Principle**: 对应 code_review_principles 第 N 条
+
+## Cross-file notes
+跨文件耦合 / 命名一致性 / 状态机 / 并发问题。无则 `(none)`。
+
+## Regression tests
+3-5 条具体测试点(输入 / 预期 / 边界)。
+```
+
+---
+
+## 十、扩展点
+
+- **自定义评审条目**:编辑 `memory/code_review_principles.md`,reviewer 启动时整段注入
+- **触发更换**:要把 `/review` 改成别的命令,只动 `frontends/review_cmd.py` 的 `install()` 一处
diff --git a/memory/review_sop/review_inline_prompt.en.txt b/memory/review_sop/review_inline_prompt.en.txt
index 110296f0..de098334 100644
--- a/memory/review_sop/review_inline_prompt.en.txt
+++ b/memory/review_sop/review_inline_prompt.en.txt
@@ -1,58 +1,141 @@
 [/review in-session]
 
 # Role & Boundary
+
 You are the adversarial code reviewer running **in this session**. You do **not** spawn a subagent; continue the conversation here and **echo your report directly into the chat** as the final reply.
-- **Read-only**: do NOT call file_write / file_patch on business code; do NOT promise “I'll fix it next”.
+
+- **Read-only**: do NOT call file_write / file_patch on business code; do NOT promise "I'll fix it next".
 - **No review.md**: do NOT write a file to disk; do NOT print `[ROUND END]`.
 - **Done after the report**: no further tool calls.
 
-# User request (this round)
+⚠ Challenge the approach, not just defects: ask "is this path right?" before "does the implementation have bugs?". Surface implicit assumptions; evaluate real-world failure modes (Windows paths, dead proxies, concurrent writes, UTF-8 boundaries, expired tokens).
+
+---
+# 1. User request (this round)
+
 {user_request}
 
-# Workflow (in order)
+---
+# 2. Workflow (in order, no skipping)
 
 ## Step 1: mandatory reading
-1. `file_read("{ga_root}/memory/code_review_principles.md")` — 15 good-code principles; every finding must map to one.
-2. This `/review` only reads `memory/review_sop/` and `memory/code_review_principles.md`; do not reference other workflow prompts.
+
+`file_read("{ga_root}/memory/code_review_principles.md")` — 15 good-code
+principles; **every finding must map to one**. This `/review` only reads
+`memory/review_sop/` and `memory/code_review_principles.md`; do not pull
+from other workflow prompts.
 
 ## Step 2: lock the review scope
+
 Resolve the user request by priority:
-1. User named files / dirs explicitly → review those.
-2. User described a task scope → run `git status -s`, `git diff --stat HEAD`, `git diff HEAD`, and `git log --oneline -5` if needed.
-3. Empty / vague request → default to the current uncommitted diff: run `git diff --stat HEAD` and `git diff HEAD`.
-4. If git fails and scope is still unclear → tell the user to provide file paths or a concrete scope in the next `/review`, then stop.
 
-Do not ask for extra confirmation after locking the scope; list the actual scope in the final report.
+1. User named files / dirs explicitly → review those;
+2. User described a task scope → run `git status -s`, `git diff --stat HEAD`,
+   `git diff HEAD`, and `git log --oneline -5` if needed;
+3. Empty / vague request → default to the uncommitted diff: run
+   `git diff --stat HEAD` + `git diff HEAD`;
+4. If git fails and scope is still unclear → tell the user to provide file
+   paths or a concrete scope in the next `/review`, then stop.
+
+**Do not ask for extra confirmation** after locking; list the actual scope
+in the final report.
 
 ## Step 3: file_read each reviewed file
-> Split files over 800 lines. Start with diff-touched lines, then surrounding context and callers.
 
-## Step 4: answer Q1-Q4 (adversarial framing)
-- Q1 right approach? Is there a simpler / standard / safer path?
-- Q2 hidden dependencies? What happens if OS, shell, network, concurrency, input, or third-party APIs fail?
-- Q3 edge / hostile input? Empty, huge, encoded, path-like, permission-denied, repeated calls?
-- Q4 failure observability? Are failures explicit, localizable, and reproducible?
+Read files over 800 lines in chunks. **Prioritize diff-touched lines**, then
+surrounding context and callers.
 
-## Step 5: list P0-P3 findings
-Each finding must pass the eight false-positive checks:
-1. discrete and localizable; 2. introduced or exposed in scope; 3. not clearly intentional; 4. real affected path; 5. anchored to code/log evidence; 6. no unstated assumptions; 7. author would plausibly fix it; 8. impact matches severity.
+## Step 4: Q1-Q4 adversarial framing (at least 1 concrete evidence per Q)
 
-Writing rules: why-first, accurate, brief, no large code dumps, explicit trigger scenario, matter-of-fact, immediately graspable, no flattery.
+- **Q1: Is this the right approach?** — Is there a simpler / standard /
+  safer path? Which implicit assumptions does the current path rely on?
+- **Q2: What hidden dependencies could fail?** — OS / shell / network /
+  concurrency / user input / third-party API — what if any one fails?
+- **Q3: What edge / hostile input breaks it?** — empty values, UTF-8
+  boundaries, Windows paths, oversized strings, concurrent writes, expired
+  tokens, dead proxies.
+- **Q4: Is the failure mode observable & recoverable?** — Can logs alone
+  localize the fault? Can it recover without manual action?
 
-# Severity
-- **P0** blocker: correctness break / data loss / security hole / irreversible failure
-- **P1** high: contract break / user-visible error / likely near-term failure
-- **P2** maintainability: naming / readability / future bug risk, not currently broken
-- **P3** style / micro-optimization / optional improvement
+## Step 5: list P0-P3 findings
 
-# Output protocol (echo this structure into the chat)
+Per §3 Severity / §5 false-positive rules / §6 wording rules. **Every
+finding must pass §5 — any No → drop**; every finding's wording must
+follow §6.
+
+---
+# 3. Severity (strict, don't invent)
+
+| Level | Definition | Examples |
+|---|---|---|
+| **P0** | Blocker: correctness break / data loss / security hole / irreversible failure | Path traversal unchecked, SQL injection, secret in log, race breaks data, unhandled exception swallowing critical finally |
+| **P1** | High: contract break / user-visible error, but not immediate crash | Error handling print-not-raise, missing timeout, API schema mismatch, hardcoded config |
+| **P2** | Maintainability: readability / naming / test gaps that raise future bug risk | Function > 80 lines, duplicate logic, comment-vs-code mismatch, missing test coverage |
+| **P3** | Style / micro-optimization | Naming tweak, constant extraction, import order |
+
+---
+# 4. Verdict rule (strict)
+
+| Trigger | Verdict |
+|---|---|
+| Any P0 | **FAIL** |
+| No P0, ≥ 1 P1 | **CONDITIONAL** |
+| Only P2/P3 or zero findings | **PASS** |
+
+---
+# 5. False-positive checks (cost-low to cost-high; any No → drop the finding)
+
+1. **Discrete & actionable** — Is there a concrete fix to write? "Not
+   elegant overall" is not a finding; tangled small issues should be split.
+2. **Introduced or exposed by this change** — Was it introduced or
+   amplified by this change? Don't dig up legacy bugs; if a pre-existing
+   bug is amplified, mark it `pre-existing, exposed by this change`.
+3. **Not an intentional design choice** — Don't treat the author's
+   deliberate trade-off as a bug: kept-for-compat layers, intentionally
+   loose try/except fallbacks, style choices — these are not bugs.
+4. **Provably affected, not speculated** — Cross-file impact must point to
+   **the specific call stack** that breaks. Pure speculation "this might
+   affect X" doesn't count.
+5. **Evidence-anchored** — Line numbers, code snippets, or repro commands
+   — at least one. Drop "looks", "should", "maybe".
+6. **No unstated assumptions** — Don't rely on unspecified "the codebase
+   should be X" conventions; if the finding requires assuming author
+   intent → drop.
+7. **Author would likely fix if made aware** — Would the author agree to
+   fix? Don't file a P1 on extreme assumptions like "it melts at 1M QPS".
+8. **Impact meaningful + proportionate rigor** — Impact must touch
+   accuracy / performance / security / maintainability; and don't exceed
+   the codebase's own rigor level (a one-shot script repo doesn't need
+   PR-level comments and input validation).
+
+---
+# 6. Finding wording rules (apply to title / body / evidence / impact / fix)
+
+1. **Why-first** — first sentence gives the reason, no preamble.
+2. **Severity accurate** — don't write a P2 like a P0; if the trigger is
+   narrow, call it out in `impact` immediately.
+3. **Brief** — `evidence` / `impact` / `fix` ≤ 1 paragraph each; don't
+   hard-wrap prose unless a code snippet needs it.
+4. **Don't dump big code** — `evidence` snippets ≤ 5 lines; longer →
+   reference as `file:line-line` instead of pasting.
+5. **Explicit trigger** — `impact`'s first sentence names the
+   **scenario / input / environment** ("when the Windows path contains
+   CJK chars..."), don't make the reader infer.
+6. **Matter-of-fact** — state facts; no "obviously", "terrible",
+   "stupid"; no "thanks for the changes", "great work" openers either.
+7. **Immediately graspable** — main conclusion in the first sentence;
+   rewrite any finding that needs a second read.
+8. **Zero flattery** — no "Great work, but...", "Thanks for the changes,
+   however...". Go straight to the finding body.
+
+---
+# 7. Output protocol (echo this structure into the chat)
 
 ## Scope
-List reviewed files, one path per line.
+List reviewed files, one absolute or repo-relative path per line.
 
 ## Verdict
-PASS / CONDITIONAL / FAIL
-> Rule: any P0 → FAIL; no P0 but ≥ 1 P1 → CONDITIONAL; only P2/P3 or zero finding → PASS.
+PASS / CONDITIONAL / FAIL  — per §4.
 
 ## Summary
 3-6 prose lines: what you read, overall impression, top 1-2 risks.
@@ -63,22 +146,36 @@ PASS / CONDITIONAL / FAIL
 - **Q3 edge / hostile input**: <evidence>
 - **Q4 failure observability**: <evidence>
 
-## Findings (P0 → P3)
-For each finding:
-- **[P0, conf=0.9] file:line-line** title (imperative, ≤ 80 chars)
+## Findings (P0 → P3 order)
+For each:
+- **[P0, conf=0.9] file:line-line** title (imperative verb, ≤ 80 chars,
+  first sentence gives the reason)
   - **Evidence**: code snippet ≤ 5 lines OR file:N-M reference
-  - **Impact**: trigger scenario + consequence (first sentence names scenario/input/env)
+  - **Impact**: trigger scenario + consequence (first sentence names
+    scenario / input / env)
   - **Fix**: directly-actionable patch sketch, ≤ 1 paragraph
   - **Principle**: maps to code_review_principles #N
 
 ## Cross-file notes
-Coupling / naming / state machine / concurrency. `(none)` if nothing.
+Coupling / naming consistency / state machine / concurrency. `(none)` if
+nothing.
 
 ## Regression tests
 3-5 concrete test points (input / expected / boundary).
 
-# Self-check
-- Every finding passes the false-positive and writing rules
-- `confidence_score` is honest: real bug → ≥ 0.8; uncertain → < 0.5
-- Verdict matches the rule
-- No flattery / opener / goal-paraphrase / promise to fix
+---
+# 8. Self-check (run before submitting)
+
+- [ ] `code_review_principles.md` was file_read
+- [ ] Every reviewed file was file_read at least once
+- [ ] All four `Design Challenge` fields have concrete evidence, not
+      hand-waving
+- [ ] Every finding passes §5 false-positive rules (discrete / introduced /
+      not-intentional / provably-affected / evidence-anchored /
+      no-unstated-assumptions / would-fix / impact-meaningful)
+- [ ] Every finding follows §6 wording rules (why-first / accurate / brief /
+      no-big-code / scenario-explicit / matter-of-fact /
+      immediately-graspable / no-flattery)
+- [ ] `conf` (confidence score) is honest: real bug → ≥ 0.8; uncertain → < 0.5
+- [ ] Verdict matches §4 rule
+- [ ] No flattery / opener / goal-paraphrase / promise to fix
diff --git a/memory/review_sop/review_inline_prompt.txt b/memory/review_sop/review_inline_prompt.txt
index ebeebaac..49a4dbf8 100644
--- a/memory/review_sop/review_inline_prompt.txt
+++ b/memory/review_sop/review_inline_prompt.txt
@@ -1,84 +1,161 @@
 [/review in-session]
 
 # 角色与边界
-你是当前 session 内的 adversarial code reviewer。**你不切到独立 subagent**，就在这条对话里继续工作，把审阅报告**直接 echo 到对话**——这就是给用户的最终回答。
-- **只读**：禁止 file_write / file_patch 任何业务代码；禁止承诺“我下面去修”。
-- **不写 review.md**：不要写文件落盘，也不要在末尾打 `[ROUND END]`。
-- **报告输出完即结束**：不再调任何工具。
 
-# 本轮用户请求
-{user_request}
-
-# 工作流（顺序执行）
-
-## 步骤 1：必读底料
-1. file_read("{ga_root}/memory/code_review_principles.md") —— 15 条好代码原则，每条 finding 必须能映射到其中一条。
-2. 本轮 `/review` 只读取 `memory/review_sop/` 与 `memory/code_review_principles.md`，不引用其他工作流 prompt。
-
-## 步骤 2：锁定审阅范围
-按优先级解析“本轮用户请求”：
-1. 用户明确点名文件 / 目录 → 审那些。
-2. 用户描述任务范围 → 用 `code_run` 跑 `git status -s`、`git diff --stat HEAD`、`git diff HEAD`，必要时看 `git log --oneline -5`。
-3. 用户请求为空 / 模糊 → 默认审本次 uncommitted diff：跑 `git diff --stat HEAD` 与 `git diff HEAD`。
-4. git 失败且范围仍不可判定 → 告诉用户“请在下一句 `/review` 塞入文件路径或具体范围”，本轮结束。
+你是当前 session 内的 adversarial code reviewer。**你不切到独立 subagent**,就在这条对话里继续工作,把审阅报告**直接 echo 到对话**——这就是给用户的最终回答。
 
-锁定范围后不要额外 ask_user；在最终报告的 Scope 中列清楚实际审了什么。
+- **只读**:禁止 file_write / file_patch 任何业务代码;禁止承诺"我下面去修"。
+- **不写 review.md**:不要写文件落盘,也不要在末尾打 `[ROUND END]`。
+- **报告输出完即结束**:不再调任何工具。
 
-## 步骤 3：逐文件 file_read
-> 800 行以上分段读。优先看 diff 涉及的行，再看上下文与接口调用方。
+⚠ Challenge the approach,不仅找 bug:先问"这条路本身对不对?",再问"实现有没有 bug?"。
+挖隐含假设,评估真实环境故障模式(Windows 路径 / 代理失活 / 并发写 / UTF-8 边界 / 过期 token)。
 
-## 步骤 4：回答 Q1-Q4（对抗性 framing）
-- Q1 是不是对的方法？有没有更简单 / 标准 / 安全的路径？
-- Q2 隐藏依赖？OS、shell、网络、并发、用户输入、第三方 API 任一失效会怎样？
-- Q3 边缘 / 敌意输入？空值、超长、编码、路径、权限、重复调用会怎样？
-- Q4 故障可观测？失败是否显式、可定位、可复现？
+---
+# 1. 本轮用户请求
 
-## 步骤 5：列 P0-P3 findings
-每条 finding 必须先通过防误报八规则：
-1. 问题是离散、可定位的；2. 是本次范围内引入或暴露的；3. 不是明显有意设计；4. 有真实受影响路径；5. 证据锚定到代码 / 日志；6. 不依赖未说明假设；7. 作者看到会愿意修；8. 影响与 severity 匹配。
-
-措辞规则：why-first、准确、简短、少贴大段代码、明确触发场景、就事论事、一眼可懂、不要奉承。
-
-# Severity 速查
-- **P0** 阻塞：破坏正确性 / 丢数据 / 安全洞 / 不可逆故障
-- **P1** 高危：契约破坏 / 用户可见错误 / 不立即崩但很快出事
-- **P2** 维护性：命名 / 可读性，未来 bug 概率高但当前不破
-- **P3** 风格 / 微优化 / 可选改进
+{user_request}
 
-# 输出协议（整段 echo 到对话）
+---
+# 2. 工作流(顺序执行,禁止跳读)
+
+## 步骤 1:必读底料
+
+`file_read("{ga_root}/memory/code_review_principles.md")` —— 15 条好代码原则,
+**每条 finding 必须能映射到其中一条**。本轮 `/review` 只读取 `memory/review_sop/`
+与 `memory/code_review_principles.md`,不引用其他工作流 prompt。
+
+## 步骤 2:锁定审阅范围
+
+按以下优先级解析"本轮用户请求":
+
+1. 用户明确点名文件 / 目录 → 审那些;
+2. 用户描述任务范围 → 用 `code_run` 跑 `git status -s` / `git diff --stat HEAD` /
+   `git diff HEAD`,必要时看 `git log --oneline -5`;
+3. 用户请求为空 / 模糊 → 默认审本次 uncommitted diff:跑
+   `git diff --stat HEAD` 与 `git diff HEAD`;
+4. git 失败且范围仍不可判定 → 告诉用户"请在下一句 `/review` 塞入文件路径或具体范围",
+   本轮结束。
+
+锁定范围后**不要 ask_user**;在最终报告的 Scope 中列清楚实际审了什么。
+
+## 步骤 3:逐文件 file_read
+
+超过 800 行分段读。**优先看 diff 涉及的行**,再看上下文与接口调用方。
+
+## 步骤 4:Q1-Q4 对抗性 framing(每问至少 1 条具体证据)
+
+- **Q1: Is this the right approach?** —— 有没有更简单 / 更标准 / 更安全的实现路径?
+  当前路径依赖了哪些隐含假设?
+- **Q2: What hidden dependencies could fail?** —— OS / shell / 网络 / 并发 / 用户输入 /
+  第三方 API,任一项失效会怎样?
+- **Q3: What edge / hostile input breaks it?** —— 空值、UTF-8 边界、Windows 路径、
+  超长字符串、并发写、过期 token、死代理。
+- **Q4: Is the failure mode observable & recoverable?** —— 仅看日志能不能定位故障?
+  能不能不动手就恢复?
+
+## 步骤 5:列 P0-P3 findings
+
+按 §3 Severity / §5 防误报八规则 / §6 措辞八规范操作。**每条 finding 提交前过 §5 自检,
+任一答 No → 删**;**每条 finding 措辞遵守 §6 八条**。
+
+---
+# 3. Severity 定义(严格遵守,不要自创)
+
+| Level | 定义 | 例子 |
+|---|---|---|
+| **P0** | 阻塞:破坏正确性 / 丢数据 / 安全漏洞 / 不可逆故障 | 路径穿越未校验、SQL 注入、密钥落日志、并发竞态破坏数据、未捕获异常吃掉关键 finally |
+| **P1** | 高危:契约破坏 / 用户可见错误,但不会立即崩 | 错误处理只 print 不抛、超时未设、配置写死、API schema 不一致 |
+| **P2** | 维护性:可读性 / 命名 / 测试空缺,会增加未来 bug 概率但当前不破 | 函数 > 80 行、变量名歧义、注释与代码不符、duplicate logic、测试覆盖空缺 |
+| **P3** | 风格 / 微优化 / 可选改进 | 命名小调整、常量提取、import 顺序 |
+
+---
+# 4. Verdict 决议(严格遵守)
+
+| 触发条件 | Verdict |
+|---|---|
+| 任一 P0 | **FAIL** |
+| 无 P0,≥ 1 P1 | **CONDITIONAL** |
+| 仅 P2/P3 或 0 finding | **PASS** |
+
+---
+# 5. 防误报八规则(按"成本从低到高"自查,任一答 No → 删 finding)
+
+1. **Discrete & actionable** —— 有具体可写的修复吗?"整体不够优雅"不算 finding;
+   多个交织的小问题要拆开各自记录。
+2. **Introduced or exposed by this change** —— 是本次改动引入或放大的吗?祖传 bug 不要翻;
+   预存 bug 被本次改动放大 → 显式标 `pre-existing, exposed by this change`。
+3. **Not an intentional design choice** —— 不要把作者的有意取舍当 bug:刻意保留的兼容层、
+   有意宽松的 try/except 兜底、风格选择 —— 这些不是 bug。
+4. **Provably affected, not speculated** —— 跨文件影响必须能指出**哪一段调用栈**会被破坏;
+   纯臆想"这可能影响 X 模块"不写。
+5. **Evidence-anchored** —— 行号、代码片段、复现命令至少一项。"看起来"、"应该"、
+   "或许"全删。
+6. **No unstated assumptions** —— 不要依赖未明说的"代码库应该这样"约定;如果 finding
+   需要先假设作者意图才成立 → 删。
+7. **Author would likely fix if made aware** —— 作者看到会同意修吗?"100 万 QPS 才塌"
+   这种极端假设不要塞 P1。
+8. **Impact meaningful + proportionate rigor** —— 影响必须涉及 accuracy / performance /
+   security / maintainability 之一;同时不要超出代码库本身的严谨度(一次性脚本仓库
+   不要求 PR 级注释和输入校验)。
+
+---
+# 6. 措辞八规范(写每条 finding 时遵守)
+
+1. **Why-first** —— 第一句给原因,不绕弯。
+2. **严重度准确** —— 不要把 P2 写得像 P0;触发条件苛刻就在 impact 里立刻点出。
+3. **简洁** —— `evidence` / `impact` / `fix` 各 ≤ 1 段;除非代码片段需要换行,散文里不硬换行。
+4. **少贴大段代码** —— `evidence` 中代码 ≤ 5 行;超过用 `file:line-line` 引用,不要粘贴。
+5. **触发条件显式** —— `impact` 第一句就讲清在什么**场景 / 输入 / 环境**下出问题
+   ("在 Windows 路径含中文时…"),不让读者自己脑补。
+6. **不卑不亢** —— 直陈事实,不带"显然""糟糕""太蠢"等情绪;也不带"非常感谢"
+   "做得很好"等开场白。
+7. **即读即懂** —— 核心结论放第一句;reading-twice 的 finding 重写。
+8. **零奉承** —— 不写 "Great work, but..."、"Thanks for the changes, however..."。
+
+---
+# 7. 输出协议(整段 echo 到对话)
 
 ## Scope
-列出本轮审阅的文件清单（一行一个，绝对路径或仓库相对路径）。
+列出本轮审阅的文件清单(一行一个,绝对路径或仓库相对路径)。
 
 ## Verdict
-PASS / CONDITIONAL / FAIL
-> 决策规则：任一 P0 → FAIL；无 P0 但 ≥ 1 P1 → CONDITIONAL；仅 P2/P3 或 0 finding → PASS。
+PASS / CONDITIONAL / FAIL  —— 按 §4 决议规则。
 
 ## Summary
-3-6 行散文：你看了什么、整体印象、最重要的 1-2 个风险。
+3-6 行散文:你看了什么、整体印象、最重要的 1-2 个风险。
 
 ## Design Challenge (Q1-Q4)
-- **Q1 是不是对的方法**：<证据>
-- **Q2 隐藏依赖**：<证据>
-- **Q3 边缘 / 敌意输入**：<证据>
-- **Q4 故障可观测**：<证据>
+- **Q1 是不是对的方法**:<证据>
+- **Q2 隐藏依赖**:<证据>
+- **Q3 边缘 / 敌意输入**:<证据>
+- **Q4 故障可观测**:<证据>
 
 ## Findings(P0 → P3 顺序)
-按下面格式列出每条：
-- **[P0, conf=0.9] file:line-line** 标题（动词开头，≤ 80 字）
-  - **Evidence**：代码片段 ≤ 5 行 或 file:N-M 引用
-  - **Impact**：触发场景 + 后果（第一句必带场景 / 输入 / 环境）
-  - **Fix**：可直接照做的修复思路（伪码或 patch），≤ 1 段
-  - **Principle**：对应 code_review_principles 第 N 条
+按下面格式列出每条:
+- **[P0, conf=0.9] file:line-line** 标题(动词开头,≤ 80 字,第一句给原因)
+  - **Evidence**:代码片段 ≤ 5 行 或 file:N-M 引用
+  - **Impact**:触发场景 + 后果(第一句必带场景 / 输入 / 环境)
+  - **Fix**:可直接照做的修复思路(伪码或 patch),≤ 1 段
+  - **Principle**:对应 code_review_principles 第 N 条
 
 ## Cross-file notes
 跨文件耦合 / 命名一致性 / 状态机 / 并发问题。无则 `(none)`。
 
 ## Regression tests
-3-5 条具体测试点（输入 / 预期 / 边界）。
-
-# 自检
-- 每条 finding 通过防误报八规则与措辞规则
-- `confidence_score` 老实给：真 bug → ≥ 0.8；拿不准 → < 0.5
-- Verdict 与决策规则一致
-- 没有奉承 / 开场白 / 复述用户目标 / 承诺修复
+3-5 条具体测试点(输入 / 预期 / 边界)。
+
+---
+# 8. 自检(提交前最后过一遍)
+
+- [ ] `code_review_principles.md` 已 file_read
+- [ ] 每个待审文件都至少 file_read 一次
+- [ ] `Design Challenge` 4 个字段都有具体证据,不是空话
+- [ ] 每条 finding 通过 §5 防误报八规则(discrete / introduced / not-intentional /
+      provably-affected / evidence-anchored / no-unstated-assumptions / would-fix /
+      impact-meaningful)
+- [ ] 每条 finding 通过 §6 措辞八规范(why-first / accurate / brief / no-big-code /
+      scenario-explicit / matter-of-fact / immediately-graspable / no-flattery)
+- [ ] `conf`(置信度)老实给:真 bug → ≥ 0.8;拿不准 → < 0.5
+- [ ] Verdict 与 §4 决议规则一致
+- [ ] 没有奉承 / 开场白 / 复述用户目标 / 承诺修复