diff --git a/.gitignore b/.gitignore index 1bbf72e4..6aab88e2 100644 --- a/.gitignore +++ b/.gitignore @@ -101,6 +101,9 @@ memory/L4_raw_sessions/* # Code Review Principles !memory/code_review_principles.md +# Review Mode SOP +!memory/review_sop.md + # Visual Studio .vs/ restore_commit.txt diff --git a/frontends/cost_tracker.py b/frontends/cost_tracker.py index 86d900b0..75c7396a 100644 --- a/frontends/cost_tracker.py +++ b/frontends/cost_tracker.py @@ -1,13 +1,14 @@ -"""Per-thread LLM token usage, captured via llmcore monkey-patches. - -`install()` wraps `llmcore._record_usage` (covers all three API modes) and -`llmcore.print` (the `messages` SSE path emits the final `output_tokens` -only via `[Output] tokens=N`, never through `_record_usage`). Tracking is -keyed by `threading.current_thread().name`; each TUI session runs the -agent on a uniquely named thread (`ga-tui-agent-`), so `/cost` is a -thread lookup. +"""Per-thread LLM token usage via llmcore monkey-patches. + +`install()` wraps `llmcore._record_usage` + `llmcore.print` (the SSE +`messages` path only emits final `output_tokens` through `[Output] tokens=N`). +Trackers are keyed by `threading.current_thread().name`; each TUI session +runs its agent on `ga-tui-agent-`, so `/cost` is a thread lookup. + +Subagent processes are out-of-process, so `scan_subagent_logs` parses the +same `[Cache]` / `[Output]` print lines from `temp/*/stdout.log`. """ -import re, threading, time +import glob, os, re, threading, time from dataclasses import dataclass, field @@ -18,8 +19,9 @@ class TokenStats: output: int = 0 cache_create: int = 0 cache_read: int = 0 - # Latest request's effective prompt size — used for the % context-left line. + # Latest single-LLM-call sizes — drive the spinner's `↑ N · ↓ M`. 
last_input: int = 0 + last_output: int = 0 started_at: float = field(default_factory=time.time) def total_input_side(self) -> int: @@ -36,50 +38,69 @@ def elapsed_seconds(self) -> float: return max(0.0, time.time() - self.started_at) -# Best-effort model → context window. `startswith` match; None hides the line. -_CTX_LIMITS: list[tuple[str, int]] = [ - ("claude-sonnet-4-5", 1_000_000), - ("claude-opus-4", 200_000), - ("claude-haiku-4", 200_000), - ("claude-sonnet-4", 200_000), - ("claude-3-5-sonnet", 200_000), - ("claude-3-5-haiku", 200_000), - ("claude-3-7-sonnet", 200_000), - ("claude-3-opus", 200_000), - ("claude-3-haiku", 200_000), - ("claude-3-sonnet", 200_000), - ("gpt-5-pro", 400_000), - ("gpt-5", 256_000), - ("gpt-4o", 128_000), - ("gpt-4-turbo", 128_000), - ("gpt-4", 8_192), - ("o1", 200_000), - ("o3", 200_000), - ("o4", 200_000), - ("gemini-2.5", 2_000_000), - ("gemini-2", 1_000_000), - ("gemini-1.5", 1_000_000), - ("glm-5", 256_000), - ("glm-4", 128_000), - ("qwen", 128_000), - ("deepseek", 64_000), - ("kimi", 200_000), - ("moonshot", 200_000), -] - - -def context_limit_for(model: str | None) -> int | None: - if not model: return None - m = model.lower() - for prefix, limit in _CTX_LIMITS: - if m.startswith(prefix): return limit - return None +# GA's real context budget lives on `BaseSession.context_win` (chars). The +# trim trigger is `context_win * 3` (see llmcore.trim_messages_history), so +# `/cost` compares actual-history chars against that cap for consistent units. +def context_window_chars(backend) -> int: + """`context_win * 3` — the char cap before `trim_messages_history` kicks + in. Reads dynamically so a `mykey.py` override propagates. 
Returns 0 on + bad/missing backend so the caller can hide the row.""" + try: + return int(getattr(backend, 'context_win', 0)) * 3 + except (TypeError, ValueError): + return 0 + + +def current_input_chars(backend) -> int: + """Char-size of the message history (same unit as `trim_messages_history`).""" + try: + import json as _json + history = getattr(backend, 'history', None) or [] + return sum(len(_json.dumps(m, ensure_ascii=False)) for m in history) + except Exception: + return 0 _trackers: dict[str, TokenStats] = {} _lock = threading.Lock() _OUT_RE = re.compile(r'\[Output\]\s+tokens=(\d+)') +_CACHE_RE_NEW = re.compile(r'\[Cache\]\s+input=(\d+)\s+creation=(\d+)\s+read=(\d+)') +_CACHE_RE_OLD = re.compile(r'\[Cache\]\s+input=(\d+)\s+cached=(\d+)') _INSTALLED = False +_SUBAGENT_GLOB = os.path.join("temp", "*", "stdout.log") + + +def scan_subagent_logs(since: float = 0.0, root: str | None = None) -> TokenStats: + """Aggregate subagent tokens from `temp//stdout.log` files; pass + `since=tui_start_time` to scope to this run. Best-effort: bad logs skipped.""" + out = TokenStats() + if since > 0: out.started_at = since + pattern = os.path.join(root, _SUBAGENT_GLOB) if root else _SUBAGENT_GLOB + for p in glob.glob(pattern): + try: + if since and os.path.getmtime(p) < since: continue + with open(p, encoding="utf-8", errors="ignore") as f: + for line in f: + if line.startswith("[Output]"): + m = _OUT_RE.match(line) + if m: + out.output += int(m.group(1)); out.requests += 1 + elif line.startswith("[Cache]"): + # messages → `input=N creation=C read=R` (input excl. cache); + # chat_completions / responses → `input=N cached=R` (input incl. cached). 
+ m = _CACHE_RE_NEW.match(line) + if m: + i, c, r = int(m.group(1)), int(m.group(2)), int(m.group(3)) + out.input += i + out.cache_create += c; out.cache_read += r + continue + m = _CACHE_RE_OLD.match(line) + if m: + i, r = int(m.group(1)), int(m.group(2)) + out.input += max(0, i - r); out.cache_read += r + except OSError: + continue + return out def get(thread_name: str) -> TokenStats: @@ -107,31 +128,32 @@ def install() -> None: orig_record, orig_print = llmcore._record_usage, print def record_patched(usage, api_mode): + # Handles INPUT / CACHE only; OUTPUT comes via `[Output]` print_patched + # below (the SSE path emits it that way; double-counting was the prior bug). try: if usage: t = get(threading.current_thread().name) t.requests += 1 if api_mode == 'messages': - # SSE delivers final output via [Output] print; non-stream - # delivers it here. `output_tokens` in stream message_start - # is a 0–1 placeholder, acceptable noise. inp = int(usage.get('input_tokens', 0) or 0) cc = int(usage.get('cache_creation_input_tokens', 0) or 0) cr = int(usage.get('cache_read_input_tokens', 0) or 0) t.input += inp; t.cache_create += cc; t.cache_read += cr - t.output += int(usage.get('output_tokens', 0) or 0) + # Non-stream `messages` skips the [Output] print, so count + # output_tokens here; SSE message_start carries a 1-token + # placeholder to skip. 
+ out = int(usage.get('output_tokens', 0) or 0) + if out > 1: t.output += out; t.last_output = out t.last_input = inp + cc + cr elif api_mode == 'chat_completions': cached = int((usage.get('prompt_tokens_details') or {}).get('cached_tokens', 0) or 0) inp = int(usage.get('prompt_tokens', 0) or 0) - cached t.input += inp; t.cache_read += cached - t.output += int(usage.get('completion_tokens', 0) or 0) t.last_input = inp + cached elif api_mode == 'responses': cached = int((usage.get('input_tokens_details') or {}).get('cached_tokens', 0) or 0) inp = int(usage.get('input_tokens', 0) or 0) - cached t.input += inp; t.cache_read += cached - t.output += int(usage.get('output_tokens', 0) or 0) t.last_input = inp + cached except Exception: pass return orig_record(usage, api_mode) @@ -141,7 +163,10 @@ def print_patched(*args, **kwargs): try: if args and isinstance(args[0], str): m = _OUT_RE.match(args[0]) - if m: get(threading.current_thread().name).output += int(m.group(1)) + if m: + t = get(threading.current_thread().name) + n = int(m.group(1)) + t.output += n; t.last_output = n except Exception: pass return orig_print(*args, **kwargs) llmcore.print = print_patched diff --git a/frontends/plan_state.py b/frontends/plan_state.py new file mode 100644 index 00000000..c93fb42e --- /dev/null +++ b/frontends/plan_state.py @@ -0,0 +1,180 @@ +"""Plan / todo state — pure stdlib, no UI framework dependency. + +API: + extract(text) → [(content, "open"|"done"), …] + is_active(agent, messages=None) → plan mode on (stash OR per-session msg ref) + resolve_path(agent, messages=None) → live plan.md path (or None) + find_path_in_messages(messages) → most recent plan.md path mentioned + current_step(messages) → latest `当前步骤:…` snippet (or "") + summary(items) → (n_done, n_total) + is_complete(items) → all done (or empty) + +Supported task-line shapes (all matched by `extract`): + - [ ] foo ← bullet + open + - [x] foo ← bullet + done + 1. [✓] foo ← numbered + done + 2. 
[✓ 2026-05-16] foo ← numbered + timestamped done, content after bracket + 3. [✓ 已生成: foo] ← numbered + done with description *inside* bracket + 4. [D][P] foo ← two marker groups (delegate + parallel), still open + 5. [D] foo ← non-standard marker "D" → open (not done) +""" +from __future__ import annotations +import os, re +from typing import Optional + +_DONE_CHARS = set("xX✓✔√☑") +# Newline-insert before a bullet stuck to JSON debris (`{"content": "- [ ] …`). +_GLUE_RE = re.compile(r"(? str: + return _MD_EMPHASIS_RE.sub(lambda m: next(g for g in m.groups() if g is not None), s) + + +def _has_done_glyph(marker: str) -> bool: + return any(c in _DONE_CHARS for c in marker) + + +def extract(text: str) -> list[tuple[str, str]]: + if not text: return [] + norm = text.replace("\\n", "\n") if "\\n" in text else text + norm = _GLUE_RE.sub(r"\n\1", norm) + found: dict[str, str] = {} + for line in norm.splitlines(): + head = _BULLET_RE.match(line) + if not head: continue + rest = line[head.end():] + groups: list[str] = [] + # Consume any number of consecutive `[...]` groups — covers `[D][P]` + # task-type chains as well as the plain `[ ]` / `[x]` single form. + while True: + b = _BRACKET_RE.match(rest) + if not b: break + groups.append(b.group(1)) + rest = rest[b.end():] + if not groups: continue + is_done = any(_has_done_glyph(g) for g in groups) + inline = rest.strip() + if inline: + content = inline + elif is_done: + # `[✓ description]` shape — description lives inside the bracket + # next to the glyph. Strip the glyph + optional timestamp. + done_g = next(g for g in groups if _has_done_glyph(g)) + content = _INLINE_STRIP_RE.sub("", done_g).strip() + else: + continue + k = _strip_md(_DEBRIS_RE.sub("", content).strip()) + if not k: continue + status = "done" if is_done else "open" + # Same content seen twice — done wins over open. 
+ if k not in found or status == "done": + found[k] = status + return list(found.items()) + + +def _stashed_plan_path(agent) -> str: + # First non-empty `working['in_plan_mode']` from (handler, agent). + for src in (getattr(agent, "handler", None), agent): + p = ((getattr(src, "working", None) or {}).get("in_plan_mode") or "").strip() + if p: return p + return "" + + +def _resolve_stashed(p: str) -> Optional[str]: + if not p: return None + rel = p.lstrip("./\\") + cwd = os.getcwd() + for c in (p, os.path.join(cwd, "temp", rel), os.path.join(cwd, rel)): + if os.path.isfile(c) and os.path.getsize(c) > 0: return c + return None + + +# Strict per-session discovery — scan this session's own messages only. +_PATH_RE = re.compile(r"""((?:\.\/)?(?:temp\/)?plan_[A-Za-z0-9_\-]+\/plan\.md)""") + + +def _slice(messages, start_idx: int): + if not messages: return [] + if start_idx <= 0: return list(messages) + return list(messages)[start_idx:] + + +def find_path_in_messages(messages, start_idx: int = 0) -> Optional[str]: + """Latest existing `plan_XXX/plan.md` referenced after `start_idx`. + Items can be `ChatMessage`-like (`.content`) or plain strings; + only paths that exist on disk are returned.""" + sliced = _slice(messages, start_idx) + if not sliced: return None + for m in reversed(sliced): + text = getattr(m, "content", None) + if text is None: text = m if isinstance(m, str) else "" + if not text or "plan.md" not in text: continue + for hit in reversed(_PATH_RE.findall(text)): + p = _resolve_stashed(hit.strip().strip("\"'")) + if p: return p + return None + + +# Prefer concise `` narrative over the long plan-item echo; +# treat `❌ 当前步骤:` as "step done", not "current step". 
+_SUMMARY_STEP_RE = re.compile( + r"[^<]*?当前步骤[::]\s*([^<\n]{1,160})", re.DOTALL) +_STEP_RE = re.compile(r"📌\s*当前步骤[::]\s*([^\n。!!??]{1,160})") +_DONE_STEP_RE = re.compile(r"❌\s*当前步骤[::]") + + +def current_step(messages, start_idx: int = 0, max_len: int = 60) -> str: + """Latest `当前步骤:…` snippet; `` form preferred, `❌`-prefixed + skipped. Trimmed to `max_len` chars so it fits the 5-row plan card.""" + sliced = _slice(messages, start_idx) + if not sliced: return "" + + def _clean(s: str) -> str: + return _strip_md(re.sub(r"\s+", " ", s).strip().rstrip(" ::—-")) + + def _cap(s: str) -> str: + s = _clean(s) + if len(s) <= max_len: return s + return s[:max_len - 1].rstrip() + "…" + + for m in reversed(sliced): + text = getattr(m, "content", None) + if text is None: text = m if isinstance(m, str) else "" + if not text or "当前步骤" not in text: continue + hits = _SUMMARY_STEP_RE.findall(text) + if hits: return _cap(hits[-1]) + for raw in reversed(_STEP_RE.findall(text)): + if _DONE_STEP_RE.search(raw): continue + return _cap(raw) + return "" + + +def is_active(agent, messages=None, start_idx: int = 0) -> bool: + """Plan mode is on. Primary: `working['in_plan_mode']`. 
Fallback: + a `plan_*/plan.md` referenced in this session's messages (no global scan).""" + if _stashed_plan_path(agent): return True + return find_path_in_messages(messages, start_idx) is not None + + +def resolve_path(agent, messages=None, start_idx: int = 0) -> Optional[str]: + p = _resolve_stashed(_stashed_plan_path(agent)) + if p: return p + return find_path_in_messages(messages, start_idx) + + +def summary(items: list[tuple[str, str]]) -> tuple[int, int]: + return sum(1 for _, st in items if st == "done"), len(items) + + +def is_complete(items: list[tuple[str, str]]) -> bool: + return not items or all(st == "done" for _, st in items) diff --git a/frontends/tuiapp.py b/frontends/tuiapp.py index 07345d2c..235134bc 100644 --- a/frontends/tuiapp.py +++ b/frontends/tuiapp.py @@ -86,8 +86,11 @@ def stash(match: re.Match[str]) -> str: placeholders.append(match.group(0)) return f"\x00PH{len(placeholders) - 1}\x00" - safe = re.sub(r"`{4,}.*?`{4,}", stash, text, flags=re.DOTALL) - safe = re.sub(r"`{4,}[^`].*$", stash, safe, flags=re.DOTALL) + # Line-anchored fence matcher — see tuiapp_v2.fold_turns for rationale. + # Unanchored variant mis-paired backticks embedded in file_read output + # with later real fences, swallowing turn markers and ballooning the + # final "text" segment to MBs (1.85s markdown render on /continue). 
+ safe = re.sub(r"^`{4,}.*?^`{4,}\n?", stash, text, flags=re.DOTALL | re.MULTILINE) parts = re.split(r"(\**LLM Running \(Turn \d+\) \.\.\.\**)", safe) def restore(part: str) -> str: diff --git a/frontends/tuiapp_v2.py b/frontends/tuiapp_v2.py index bfe072c9..2a8a78be 100644 --- a/frontends/tuiapp_v2.py +++ b/frontends/tuiapp_v2.py @@ -142,23 +142,26 @@ def _hint_terminal_capabilities() -> None: "Tip: /export clip 把上一条回复复制到剪贴板;/export all 给出完整日志路径。", "Tip: /branch [name] 从当前历史分裂出新会话,互不污染。", "Tip: ask_user 题目里写 [多选] 自动切到 SelectionList;任何 picker 都有 \"Type something\" 走自由输入。", - "Tip: plan 模式下的 todo 会自动渲染到顶部的 📋 Plan 面板,全部完成后自动消失。", + "Tip: plan 模式下的 todo 会渲染在消息区与输入框之间的 📋 Plan 卡片,完成后自动消失。", ) -def _random_tip() -> str: +def _random_tip(exclude: str = "") -> str: + """Pick a tip distinct from `exclude` so rotation doesn't repeat.""" import random - return random.choice(_TIPS) + pool = [t for t in _TIPS if t != exclude] or list(_TIPS) + return random.choice(pool) -def _tip_line(): - """Render `└ Tip: …` as a styled Rich Text. Used directly in compose() - so the first paint already includes the line — no post-mount race.""" +def _tip_line(text: str = ""): + """`└ Tip: …` as styled Rich Text; empty `text` → blank pulse line.""" from rich.text import Text as _T t = _T() + if not text: + return t t.append("└ ", style="#6e7681") t.append("Tip: ", style="bold #6e7681") - t.append(_random_tip().removeprefix("Tip: "), style="#6e7681") + t.append(text.removeprefix("Tip: "), style="#6e7681") return t # Defensive cleaners for ask_user candidates. The model occasionally smuggles @@ -239,8 +242,9 @@ def fold_turns(text: str) -> list[dict]: def stash(m): placeholders.append(m.group(0)) return f"\x00PH{len(placeholders) - 1}\x00" - safe = re.sub(r"`{4,}.*?`{4,}", stash, text, flags=re.DOTALL) - safe = re.sub(r"`{4,}[^`].*$", stash, safe, flags=re.DOTALL) + # Line-anchored so backticks embedded in tool output (e.g. 
`N|\`\`\`\`` + # gutter from file_read) don't pair with later real fences. + safe = re.sub(r"^`{4,}.*?^`{4,}\n?", stash, text, flags=re.DOTALL | re.MULTILINE) parts = re.split(r"(\**LLM Running \(Turn \d+\) \.\.\.\**)", safe) parts = [re.sub(r"\x00PH(\d+)\x00", lambda m: placeholders[int(m.group(1))], p) for p in parts] if len(parts) < 4: @@ -592,6 +596,7 @@ def _align_md_renders(narrow_raw: str, wide_raw: str): import chatapp_common # noqa: F401 from chatapp_common import format_restore from btw_cmd import handle_frontend_command as btw_handle +from review_cmd import handle as review_handle from continue_cmd import list_sessions as continue_list, extract_ui_messages as continue_extract from export_cmd import last_assistant_text, export_to_temp, wrap_for_clipboard @@ -801,6 +806,20 @@ def _palette_from_resolved_vars(v: dict[str, str], dark: bool) -> dict[str, str] scrollbar-color-active: $ga-dim; } +/* Plan/todo panel — fixed 5-row card between messages and composer. + `display: none` default so the empty post-compose frame doesn't flash; + renderer flips it on once items materialize. Fixed height (no scroll) + keeps layout stable; body truncates to 4 items + "+N more" footer. */ +#planbar { + display: none; + height: 5; + max-height: 5; + background: $ga-sel-bg; + padding: 0 1; + margin: 0 0 1 0; + border-left: thick $ga-green; +} + /* `└ Tip:` footer — one dim row, never grows. */ #tipbar { height: 1; @@ -924,16 +943,14 @@ class ChatMessage: _segment_widgets: list = field(default_factory=list, repr=False) _segment_sig: tuple = field(default=(), repr=False) _spinner_widget: Any = field(default=None, repr=False) - # Wall-clock start of streaming for this assistant turn — drives the spinner's - # `(Xm Ys · ↑ N.Nk · gerund...)` annotation. Set on first stream chunk. + # Stream start + token baselines so the spinner shows *this turn's* deltas. 
_stream_started_at: Optional[float] = field(default=None, repr=False) - # Token snapshot captured at stream start so the spinner can show *this turn's* - # input cost rather than the lifetime cumulative. _stream_baseline_input: int = field(default=0, repr=False) - # Per-segment rendered-Text cache keyed by (seg_content_hash, width). Survives - # fold-toggle because toggling visibility doesn't mutate any segment's content, - # so re-rendering the same Markdown twice is wasted work — this turns a ~60ms - # remount into a <5ms widget-rebuild even on long multi-turn messages. + _stream_baseline_output: int = field(default=0, repr=False) + # Frozen `(elapsed, last_in, last_out)` at done→True; keeps the post-turn + # card from ticking when the next turn shifts cost_tracker deltas. + _done_summary: Optional[tuple] = field(default=None, repr=False) + # Per-(seg_hash, width) Text cache; survives fold-toggle re-mounts. _seg_render_cache: dict = field(default_factory=dict, repr=False) @@ -954,19 +971,22 @@ class AgentSession: input_pastes: dict[int, str] = field(default_factory=dict) input_paste_counter: int = 0 buffer: str = "" - # Lazy-initialized in `_refresh_topbar` the first tick `status == "running"` - # is observed. Drives the topbar dot's heat-color ramp and the elapsed label. + # Drives topbar heat-color ramp + elapsed label; set on first running tick. _busy_since: Optional[float] = None - # When a run transitions running→idle we briefly flash the dot green; this - # holds the timestamp of that transition so the flash decays after ~5s. + # Stamps running→idle; topbar dot flashes green for ~5s after. _done_at: Optional[float] = None - # ask_user INTERRUPT events captured by the per-agent turn_end hook. - # Drained by the display thread when the assistant turn marks done. + # ask_user INTERRUPT events; drained by display thread on turn done. 
ask_user_events: Any = field(default_factory=lambda: queue.Queue()) - # Set to {question: str} after user picks the free-text option in an - # ask_user picker. The next user submission gets intercepted into a - # 2-step `Ready to submit your answer?` confirmation. + # Pending `{question:str}` after the user picks free-text in an ask_user + # picker; next submission becomes a 2-step "Ready to submit?" confirm. free_text_pending: Optional[dict] = None + # Plan state: items + grace-period timers (3s farewell, 1.5s lost-grace). + plan_items: list = field(default_factory=list) + plan_complete_since: Optional[float] = None + plan_lost_since: Optional[float] = None + # Boundary between restored history (≤ idx) and this run (> idx); + # `/continue` bumps to `len(messages)` so old plan cards don't resurrect. + plan_scan_baseline: int = 0 def default_agent_factory() -> Any: @@ -991,6 +1011,7 @@ def default_agent_factory() -> Any: ("/stop", "", "中止当前任务"), ("/llm", "[n]", "查看 / 切换模型"), ("/btw", "", "side question — 不打断主 agent"), + ("/review", "[request]", "in-session 代码审查(直接输出报告)"), ("/continue", "[n|name]", "列出 / 恢复历史会话"), ("/cost", "[all]", "显示当前会话 token 用量(all = 所有会话)"), ("/export", "clip||all", "导出最后回复"), @@ -1362,9 +1383,29 @@ async def _on_paste(self, event: events.Paste) -> None: return if self._paste_file_from_clipboard(): event.stop(); event.prevent_default(); return + # Git-bash / mintty fallback: PIL.ImageGrab can't return Image objects + # in that TTY env, but the OS clipboard does hold the file path the + # screenshot tool wrote. Treat a single-line, on-disk path as if the + # file grab had succeeded — same placeholder + `_pastes` entry. 
+ if self._paste_file_from_text(event.text): + event.stop(); event.prevent_default(); return self._insert_paste_text(event.text) event.stop(); event.prevent_default() + def _paste_file_from_text(self, raw: str) -> bool: + if not raw: return False + path = raw.strip().strip('"').strip("'") + if "\n" in path or "\r" in path: return False + if len(path) > 1024: return False + if not os.path.isfile(path): return False + is_image = os.path.splitext(path)[1].lower() in _IMAGE_EXTS + self._paste_counter += 1 + sid = self._paste_counter + self._pastes[sid] = path + marker = f"[Image #{sid}]" if is_image else f"[File #{sid}]" + self._insert_via_keyboard(marker) + return True + async def _on_key(self, event: events.Key) -> None: # 1) command palette routing try: @@ -1799,6 +1840,10 @@ def __init__(self, agent_factory: Optional[AgentFactory] = None) -> None: self.agent_factory: AgentFactory = agent_factory or default_agent_factory self.sessions: dict[int, AgentSession] = {} self.current_id: Optional[int] = None + # Wall-clock marker used by `/cost` to scope subagent log scans to + # logs touched after the TUI started — pre-launch leftovers shouldn't + # bleed into "this run's" total. 
+ self._started_at: float = time.time() self._ids = count(1) self._suppress_palette_open = False self.fold_mode: bool = True @@ -1834,17 +1879,21 @@ def __init__(self, agent_factory: Optional[AgentFactory] = None) -> None: "rename": self._cmd_rename, "branch": self._cmd_branch, "rewind": self._cmd_rewind, "clear": self._cmd_clear, "stop": self._cmd_stop, "llm": self._cmd_llm, "export": self._cmd_export, - "restore": self._cmd_restore, "btw": self._cmd_btw, "continue": self._cmd_continue, - "cost": self._cmd_cost, + "restore": self._cmd_restore, "btw": self._cmd_btw, "review": self._cmd_review, + "continue": self._cmd_continue, "cost": self._cmd_cost, "quit": self._cmd_quit, "exit": self._cmd_quit, } try: import cost_tracker; cost_tracker.install() except Exception: pass - # Best-effort: drop session_names entries whose log was rotated away - # (e.g. month-old logs the user deleted). Keeps the registry tidy so - # `/continue ` never resolves to a vanished file. + # Patch GenericAgent for /review in case chatapp_common didn't wire it. + try: + from agentmain import GenericAgent as _GA + import review_cmd; review_cmd.install(_GA) + except Exception: + pass + # Drop session_names entries pointing at rotated-away logs. try: import session_names; session_names.gc() except Exception: @@ -1856,6 +1905,7 @@ def compose(self) -> ComposeResult: yield Static("", id="sidebar") with Vertical(id="main"): yield VerticalScroll(id="messages") + yield Static("", id="planbar") yield OptionList(id="palette") yield InputArea( "", @@ -1869,15 +1919,19 @@ def compose(self) -> ComposeResult: # Tip line sits inside #main so it doesn't compete for height # with #body's 1fr. Content set at compose so the first frame # already shows it. - yield Static(_tip_line(), id="tipbar") + yield Static(_tip_line(_random_tip()), id="tipbar") yield Static(render_bottombar(), id="bottombar") def on_mount(self) -> None: self.add_session("main") self._system("Welcome to GenericAgent TUI. 
按 / 唤起命令面板,Ctrl+N 新建会话。") + # CSS `#planbar { display: none }` keeps it hidden by default — + # the renderer flips it on once items materialize. self.query_one("#input", InputArea).focus() self.set_interval(0.5, self._tick) self._patch_auto_scroll_for_selection() + self._start_plan_watcher() + self._start_tip_rotator() self._apply_responsive_layout() # Disable alternate scroll mode (?1007). Textual enables ?1006 SGR mouse but doesn't # turn off ?1007, which on macOS Terminal / iTerm2 makes the wheel emit both mouse @@ -2837,6 +2891,46 @@ def worker(): threading.Thread(target=worker, daemon=True, name="ga-tui-btw").start() + def _cmd_review(self, args, raw): + """`/review` via TUI's streaming path; the TUI intercepts slash commands + before `review_cmd.install`'s patch, so we render the prompt via + `review_cmd.handle` and submit it as a normal task with `/review ...` + kept as the visible user message.""" + body = (raw or "").strip() + if body == "/review": + body = "" + elif body.startswith("/review ") or body.startswith("/review\t"): + body = body[len("/review"):].strip() + else: + body = " ".join(args).strip() + sess = self.current + if body in ("help", "?", "-h", "--help"): + try: + dq = queue.Queue() + rendered = review_handle(sess.agent, body, dq) + try: + item = dq.get_nowait() + self._system(str(item.get("done") or "")) + except queue.Empty: + if rendered: + self._system(rendered) + except Exception as e: + self._system(f"❌ /review help 失败: {type(e).__name__}: {e}") + return + if sess.status == "running": + self._system(f"#{sess.agent_id} 正在跑,/stop 后再发。") + return + try: + prompt = review_handle(sess.agent, body, queue.Queue()) + except Exception as e: + self._system(f"❌ /review 初始化失败: {type(e).__name__}: {e}") + return + if not prompt: + self._system("❌ /review 未生成审查提示。") + return + display_text = raw.strip() if (raw or "").strip() else "/review" + self.submit_user_message(prompt, display_text=display_text) + def _cmd_continue(self, args, raw): sess = 
self.current m = re.match(r"/continue\s+(\S.*?)\s*$", (raw or "").strip()) @@ -2908,8 +3002,18 @@ def _do_continue_restore(self, path: str) -> str: pass def _finish(): sess.messages.clear() + # Plan state belongs to the *previous* conversation. Clearing it + # along with messages stops the planbar from leaking stale items + # (`Plan (3/7)` from #4 qxs) into the freshly-restored session. + sess.plan_items = [] + sess.plan_complete_since = None + sess.plan_lost_since = None + self._plan_mtime.pop(sess.agent_id, None) for h in continue_extract(path): sess.messages.append(ChatMessage(role=h["role"], content=h["content"])) + # Baseline past restored history so the scanner ignores the prior + # session's plan.md; only re-shows on a fresh enter_plan_mode. + sess.plan_scan_baseline = len(sess.messages) try: import session_names nm = session_names.name_for(path) @@ -2966,21 +3070,43 @@ def _section(sid: int, sess, t) -> list[str]: f"{_k(t.cache_create)} created · " f"{t.cache_hit_rate():.1f}% hit" ) - ctx = cost_tracker.context_limit_for(model) - if ctx and t.last_input > 0: - used = t.last_input - pct_left = max(0.0, (ctx - used) / ctx * 100.0) + try: backend = sess.agent.llmclient.backend + except Exception: backend = None + cap = cost_tracker.context_window_chars(backend) if backend else 0 + used = cost_tracker.current_input_chars(backend) if backend else 0 + if cap > 0: + pct_left = max(0.0, (cap - used) / cap * 100.0) ls.append( f" Context window: {pct_left:>5.0f}% left " - f"({_k(used)} used / {_k(ctx)})" + f"({_k(used)} chars used / {_k(cap)} cap)" ) ls.append(f" Requests: {t.requests:>7}") return ls + # Scope subagent logs to this TUI run so prior-session logs don't bleed in. 
+ try: sub = cost_tracker.scan_subagent_logs(since=getattr(self, "_started_at", 0.0)) + except Exception: sub = None + + def _sub_section() -> list[str]: + if not sub or sub.total_tokens() == 0: return [] + ls = ["", f"subagents (扫描 temp/*/stdout.log)"] + ls.append( + f" Token usage: {_k(sub.total_tokens()):>7} total " + f"({_k(sub.total_input_side())} input + {_k(sub.output)} output)" + ) + if sub.cache_read or sub.cache_create: + ls.append( + f" Cache: {_k(sub.cache_read):>7} read · " + f"{_k(sub.cache_create)} created · " + f"{sub.cache_hit_rate():.1f}% hit" + ) + ls.append(f" Requests: {sub.requests:>7}") + return ls + lines: list[str] = [] if show_all: trackers = cost_tracker.all_trackers() - if not trackers: + if not trackers and not (sub and sub.total_tokens()): lines = ["✦ Token usage", " (尚无任何 LLM 调用记录)"] else: # Resolve each thread back to a session if we still know it; otherwise @@ -3004,12 +3130,14 @@ def _section(sid: int, sess, t) -> list[str]: f"({_k(t.total_input_side())} input + {_k(t.output)} output)" ) lines.append(f" Requests: {t.requests:>7}") + lines += _sub_section() else: sess = self.current tname = sess.thread.name if sess.thread else f"ga-tui-agent-{sess.agent_id}" t = cost_tracker.get(tname) lines.append("✦ Token usage") lines += _section(sess.agent_id, sess, t) + lines += _sub_section() self._system("\n".join(lines)) def _cmd_export(self, args, raw): @@ -3122,12 +3250,9 @@ def on_unmount(self) -> None: self._reset_terminal_title() # ---------------- agent task + stream ---------------- - def submit_user_message(self, text: str, images: Optional[list[str]] = None) -> int: + def submit_user_message(self, text: str, images: Optional[list[str]] = None, display_text: Optional[str] = None) -> int: sess = self.current - # Free-text ask_user interception: route through the 2-step - # `Ready to submit your answer?` confirmation card before letting - # the agent see the answer. 
Only triggers when the picker armed - # `sess.free_text_pending`; the rest of the submit path is unchanged. + # Free-text ask_user answers go through a 2-step submit-confirm card. if self._maybe_intercept_free_text(sess, text): return -1 if sess.status == "running": @@ -3139,7 +3264,8 @@ def submit_user_message(self, text: str, images: Optional[list[str]] = None) -> sess.buffer = "" sess.status = "running" image_paths = list(images or []) - sess.messages.append(ChatMessage("user", text, image_paths=image_paths)) + visible_text = text if display_text is None else display_text + sess.messages.append(ChatMessage("user", visible_text, image_paths=image_paths)) sess.messages.append(ChatMessage("assistant", "", task_id=tid, done=False)) self._refresh_all() try: @@ -3183,12 +3309,12 @@ def _on_stream(self, agent_id, task_id, text, done): s.status = "idle" s.current_display_queue = None self._update_assistant(agent_id, text, task_id=task_id, done=done, refresh_chrome=True) + # End-of-turn re-parse only; mid-stream `[...]` fragments would flash. if done: + self._update_plan_state(s, text) self._drain_ask_user_events(s) - # `[多选]` / `[multi]` / `select all` in the question switches the picker to - # a multi-select widget. The flag is intentionally heuristic so existing - # ask_user calls (no schema change in core) can opt in by phrasing alone. + # Phrasing-based opt-in for multi-select picker (no core schema change). 
_MULTI_RE = re.compile(r"\[?(?:多选|multi(?:[-_ ]?select)?|select all)\]?", re.IGNORECASE) def _drain_ask_user_events(self, sess: AgentSession) -> None: @@ -3379,6 +3505,172 @@ def _update_assistant(self, agent_id, text, *, task_id=None, done=True, refresh_ self._refresh_topbar() self._ensure_spinner() + # ---------------- Plan/todo panel ---------------- + # State machine (graces absorb mid-stream parse misses / let final tally read): + # hidden → active(n_done/n_total) → complete(n/n) → [3s grace] → hidden + # active/complete → empty → [1.5s grace] → hidden + _PLAN_GRACE_SEC = 3.0 + _PLAN_LOST_GRACE_SEC = 1.5 + + def _update_plan_state(self, sess: AgentSession, _stream_text: str = "") -> None: + import plan_state + prev = sess.plan_items + # Detect plan mode: `working['in_plan_mode']` first, fallback to per- + # session message scan for a `plan_*/plan.md` reference. Strictly + # per-session via `plan_scan_baseline` to avoid /continue bleed. + new_items: list = [] + msgs = sess.messages + base = sess.plan_scan_baseline + if plan_state.is_active(sess.agent, messages=msgs, start_idx=base): + path = plan_state.resolve_path(sess.agent, messages=msgs, start_idx=base) + if path: + try: + with open(path, encoding="utf-8", errors="replace") as f: + new_items = plan_state.extract(f.read()) + except OSError: + new_items = [] + now_c = plan_state.is_complete(new_items) and new_items + was_c = plan_state.is_complete(prev) and prev + if now_c and not was_c: sess.plan_complete_since = time.time() + elif not now_c: sess.plan_complete_since = None + if not new_items and prev: + sess.plan_lost_since = time.time() + elif new_items: + sess.plan_lost_since = None + sess.plan_items = new_items + if sess.agent_id == self.current_id: + self._refresh_planbar() + + def _refresh_planbar(self) -> None: + try: bar = self.query_one("#planbar", Static) + except Exception: return + sess = self.sessions.get(self.current_id) if self.current_id is not None else None + items = sess.plan_items if sess 
else [] + if sess and sess.plan_lost_since is not None: + if time.time() - sess.plan_lost_since >= self._PLAN_LOST_GRACE_SEC: + sess.plan_items = []; sess.plan_lost_since = None; items = [] + import plan_state + msgs = sess.messages if sess else None + base = sess.plan_scan_baseline if sess else 0 + # Plan-mode armed but no items yet → placeholder (covers the + # enter_plan_mode → first plan.md write gap). + if not items: + if sess and plan_state.is_active(sess.agent, messages=msgs, start_idx=base): + self._render_planbar_placeholder(bar, sess) + return + self._set_planbar_visible(bar, False); return + n_done, n_total = plan_state.summary(items) + complete = plan_state.is_complete(items) + if complete and sess and sess.plan_complete_since is not None: + if time.time() - sess.plan_complete_since >= self._PLAN_GRACE_SEC: + self._set_planbar_visible(bar, False); return + # 5-row budget: header(1) + step(0/1) + tasks(N) + overflow(0/1). + step = plan_state.current_step(msgs, start_idx=base) + budget = 4 - (1 if step else 0) + ordered = [(c, st) for c, st in items if st != "done"] + \ + [(c, st) for c, st in items if st == "done"] + body_lines = budget - 1 if len(ordered) > budget else budget + shown = ordered[:body_lines] + overflow = max(0, len(ordered) - body_lines) + sig = (tuple(shown), overflow, step, bool(complete and sess and sess.plan_complete_since)) + if getattr(bar, "_plan_sig", None) == sig and bar.display: return + bar._plan_sig = sig + body = Text() + head = f"✓ Plan complete ({n_total}/{n_total})\n" if complete else f"📋 Plan ({n_done}/{n_total})\n" + body.append(head, style=f"bold {C_GREEN}") + if step: + body.append(" ▸ ", style=C_GREEN) + body.append(step[:120] + "\n", style=C_MUTED) + for c, st in shown: + if st == "done": body.append(" ✔ ", style=C_GREEN); body.append(c + "\n", style=C_DIM) + else: body.append(" ☐ ", style=C_DIM); body.append(c + "\n", style=C_FG) + if overflow: + body.append(f" ⋮ +{overflow} more", style=C_DIM) + bar.update(body) + 
self._set_planbar_visible(bar, True) + + def _render_planbar_placeholder(self, bar: Static, sess: AgentSession) -> None: + # Placeholder for armed-but-empty plan mode (pre-first plan.md write). + import plan_state + base = sess.plan_scan_baseline + path = (plan_state._stashed_plan_path(sess.agent) + or plan_state.find_path_in_messages(sess.messages, start_idx=base) + or "") + hint = "/".join(path.replace("\\", "/").rstrip("/").split("/")[-2:]) if path else "plan.md" + step = plan_state.current_step(sess.messages, start_idx=base) + sig = ("__placeholder__", hint, step) + if getattr(bar, "_plan_sig", None) == sig and bar.display: return + bar._plan_sig = sig + body = Text() + body.append("📋 Plan 模式已激活\n", style=f"bold {C_GREEN}") + if step: + body.append(" ▸ ", style=C_GREEN) + body.append(step[:120] + "\n", style=C_MUTED) + body.append(f" 等待写入 {hint} …", style=C_DIM) + bar.update(body) + self._set_planbar_visible(bar, True) + + def _set_planbar_visible(self, bar: Static, visible: bool) -> None: + # Repaint only on show→hide transition; idle ticks no-op. + if not visible: + if not bar.display: return + bar.display = False + bar.update(Text()) + bar._plan_sig = None + return + if not bar.display: bar.display = True + + def _start_plan_watcher(self) -> None: + if getattr(self, "_plan_timer", None) is not None: return + self._plan_mtime: dict = {} + try: self._plan_timer = self.set_interval(1.0, self._poll_plan_files) + except Exception: pass + + def _poll_plan_files(self) -> None: + # Poll only the visible session — background sessions don't paint planbar. 
+ import plan_state + sess = self.sessions.get(self.current_id) if self.current_id is not None else None + if sess is None: return + msgs = sess.messages + base = sess.plan_scan_baseline + if not plan_state.is_active(sess.agent, messages=msgs, start_idx=base): + self._refresh_planbar(); return + path = plan_state.resolve_path(sess.agent, messages=msgs, start_idx=base) + if not path: + self._refresh_planbar(); return + try: mtime = os.path.getmtime(path) + except OSError: + self._refresh_planbar(); return + if self._plan_mtime.get(sess.agent_id) != mtime: + self._plan_mtime[sess.agent_id] = mtime + self._update_plan_state(sess); return + self._refresh_planbar() # tick grace timers + + # ---------------- Tip rotation ---------------- + # 12s show → 1s blank → next tip. + _TIP_SHOW_SEC = 12.0 + _TIP_BLANK_SEC = 1.0 + + def _start_tip_rotator(self) -> None: + if getattr(self, "_tip_timer", None) is not None: return + self._tip_current: str = "" + try: self._tip_timer = self.set_interval(self._TIP_SHOW_SEC, self._rotate_tip) + except Exception: pass + + def _rotate_tip(self) -> None: + try: bar = self.query_one("#tipbar", Static) + except Exception: return + bar.update(_tip_line("")) # blank pulse + nxt = _random_tip(exclude=self._tip_current) + self._tip_current = nxt + try: self.set_timer(self._TIP_BLANK_SEC, lambda: self._show_tip(nxt)) + except Exception: self._show_tip(nxt) + + def _show_tip(self, tip: str) -> None: + try: bar = self.query_one("#tipbar", Static) + except Exception: return + bar.update(_tip_line(tip)) + # ---------------- UI refresh ---------------- def _system(self, text: str) -> None: if self.current_id is None: return @@ -3391,6 +3683,7 @@ def _refresh_all(self): self._refresh_topbar() self._refresh_sidebar() self._refresh_messages() + self._refresh_planbar() self._ensure_spinner() def _swap_input_for_session(self) -> None: @@ -3655,23 +3948,23 @@ def cached_render(content: str) -> "_MdRender": _SPINNER_FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏" - # Easter-egg 
gerunds rotated through the spinner annotation — keeps the - # streaming wait feeling alive rather than dead-frozen. + # Spinner gerund pool (stable per-message via id-hash; separate from _DONE_GERUNDS). _SPINNER_GERUNDS = ( "Pondering", "Reticulating", "Sleuthing", "Hatching", "Pouncing", "Brewing", "Sharpening", "Untangling", "Compiling", "Unraveling", "Distilling", "Calibrating", "Marinating", "Conjuring", "Foraging", "Spelunking", "Synthesizing", "Refactoring thoughts", "Tracing breadcrumbs", "Following the rabbit hole", + "Routing", "Threading", "Polling", "Spinning", "Hooking", + "Patching", "Caching", "Yielding", "Hydrating", "Folding", + "Streaming", "Resolving", "Reaping", "Tuning", ) def _spinner_glyph(self) -> str: return self._SPINNER_FRAMES[self._spinner_frame % len(self._SPINNER_FRAMES)] def _spinner_gerund(self, m) -> str: - # Stable per-message: rotate by message identity hash so the gerund - # doesn't strobe with every spinner frame. ID-keyed avoids ChatMessage - # __hash__ requirements and survives content mutation. + # ID-hashed → stable per-message; survives content mutation. idx = (id(m) // 16) % len(self._SPINNER_GERUNDS) return self._SPINNER_GERUNDS[idx] @@ -3684,33 +3977,94 @@ def _humanize_tokens(n: int) -> str: return f"{n / 1_000_000.0:.2f}M" def _spinner_annotation(self, m) -> Text: - """Render `⠋ Gerund... (Xm Ys · ↑ N.Nk tokens)` for a streaming message. - The gerund hue shifts with elapsed + token deltas (see _gerund_color).""" + """Render `⠋ Gerund… (Xm Ys · ↑ N · ↓ M)` for a streaming message. + ↑/↓ are the latest LLM call's prompt / completion sizes, gated on + cumulative counters moving past the baselines captured at stream start + (otherwise the prior turn's tail values leak in on prompt submit). 
+ """ out = Text() elapsed = int(time.time() - m._stream_started_at) if m._stream_started_at else 0 - delta_in = 0 - try: - import cost_tracker - sess = self.sessions.get(self.current_id) - tname = sess.thread.name if sess and sess.thread else f"ga-tui-agent-{self.current_id}" - t = cost_tracker.get(tname) - delta_in = max(0, t.input + t.cache_create + t.cache_read - m._stream_baseline_input) - except Exception: - pass - gerund_style = _gerund_color(elapsed, delta_in) + last_in, last_out = self._live_call_tokens(m) + gerund_style = _gerund_color(elapsed, last_in) out.append(self._spinner_glyph(), style=gerund_style) out.append(f" {self._spinner_gerund(m)}…", style=gerund_style) bits = [] if m._stream_started_at: bits.append(_fmt_elapsed(elapsed)) - if delta_in > 0: - bits.append(f"↑ {self._humanize_tokens(delta_in)} tokens") + if last_in > 0 or last_out > 0: + bits.append(f"↑ {self._humanize_tokens(last_in)} · ↓ {self._humanize_tokens(last_out)}") if bits: out.append(" (", style=C_DIM) out.append(" · ".join(bits), style=C_DIM) out.append(")", style=C_DIM) return out + def _live_call_tokens(self, m) -> tuple: + """`(last_in, last_out)` for this turn, gated on cumulative deltas past + the per-message baselines. Returns zeros until the new turn moves + the counters. Shared by spinner + done-card.""" + last_in = last_out = 0 + try: + import cost_tracker + sess = self.sessions.get(self.current_id) + tname = sess.thread.name if sess and sess.thread else f"ga-tui-agent-{self.current_id}" + t = cost_tracker.get(tname) + cum_in = t.input + t.cache_create + t.cache_read + cum_out = t.output + if cum_in > m._stream_baseline_input: last_in = t.last_input + if cum_out > m._stream_baseline_output: last_out = t.last_output + except Exception: + pass + return last_in, last_out + + # Settled-state braille pairs with the spinner frames (⠋…⠏ → ⠿). + _DONE_GLYPH = "⠿" + + # Past-tense pool for the post-turn card; reads "{Verb} for Xm Ys". 
+ _DONE_GERUNDS = ( + "Churned", "Ruminated", "Brewed", "Cooked", "Marinated", "Percolated", + "Distilled", "Crystallized", "Synthesized", "Sharpened", "Conjured", + "Pondered", "Spelunked", "Untangled", "Foraged", "Hatched", "Pounced", + "Sleuthed", "Unraveled", "Calibrated", "Mused", "Schemed", "Tinkered", + "Forged", "Simmered", "Steeped", + "Threaded", "Folded", "Patched", "Streamed", "Cached", "Hooked", + "Routed", "Resolved", "Yielded", "Hydrated", "Reaped", "Tuned", + "Plotted", "Reviewed", "Audited", "Verified", "Adjudicated", + "Conducted", "Orchestrated", + "Mapped", "Reduced", "Dispatched", + "Recalled", "Stashed", "Indexed", + ) + + def _done_gerund(self, m) -> str: + # Stable per-message — id-hash so re-mount (theme / resize / fold) keeps + # the verb; spinner uses a separate pool so live/settled never collide. + idx = (id(m) // 16) % len(self._DONE_GERUNDS) + return self._DONE_GERUNDS[idx] + + def _done_annotation(self, m) -> Text: + """Render `⠿ {Verb} for Xm Ys · ↑ N · ↓ M` after a turn finishes. + Numbers frozen via `_done_summary` so re-mounts / next turn don't + shift the line.""" + elapsed, last_in, last_out = m._done_summary or (0, 0, 0) + verb = self._done_gerund(m) + out = Text() + out.append(self._DONE_GLYPH + " ", style=C_GREEN) + out.append(f"{verb} for {_fmt_elapsed(int(elapsed))}", style=C_DIM) + if last_in > 0 or last_out > 0: + out.append(" · ", style=C_DIM) + out.append(f"↑ {self._humanize_tokens(last_in)} · ↓ {self._humanize_tokens(last_out)}", + style=C_DIM) + return out + + def _capture_done_summary(self, m) -> None: + """Freeze `(elapsed, last_in, last_out)` once when an assistant message + transitions done→True. 
Idempotent — repeat calls are no-ops so re-mounts + and stream-update passes won't overwrite the snapshot.""" + if m._done_summary is not None or not m.done: return + elapsed = (time.time() - m._stream_started_at) if m._stream_started_at else 0.0 + last_in, last_out = self._live_call_tokens(m) + m._done_summary = (elapsed, last_in, last_out) + def _has_streaming(self) -> bool: if self.current_id is None: return False @@ -3742,8 +4096,8 @@ def _spinner_tick(self) -> None: def _mark_stream_start(self, m) -> None: """Lazily timestamp a streaming message so the spinner can show elapsed/tokens. - Snapshots the current input-side token total as a baseline so the displayed - delta reflects *this* turn only.""" + Snapshots both input-side and output-side token totals as baselines so + the spinner's `↑ N · ↓ M` reflects *this* turn only.""" m._stream_started_at = time.time() try: import cost_tracker @@ -3751,8 +4105,10 @@ def _mark_stream_start(self, m) -> None: tname = sess.thread.name if sess and sess.thread else f"ga-tui-agent-{self.current_id}" t = cost_tracker.get(tname) m._stream_baseline_input = t.input + t.cache_create + t.cache_read + m._stream_baseline_output = t.output except Exception: m._stream_baseline_input = 0 + m._stream_baseline_output = 0 @staticmethod def _segment_sig(segs: list[tuple]) -> tuple: @@ -3833,13 +4189,31 @@ def _mount_assistant_segments(self, container, m: ChatMessage, segs: list[tuple] self._sync_spinner_widget(container, m, anchor) def _sync_spinner_widget(self, container, m: ChatMessage, anchor) -> None: - """Spinner is a tiny dedicated Static after segment widgets — outside Markdown - so unclosed code fences / paragraph trimming can't eat it. Mounted iff streaming.""" + """Tiny dedicated Static after segment widgets — outside Markdown so + unclosed code fences / paragraph trimming can't eat it. 
While streaming + shows the spinner annotation; once `m.done` flips True, the same widget + becomes the post-turn `⠿ Churned for Xm Ys` card (frozen via + `_capture_done_summary`).""" if m.done: - if m._spinner_widget is not None: - try: m._spinner_widget.remove() + # `_stream_started_at` is the marker that this message was actually + # streamed in this TUI session. Restored /continue history flips + # done=True without ever streaming, so skip the card there — a + # "⠿ Churned for 0s" badge under every archived turn is just noise. + if m._stream_started_at is None: + if m._spinner_widget is not None: + try: m._spinner_widget.remove() + except Exception: pass + m._spinner_widget = None + return + self._capture_done_summary(m) + if m._spinner_widget is None: + w = Static(self._done_annotation(m), classes="msg spinner") + if anchor is None: container.mount(w) + else: container.mount(w, after=anchor) + m._spinner_widget = w + else: + try: m._spinner_widget.update(self._done_annotation(m)) except Exception: pass - m._spinner_widget = None return if m._spinner_widget is None: if m._stream_started_at is None: @@ -3878,9 +4252,10 @@ def _stream_update_assistant(self, m: ChatMessage) -> None: last_widget._ga_render = None last_widget.update(Text(last_text, style=C_FG)) if m.done and m._spinner_widget is not None: - try: m._spinner_widget.remove() + # Convert the live spinner into the post-turn ⠿ card in place. + self._capture_done_summary(m) + try: m._spinner_widget.update(self._done_annotation(m)) except Exception: pass - m._spinner_widget = None return self._remount_assistant_message(m) @@ -3925,8 +4300,37 @@ def build_arg_parser() -> argparse.ArgumentParser: return argparse.ArgumentParser(description="GenericAgent TUI v2 (refined visual style)") +def _warn_mintty(): + """Warn only for direct Git Bash/mintty, not Git Bash inside Windows Terminal.""" + if sys.platform != 'win32': + return + # Direct Git Bash uses mintty. 
Git Bash hosted by Windows Terminal still sets + # MSYSTEM, but has WT_SESSION and renders Textual correctly, so do not block it. + term_prog = os.environ.get('TERM_PROGRAM', '').lower() + wt_session = os.environ.get('WT_SESSION', '') + direct_mintty = term_prog == 'mintty' and not wt_session + if direct_mintty: + print( + "\033[33m[ga-tui] WARNING: direct Git Bash/mintty detected.\033[0m\n" + " Textual TUI requires a modern terminal with full VT/xterm support.\n" + " Direct mintty can cause rendering issues (blank screen, garbled output).\n" + "\n" + " Recommended alternatives:\n" + " - Windows Terminal Git Bash: wt -p \"Git Bash\" python frontends/tuiapp_v2.py\n" + " - Windows Terminal: wt python frontends/tuiapp_v2.py\n" + " - CMD: python frontends\\tuiapp_v2.py\n" + " - PowerShell: python frontends/tuiapp_v2.py\n" + "\n" + " To continue anyway, set GA_TUI_FORCE=1", + file=sys.stderr, + ) + if not os.environ.get('GA_TUI_FORCE'): + raise SystemExit(1) + + def main(argv: Optional[list[str]] = None) -> int: build_arg_parser().parse_args(argv) + _warn_mintty() GenericAgentTUI().run() return 0 diff --git a/llmcore.py b/llmcore.py index 3ceaa0c1..2e239d32 100644 --- a/llmcore.py +++ b/llmcore.py @@ -297,12 +297,14 @@ def _record_usage(usage, api_mode): if not usage: return if api_mode == 'responses': cached = (usage.get("input_tokens_details") or {}).get("cached_tokens", 0) - inp = usage.get("input_tokens", 0) + inp = usage.get("input_tokens", 0); out = usage.get("output_tokens", 0) print(f"[Cache] input={inp} cached={cached}") + if out: print(f"[Output] tokens={out}") elif api_mode == 'chat_completions': cached = (usage.get("prompt_tokens_details") or {}).get("cached_tokens", 0) - inp = usage.get("prompt_tokens", 0) + inp = usage.get("prompt_tokens", 0); out = usage.get("completion_tokens", 0) print(f"[Cache] input={inp} cached={cached}") + if out: print(f"[Output] tokens={out}") elif api_mode == 'messages': ci, cr, inp = usage.get("cache_creation_input_tokens", 0), 
usage.get("cache_read_input_tokens", 0), usage.get("input_tokens", 0) print(f"[Cache] input={inp} creation={ci} read={cr}") diff --git a/memory/review_sop.md b/memory/review_sop.md new file mode 100644 index 00000000..f3d719d0 --- /dev/null +++ b/memory/review_sop.md @@ -0,0 +1,169 @@ +# Review Mode SOP + +> In-session adversarial code reviewer。用 `/review` 触发,主 agent 在当前对话内 +> 拉起评审,报告直接 echo 到对话,**不开 subagent / 不落盘 / 不打 sentinel**。 + +--- + +## 一、何时使用 + +用户输入 `/review` 命令,或自然语言要求"code review"时启用。 +典型用例:作者刚写完一段代码 → `/review` 对自己的改动做对抗性 review。 + +--- + +## 二、快速启动 + +| 命令 | 行为 | +|---|---| +| `/review` | 默认审本次 uncommitted 改动(主 agent 跑 `git diff --stat HEAD` + `git diff HEAD`) | +| `/review <自然语言请求>` | 按描述的范围去审(可指定文件 / 目录 / 任务) | +| `/review help` | 显示用法 | + +**非 git 仓库**:主 agent 提示用户在下一句 `/review` 塞入具体路径或范围,本轮结束。 + +--- + +## 三、入口文件 + +``` +任意前端 (TUI / Streamlit / wechat / desktop) + └─ frontends/review_cmd.py ← 命令分发,剥 "/review" 前缀,注入 user_request + └─ memory/review_sop/review_inline_prompt.txt ← 完整 in-session 协议 + └─ memory/code_review_principles.md ← 15 条好代码原则 +``` + +- `review_cmd.py:install()` —— monkey-patch `GenericAgent._handle_slash_cmd`,统一接管 `/review` +- `review_cmd.py:_render_prompt()` —— 加载 prompt 模板,注入 `{user_request}` + `{ga_root}` + +--- + +## 四、三条铁律(reviewer 顶部硬约束,不可违反) + +1. **Review-only 只读评审** —— 评审与报告而已。**禁止**修改源文件、调 + file_write / file_patch / code_run 改业务代码、在产出里写"我接下来去修一下" + 或暗示要动手。 +2. **Challenge the approach, 不仅找 bug** —— 先问"这条路本身对不对?"再问 + "实现有没有 bug?":挖隐含假设、评估真实环境故障模式(Windows 路径 / 代理失活 / + 并发写 / UTF-8 边界 / token 预算耗尽)。 +3. 
**报告输出完即结束** —— 不复述用户目标、不做 meta 评论、不承诺 follow-up; + 报告 markdown 直接 echo 到对话,**不落盘 review.md、不打 `[ROUND END]`**。 + +--- + +## 五、工作流(5 步,顺序走) + +### 步骤 1:必读底料 + +`file_read("memory/code_review_principles.md")` —— 15 条好代码原则,**每条 finding 必须 +能映射到其中一条**。 + +### 步骤 2:锁定审阅范围 + +| 用户输入 | 范围 | +|---|---| +| 点名了文件 / 目录 | 审那些 | +| 描述了任务范围 | `code_run` 跑 `git status -s` + `git diff --stat HEAD` + `git diff HEAD` | +| 空 / 模糊 | 默认审本次 uncommitted 改动 | +| 非 git 仓库 | 提示用户塞路径,本轮结束 | + +**先把范围列出来发给用户确认**,再开始 `file_read`。 + +### 步骤 3:逐文件 file_read + +超过 800 行分段读。优先看 diff 涉及的行,再看上下文与接口调用方。 + +### 步骤 4:回答 Q1-Q4 对抗性 framing + +- **Q1: Is this the right approach?** — 有没有更简单 / 更标准 / 更安全的实现路径? +- **Q2: What hidden dependencies could fail?** — OS / shell / 网络 / 并发 / 第三方 API 任一失效? +- **Q3: What edge / hostile input breaks it?** — 空值、UTF-8、Windows 路径、超长输入、过期 token。 +- **Q4: Is the failure mode observable & recoverable?** — 仅看日志能不能定位?能不能不动手就恢复? + +### 步骤 5:列 P0~P3 findings + +遵守 §七 防误报八规则 + §八 措辞八规范。提交前过自检清单(见 `memory/review_sop/review_inline_prompt.txt` 的自检节)。 + +--- + +## 六、Severity / Verdict 速查 + +| Level | 定义 | 例子 | +|---|---|---| +| **P0** | 阻塞:破坏正确性 / 丢数据 / 安全漏洞 / 不可逆故障 | 路径穿越、SQL 注入、密钥落日志、并发竞态破坏数据 | +| **P1** | 高危:契约破坏 / 用户可见错误,但不会立即崩 | 错误只 print 不抛、超时未设、API schema 不一致 | +| **P2** | 维护性:可读性 / 命名 / 测试空缺 | 函数 > 80 行、duplicate logic、注释与代码不符 | +| **P3** | 风格 / 微优化 / 可选改进 | 命名小调整、常量提取、import 顺序 | + +**Verdict 决议**:任一 P0 → `FAIL`;无 P0 但 ≥ 1 P1 → `CONDITIONAL`;仅 P2/P3 或 0 finding → `PASS`。 + +--- + +## 七、防误报八规则(成本低到高,任一答 No → 删 finding) + +1. **Discrete & actionable** — 有具体可写的修复? +2. **Introduced or exposed by this change** — 本次改动引入或放大? +3. **Not an intentional design choice** — 不是作者刻意取舍? +4. **Provably affected, not speculated** — 跨文件影响能指出调用栈? +5. **Evidence-anchored** — 行号 / 代码片段 / 复现至少一项? +6. **No unstated assumptions** — 不依赖未明说的"应该这样"? +7. **Author would likely fix if made aware** — 作者会同意修? +8. **Impact meaningful + proportionate rigor** — 影响足够 + 严谨度匹配代码库? 
+ +> 每条规则的展开详见 `memory/review_sop/review_inline_prompt.txt` §5。 + +--- + +## 八、措辞八规范 + +1. **Why-first** — 第一句给原因。 +2. **严重度准确** — 不要把 P2 写得像 P0。 +3. **简洁** — `evidence` / `impact` / `fix` 各 ≤ 1 段。 +4. **少贴大段代码** — `evidence` 代码 ≤ 5 行,超过用 `file:line-line` 引用。 +5. **触发条件显式** — `impact` 首句必带场景 / 输入 / 环境。 +6. **不卑不亢** — 直陈事实,无情绪 / 无开场白。 +7. **即读即懂** — 核心结论放第一句。 +8. **零奉承** — 不写 "Great work, but...", "Thanks for the changes, however..."。 + +> 展开详见 `memory/review_sop/review_inline_prompt.txt` §6。 + +--- + +## 九、输出协议(整段 echo,不落盘) + +``` +## Scope +<一行一个文件,绝对路径或仓库相对路径> + +## Verdict +PASS / CONDITIONAL / FAIL + +## Summary +3-6 行散文:整体印象 + 最重要的 1-2 个风险。 + +## Design Challenge (Q1-Q4) +- **Q1 是不是对的方法**: <证据> +- **Q2 隐藏依赖**: <证据> +- **Q3 边缘 / 敌意输入**: <证据> +- **Q4 故障可观测**: <证据> + +## Findings (P0 → P3 顺序) +- **[P0, conf=0.9] file:line-line** 标题(动词开头,≤ 80 字,第一句给原因) + - **Evidence**: 代码片段 ≤ 5 行 或 file:N-M 引用 + - **Impact**: 触发场景 + 后果(第一句必带场景) + - **Fix**: 可直接照做的修复思路,≤ 1 段 + - **Principle**: 对应 code_review_principles 第 N 条 + +## Cross-file notes +跨文件耦合 / 命名一致性 / 状态机 / 并发问题。无则 `(none)`。 + +## Regression tests +3-5 条具体测试点(输入 / 预期 / 边界)。 +``` + +--- + +## 十、扩展点 + +- **自定义评审条目**:编辑 `memory/code_review_principles.md`,reviewer 启动时整段注入 +- **触发更换**:要把 `/review` 改成别的命令,只动 `frontends/review_cmd.py` 的 `install()` 一处 diff --git a/memory/review_sop/review_inline_prompt.en.txt b/memory/review_sop/review_inline_prompt.en.txt index 110296f0..de098334 100644 --- a/memory/review_sop/review_inline_prompt.en.txt +++ b/memory/review_sop/review_inline_prompt.en.txt @@ -1,58 +1,141 @@ [/review in-session] # Role & Boundary + You are the adversarial code reviewer running **in this session**. You do **not** spawn a subagent; continue the conversation here and **echo your report directly into the chat** as the final reply. -- **Read-only**: do NOT call file_write / file_patch on business code; do NOT promise “I'll fix it next”. 
+ +- **Read-only**: do NOT call file_write / file_patch on business code; do NOT promise "I'll fix it next". - **No review.md**: do NOT write a file to disk; do NOT print `[ROUND END]`. - **Done after the report**: no further tool calls. -# User request (this round) +⚠ Challenge the approach, not just defects: ask "is this path right?" before "does the implementation have bugs?". Surface implicit assumptions; evaluate real-world failure modes (Windows paths, dead proxies, concurrent writes, UTF-8 boundaries, expired tokens). + +--- +# 1. User request (this round) + {user_request} -# Workflow (in order) +--- +# 2. Workflow (in order, no skipping) ## Step 1: mandatory reading -1. `file_read("{ga_root}/memory/code_review_principles.md")` — 15 good-code principles; every finding must map to one. -2. This `/review` only reads `memory/review_sop/` and `memory/code_review_principles.md`; do not reference other workflow prompts. + +`file_read("{ga_root}/memory/code_review_principles.md")` — 15 good-code +principles; **every finding must map to one**. This `/review` only reads +`memory/review_sop/` and `memory/code_review_principles.md`; do not pull +from other workflow prompts. ## Step 2: lock the review scope + Resolve the user request by priority: -1. User named files / dirs explicitly → review those. -2. User described a task scope → run `git status -s`, `git diff --stat HEAD`, `git diff HEAD`, and `git log --oneline -5` if needed. -3. Empty / vague request → default to the current uncommitted diff: run `git diff --stat HEAD` and `git diff HEAD`. -4. If git fails and scope is still unclear → tell the user to provide file paths or a concrete scope in the next `/review`, then stop. -Do not ask for extra confirmation after locking the scope; list the actual scope in the final report. +1. User named files / dirs explicitly → review those; +2. 
User described a task scope → run `git status -s`, `git diff --stat HEAD`, + `git diff HEAD`, and `git log --oneline -5` if needed; +3. Empty / vague request → default to the uncommitted diff: run + `git diff --stat HEAD` + `git diff HEAD`; +4. If git fails and scope is still unclear → tell the user to provide file + paths or a concrete scope in the next `/review`, then stop. + +**Do not ask for extra confirmation** after locking; list the actual scope +in the final report. ## Step 3: file_read each reviewed file -> Split files over 800 lines. Start with diff-touched lines, then surrounding context and callers. -## Step 4: answer Q1-Q4 (adversarial framing) -- Q1 right approach? Is there a simpler / standard / safer path? -- Q2 hidden dependencies? What happens if OS, shell, network, concurrency, input, or third-party APIs fail? -- Q3 edge / hostile input? Empty, huge, encoded, path-like, permission-denied, repeated calls? -- Q4 failure observability? Are failures explicit, localizable, and reproducible? +Split files over 800 lines. **Prioritize diff-touched lines**, then +surrounding context and callers. -## Step 5: list P0-P3 findings -Each finding must pass the eight false-positive checks: -1. discrete and localizable; 2. introduced or exposed in scope; 3. not clearly intentional; 4. real affected path; 5. anchored to code/log evidence; 6. no unstated assumptions; 7. author would plausibly fix it; 8. impact matches severity. +## Step 4: Q1-Q4 adversarial framing (at least 1 concrete evidence per Q) -Writing rules: why-first, accurate, brief, no large code dumps, explicit trigger scenario, matter-of-fact, immediately graspable, no flattery. +- **Q1: Is this the right approach?** — Is there a simpler / standard / + safer path? Which implicit assumptions does the current path rely on? +- **Q2: What hidden dependencies could fail?** — OS / shell / network / + concurrency / user input / third-party API — what if any one fails? 
+- **Q3: What edge / hostile input breaks it?** — empty values, UTF-8 + boundaries, Windows paths, oversized strings, concurrent writes, expired + tokens, dead proxies. +- **Q4: Is the failure mode observable & recoverable?** — Can logs alone + localize the fault? Can it recover without manual action? -# Severity -- **P0** blocker: correctness break / data loss / security hole / irreversible failure -- **P1** high: contract break / user-visible error / likely near-term failure -- **P2** maintainability: naming / readability / future bug risk, not currently broken -- **P3** style / micro-optimization / optional improvement +## Step 5: list P0-P3 findings -# Output protocol (echo this structure into the chat) +Per §3 Severity / §5 false-positive rules / §6 wording rules. **Every +finding must pass §5 — any No → drop**; every finding's wording must +follow §6. + +--- +# 3. Severity (strict, don't invent) + +| Level | Definition | Examples | +|---|---|---| +| **P0** | Blocker: correctness break / data loss / security hole / irreversible failure | Path traversal unchecked, SQL injection, secret in log, race breaks data, unhandled exception swallowing critical finally | +| **P1** | High: contract break / user-visible error, but not immediate crash | Error handling print-not-raise, missing timeout, API schema mismatch, hardcoded config | +| **P2** | Maintainability: readability / naming / test gaps that raise future bug risk | Function > 80 lines, duplicate logic, comment-vs-code mismatch, missing test coverage | +| **P3** | Style / micro-optimization | Naming tweak, constant extraction, import order | + +--- +# 4. Verdict rule (strict) + +| Trigger | Verdict | +|---|---| +| Any P0 | **FAIL** | +| No P0, ≥ 1 P1 | **CONDITIONAL** | +| Only P2/P3 or zero findings | **PASS** | + +--- +# 5. False-positive checks (cost-low to cost-high; any No → drop the finding) + +1. **Discrete & actionable** — Is there a concrete fix to write? 
"Not + elegant overall" is not a finding; tangled small issues should be split. +2. **Introduced or exposed by this change** — Was it introduced or + amplified by this change? Don't dig up legacy bugs; if a pre-existing + bug is amplified, mark it `pre-existing, exposed by this change`. +3. **Not an intentional design choice** — Don't treat the author's + deliberate trade-off as a bug: kept-for-compat layers, intentionally + loose try/except fallbacks, style choices — these are not bugs. +4. **Provably affected, not speculated** — Cross-file impact must point to + **the specific call stack** that breaks. Pure speculation "this might + affect X" doesn't count. +5. **Evidence-anchored** — Line numbers, code snippets, or repro commands + — at least one. Drop "looks", "should", "maybe". +6. **No unstated assumptions** — Don't rely on unspecified "the codebase + should be X" conventions; if the finding requires assuming author + intent → drop. +7. **Author would likely fix if made aware** — Would the author agree to + fix? Don't pack P1 with extreme assumptions like "100M QPS would melt". +8. **Impact meaningful + proportionate rigor** — Impact must touch + accuracy / performance / security / maintainability; and don't exceed + the codebase's own rigor level (a one-shot script repo doesn't need + PR-level comments and input validation). + +--- +# 6. Finding wording rules (apply to title / body / evidence / impact / fix) + +1. **Why-first** — first sentence gives the reason, no preamble. +2. **Severity accurate** — don't write a P2 like a P0; if the trigger is + narrow, call it out in `impact` immediately. +3. **Brief** — `evidence` / `impact` / `fix` ≤ 1 paragraph each; don't + hard-wrap prose unless a code snippet needs it. +4. **Don't dump big code** — `evidence` snippets ≤ 5 lines; longer → + reference as `file:line-line` instead of pasting. +5. 
**Explicit trigger** — `impact`'s first sentence names the + **scenario / input / environment** ("when the Windows path contains + CJK chars..."), don't make the reader infer. +6. **Matter-of-fact** — state facts; no "obviously", "terrible", + "stupid"; no "thanks for the changes", "great work" openers either. +7. **Immediately graspable** — main conclusion in the first sentence; + re-write any reading-twice finding. +8. **Zero flattery** — no "Great work, but...", "Thanks for the changes, + however...". Go straight to the finding body. + +--- +# 7. Output protocol (echo this structure into the chat) ## Scope -List reviewed files, one path per line. +List reviewed files, one absolute or repo-relative path per line. ## Verdict -PASS / CONDITIONAL / FAIL -> Rule: any P0 → FAIL; no P0 but ≥ 1 P1 → CONDITIONAL; only P2/P3 or zero finding → PASS. +PASS / CONDITIONAL / FAIL — per §4. ## Summary 3-6 prose lines: what you read, overall impression, top 1-2 risks. @@ -63,22 +146,36 @@ PASS / CONDITIONAL / FAIL - **Q3 edge / hostile input**: - **Q4 failure observability**: -## Findings (P0 → P3) -For each finding: -- **[P0, conf=0.9] file:line-line** title (imperative, ≤ 80 chars) +## Findings (P0 → P3 order) +For each: +- **[P0, conf=0.9] file:line-line** title (imperative verb, ≤ 80 chars, + first sentence gives the reason) - **Evidence**: code snippet ≤ 5 lines OR file:N-M reference - - **Impact**: trigger scenario + consequence (first sentence names scenario/input/env) + - **Impact**: trigger scenario + consequence (first sentence names + scenario / input / env) - **Fix**: directly-actionable patch sketch, ≤ 1 paragraph - **Principle**: maps to code_review_principles #N ## Cross-file notes -Coupling / naming / state machine / concurrency. `(none)` if nothing. +Coupling / naming consistency / state machine / concurrency. `(none)` if +nothing. ## Regression tests 3-5 concrete test points (input / expected / boundary). 
-# Self-check -- Every finding passes the false-positive and writing rules -- `confidence_score` is honest: real bug → ≥ 0.8; uncertain → < 0.5 -- Verdict matches the rule -- No flattery / opener / goal-paraphrase / promise to fix +--- +# 8. Self-check (run before submitting) + +- [ ] `code_review_principles.md` was file_read +- [ ] Every reviewed file was file_read at least once +- [ ] All four `Design Challenge` fields have concrete evidence, not + hand-waving +- [ ] Every finding passes §5 false-positive rules (discrete / introduced / + not-intentional / provably-affected / evidence-anchored / + no-unstated-assumptions / would-fix / impact-meaningful) +- [ ] Every finding follows §6 wording rules (why-first / accurate / brief / + no-big-code / scenario-explicit / matter-of-fact / + immediately-graspable / no-flattery) +- [ ] `confidence_score` honest: real bug → ≥ 0.8; uncertain → < 0.5 +- [ ] Verdict matches §4 rule +- [ ] No flattery / opener / goal-paraphrase / promise to fix diff --git a/memory/review_sop/review_inline_prompt.txt b/memory/review_sop/review_inline_prompt.txt index ebeebaac..49a4dbf8 100644 --- a/memory/review_sop/review_inline_prompt.txt +++ b/memory/review_sop/review_inline_prompt.txt @@ -1,84 +1,161 @@ [/review in-session] # 角色与边界 -你是当前 session 内的 adversarial code reviewer。**你不切到独立 subagent**,就在这条对话里继续工作,把审阅报告**直接 echo 到对话**——这就是给用户的最终回答。 -- **只读**:禁止 file_write / file_patch 任何业务代码;禁止承诺“我下面去修”。 -- **不写 review.md**:不要写文件落盘,也不要在末尾打 `[ROUND END]`。 -- **报告输出完即结束**:不再调任何工具。 -# 本轮用户请求 -{user_request} - -# 工作流(顺序执行) - -## 步骤 1:必读底料 -1. file_read("{ga_root}/memory/code_review_principles.md") —— 15 条好代码原则,每条 finding 必须能映射到其中一条。 -2. 本轮 `/review` 只读取 `memory/review_sop/` 与 `memory/code_review_principles.md`,不引用其他工作流 prompt。 - -## 步骤 2:锁定审阅范围 -按优先级解析“本轮用户请求”: -1. 用户明确点名文件 / 目录 → 审那些。 -2. 用户描述任务范围 → 用 `code_run` 跑 `git status -s`、`git diff --stat HEAD`、`git diff HEAD`,必要时看 `git log --oneline -5`。 -3. 
用户请求为空 / 模糊 → 默认审本次 uncommitted diff:跑 `git diff --stat HEAD` 与 `git diff HEAD`。 -4. git 失败且范围仍不可判定 → 告诉用户“请在下一句 `/review` 塞入文件路径或具体范围”,本轮结束。 +你是当前 session 内的 adversarial code reviewer。**你不切到独立 subagent**,就在这条对话里继续工作,把审阅报告**直接 echo 到对话**——这就是给用户的最终回答。 -锁定范围后不要额外 ask_user;在最终报告的 Scope 中列清楚实际审了什么。 +- **只读**:禁止 file_write / file_patch 任何业务代码;禁止承诺"我下面去修"。 +- **不写 review.md**:不要写文件落盘,也不要在末尾打 `[ROUND END]`。 +- **报告输出完即结束**:不再调任何工具。 -## 步骤 3:逐文件 file_read -> 800 行以上分段读。优先看 diff 涉及的行,再看上下文与接口调用方。 +⚠ Challenge the approach,不仅找 bug:先问"这条路本身对不对?",再问"实现有没有 bug?"。 +挖隐含假设,评估真实环境故障模式(Windows 路径 / 代理失活 / 并发写 / UTF-8 边界 / 过期 token)。 -## 步骤 4:回答 Q1-Q4(对抗性 framing) -- Q1 是不是对的方法?有没有更简单 / 标准 / 安全的路径? -- Q2 隐藏依赖?OS、shell、网络、并发、用户输入、第三方 API 任一失效会怎样? -- Q3 边缘 / 敌意输入?空值、超长、编码、路径、权限、重复调用会怎样? -- Q4 故障可观测?失败是否显式、可定位、可复现? +--- +# 1. 本轮用户请求 -## 步骤 5:列 P0-P3 findings -每条 finding 必须先通过防误报八规则: -1. 问题是离散、可定位的;2. 是本次范围内引入或暴露的;3. 不是明显有意设计;4. 有真实受影响路径;5. 证据锚定到代码 / 日志;6. 不依赖未说明假设;7. 作者看到会愿意修;8. 影响与 severity 匹配。 - -措辞规则:why-first、准确、简短、少贴大段代码、明确触发场景、就事论事、一眼可懂、不要奉承。 - -# Severity 速查 -- **P0** 阻塞:破坏正确性 / 丢数据 / 安全洞 / 不可逆故障 -- **P1** 高危:契约破坏 / 用户可见错误 / 不立即崩但很快出事 -- **P2** 维护性:命名 / 可读性,未来 bug 概率高但当前不破 -- **P3** 风格 / 微优化 / 可选改进 +{user_request} -# 输出协议(整段 echo 到对话) +--- +# 2. 工作流(顺序执行,禁止跳读) + +## 步骤 1:必读底料 + +`file_read("{ga_root}/memory/code_review_principles.md")` —— 15 条好代码原则, +**每条 finding 必须能映射到其中一条**。本轮 `/review` 只读取 `memory/review_sop/` +与 `memory/code_review_principles.md`,不引用其他工作流 prompt。 + +## 步骤 2:锁定审阅范围 + +按"本轮用户请求"的优先级解析: + +1. 用户明确点名文件 / 目录 → 审那些; +2. 用户描述任务范围 → 用 `code_run` 跑 `git status -s` / `git diff --stat HEAD` / + `git diff HEAD`,必要时看 `git log --oneline -5`; +3. 用户请求为空 / 模糊 → 默认审本次 uncommitted diff:跑 + `git diff --stat HEAD` 与 `git diff HEAD`; +4. 
git 失败且范围仍不可判定 → 告诉用户"请在下一句 `/review` 塞入文件路径或具体范围", + 本轮结束。 + +锁定范围后**不要 ask_user**;在最终报告的 Scope 中列清楚实际审了什么。 + +## 步骤 3:逐文件 file_read + +超过 800 行分段读。**优先看 diff 涉及的行**,再看上下文与接口调用方。 + +## 步骤 4:Q1-Q4 对抗性 framing(每问至少 1 条具体证据) + +- **Q1: Is this the right approach?** —— 有没有更简单 / 更标准 / 更安全的实现路径? + 当前路径依赖了哪些隐含假设? +- **Q2: What hidden dependencies could fail?** —— OS / shell / 网络 / 并发 / 用户输入 / + 第三方 API,任一项失效会怎样? +- **Q3: What edge / hostile input breaks it?** —— 空值、UTF-8 边界、Windows 路径、 + 超长字符串、并发写、过期 token、死代理。 +- **Q4: Is the failure mode observable & recoverable?** —— 仅看日志能不能定位故障? + 能不能不动手就恢复? + +## 步骤 5:列 P0-P3 findings + +按 §3 Severity / §5 防误报八规则 / §6 措辞八规范操作。**每条 finding 提交前过 §5 自检, +任一答 No → 删**;**每条 finding 措辞遵守 §6 八条**。 + +--- +# 3. Severity 定义(严格遵守,不要自创) + +| Level | 定义 | 例子 | +|---|---|---| +| **P0** | 阻塞:破坏正确性 / 丢数据 / 安全漏洞 / 不可逆故障 | 路径穿越未校验、SQL 注入、密钥落日志、并发竞态破坏数据、未捕获异常吃掉关键 finally | +| **P1** | 高危:契约破坏 / 用户可见错误,但不会立即崩 | 错误处理只 print 不抛、超时未设、配置写死、API schema 不一致 | +| **P2** | 维护性:可读性 / 命名 / 测试空缺,会增加未来 bug 概率但当前不破 | 函数 > 80 行、变量名歧义、注释与代码不符、duplicate logic、测试覆盖空缺 | +| **P3** | 风格 / 微优化 / 可选改进 | 命名小调整、常量提取、import 顺序 | + +--- +# 4. Verdict 决议(严格遵守) + +| 触发条件 | Verdict | +|---|---| +| 任一 P0 | **FAIL** | +| 无 P0,≥ 1 P1 | **CONDITIONAL** | +| 仅 P2/P3 或 0 finding | **PASS** | + +--- +# 5. 防误报八规则(按"成本从低到高"自查,任一答 No → 删 finding) + +1. **Discrete & actionable** —— 有具体可写的修复吗?"整体不够优雅"不算 finding; + 多个交织的小问题要拆开各自记录。 +2. **Introduced or exposed by this change** —— 是本次改动引入或放大的吗?祖传 bug 不要翻; + 预存 bug 被本次改动放大 → 显式标 `pre-existing, exposed by this change`。 +3. **Not an intentional design choice** —— 不要把作者的有意取舍当 bug:刻意保留的兼容层、 + 有意宽松的 try/except 兜底、风格选择 —— 这些不是 bug。 +4. **Provably affected, not speculated** —— 跨文件影响必须能指出**哪一段调用栈**会被破坏; + 纯臆想"这可能影响 X 模块"不写。 +5. **Evidence-anchored** —— 行号、代码片段、复现命令至少一项。"看起来"、"应该"、 + "或许"全删。 +6. **No unstated assumptions** —— 不要依赖未明说的"代码库应该这样"约定;如果 finding + 需要先假设作者意图才成立 → 删。 +7. 
**Author would likely fix if made aware** —— 作者看到会同意修吗?"100 万 QPS 才塌" + 这种极端假设不要塞 P1。 +8. **Impact meaningful + proportionate rigor** —— 影响必须涉及 accuracy / performance / + security / maintainability 之一;同时不要超出代码库本身的严谨度(一次性脚本仓库 + 不要求 PR 级注释和输入校验)。 + +--- +# 6. 措辞八规范(写每条 finding 时遵守) + +1. **Why-first** —— 第一句给原因,不绕弯。 +2. **严重度准确** —— 不要把 P2 写得像 P0;触发条件苛刻就在 impact 里立刻点出。 +3. **简洁** —— `evidence` / `impact` / `fix` 各 ≤ 1 段;除非代码片段需要换行,散文里不硬换行。 +4. **少贴大段代码** —— `evidence` 中代码 ≤ 5 行;超过用 `file:line-line` 引用,不要粘贴。 +5. **触发条件显式** —— `impact` 第一句就讲清在什么**场景 / 输入 / 环境**下出问题 + ("在 Windows 路径含中文时…"),不让读者自己脑补。 +6. **不卑不亢** —— 直陈事实,不带"显然""糟糕""太蠢"等情绪;也不带"非常感谢" + "做得很好"等开场白。 +7. **即读即懂** —— 核心结论放第一句;需要读两遍才能懂的 finding 要重写。 +8. **零奉承** —— 不写 "Great work, but..."、"Thanks for the changes, however..."。 + +--- +# 7. 输出协议(整段 echo 到对话) ## Scope -列出本轮审阅的文件清单(一行一个,绝对路径或仓库相对路径)。 +列出本轮审阅的文件清单(一行一个,绝对路径或仓库相对路径)。 ## Verdict -PASS / CONDITIONAL / FAIL -> 决策规则:任一 P0 → FAIL;无 P0 但 ≥ 1 P1 → CONDITIONAL;仅 P2/P3 或 0 finding → PASS。 +PASS / CONDITIONAL / FAIL —— 按 §4 决议规则。 ## Summary -3-6 行散文:你看了什么、整体印象、最重要的 1-2 个风险。 +3-6 行散文:你看了什么、整体印象、最重要的 1-2 个风险。 ## Design Challenge (Q1-Q4) -- **Q1 是不是对的方法**:<证据> -- **Q2 隐藏依赖**:<证据> -- **Q3 边缘 / 敌意输入**:<证据> -- **Q4 故障可观测**:<证据> +- **Q1 是不是对的方法**:<证据> +- **Q2 隐藏依赖**:<证据> +- **Q3 边缘 / 敌意输入**:<证据> +- **Q4 故障可观测**:<证据> ## Findings(P0 → P3 顺序) -按下面格式列出每条: -- **[P0, conf=0.9] file:line-line** 标题(动词开头,≤ 80 字) - - **Evidence**:代码片段 ≤ 5 行 或 file:N-M 引用 - - **Impact**:触发场景 + 后果(第一句必带场景 / 输入 / 环境) - - **Fix**:可直接照做的修复思路(伪码或 patch),≤ 1 段 - - **Principle**:对应 code_review_principles 第 N 条 +按下面格式列出每条: +- **[P0, conf=0.9] file:line-line** 标题(动词开头,≤ 80 字,第一句给原因) + - **Evidence**:代码片段 ≤ 5 行 或 file:N-M 引用 + - **Impact**:触发场景 + 后果(第一句必带场景 / 输入 / 环境) + - **Fix**:可直接照做的修复思路(伪码或 patch),≤ 1 段 + - **Principle**:对应 code_review_principles 第 N 条 ## Cross-file notes 跨文件耦合 / 命名一致性 / 状态机 / 并发问题。无则 `(none)`。 ## Regression tests -3-5 条具体测试点(输入 / 预期 / 边界)。 - -# 自检 -- 每条 finding 通过防误报八规则与措辞规则 -- 
`confidence_score` 老实给:真 bug → ≥ 0.8;拿不准 → < 0.5 -- Verdict 与决策规则一致 -- 没有奉承 / 开场白 / 复述用户目标 / 承诺修复 +3-5 条具体测试点(输入 / 预期 / 边界)。 + +--- +# 8. 自检(提交前最后过一遍) + +- [ ] `code_review_principles.md` 已 file_read +- [ ] 每个待审文件都至少 file_read 一次 +- [ ] `Design Challenge` 4 个字段都有具体证据,不是空话 +- [ ] 每条 finding 通过 §5 防误报八规则(discrete / introduced / not-intentional / + provably-affected / evidence-anchored / no-unstated-assumptions / would-fix / + impact-meaningful) +- [ ] 每条 finding 通过 §6 措辞八规范(why-first / accurate / brief / no-big-code / + scenario-explicit / matter-of-fact / immediately-graspable / no-flattery) +- [ ] `confidence_score` 老实给:真 bug → ≥ 0.8;拿不准 → < 0.5 +- [ ] Verdict 与 §4 决议规则一致 +- [ ] 没有奉承 / 开场白 / 复述用户目标 / 承诺修复