Merge pull request #15 from Philipp0205/phkurrle-photo-support

grinev · web-flow · commit 373315135a4e · 2026-02-26T01:10:59.000+03:00
feat: add photo support
diff --git a/PRODUCT.md b/PRODUCT.md
@@ -139,7 +139,7 @@ Open tasks for upcoming iterations:
 - [ ] Add server crash notifications in Telegram
 - [ ] Add periodic health checks and optional auto-restart for OpenCode server
 - [ ] Improve Telegram-compatible message formatting for richer outputs
-- [ ] Support sending files from Telegram to OpenCode (screenshots, documents)
+- [x] Support sending photos from Telegram to OpenCode (screenshots, images)
 - [ ] Provide a Docker image and basic container deployment guide
 - [x] Add voice transcription
 
diff --git a/src/bot/handlers/prompt.ts b/src/bot/handlers/prompt.ts
@@ -1,4 +1,5 @@
 import { Bot, Context } from "grammy";
+import type { FilePartInput, TextPartInput } from "@opencode-ai/sdk/v2";
 import { opencodeClient } from "../../opencode/client.js";
 import { clearSession, getCurrentSession, setCurrentSession } from "../../session/manager.js";
 import { ingestSessionInfoForCache } from "../../session/cache-manager.js";
@@ -77,14 +78,19 @@ export interface ProcessPromptDeps {
 
 /**
  * Processes a user prompt: ensures project/session, subscribes to events, and sends
- * the prompt to OpenCode. Used by both text and voice message handlers.
+ * the prompt to OpenCode. Used by text, voice, and photo message handlers.
  *
+ * @param ctx - Grammy context
+ * @param text - Text content of the prompt
+ * @param deps - Dependencies (bot and event subscription)
+ * @param fileParts - Optional file parts (for photo/document attachments)
  * @returns true if the prompt was dispatched, false if it was blocked/failed early.
  */
 export async function processUserPrompt(
   ctx: Context,
   text: string,
   deps: ProcessPromptDeps,
+  fileParts: FilePartInput[] = [],
 ): Promise<boolean> {
   const { bot, ensureEventSubscription } = deps;
 
@@ -193,17 +199,36 @@ export async function processUserPrompt(
     const currentAgent = getStoredAgent();
     const storedModel = getStoredModel();
 
+    // Build parts array with text and files
+    const parts: Array<TextPartInput | FilePartInput> = [];
+
+    // Add text part if present
+    if (text.trim().length > 0) {
+      parts.push({ type: "text", text });
+    }
+
+    // Add file parts
+    parts.push(...fileParts);
+
+    // If there are file parts but no text part, prepend a placeholder text part
+    if (parts.length === 0 || (parts.length > 0 && parts.every((p) => p.type === "file"))) {
+      if (fileParts.length > 0) {
+        // Files without text - add a minimal placeholder text part (not a system prompt)
+        parts.unshift({ type: "text", text: "See attached file" });
+      }
+    }
+
     const promptOptions: {
       sessionID: string;
       directory: string;
-      parts: Array<{ type: "text"; text: string }>;
+      parts: Array<TextPartInput | FilePartInput>;
       model?: { providerID: string; modelID: string };
       agent?: string;
       variant?: string;
     } = {
       sessionID: currentSession.id,
       directory: currentSession.directory,
-      parts: [{ type: "text", text }],
+      parts,
       agent: currentAgent,
     };
 
@@ -228,9 +253,12 @@ export async function processUserPrompt(
       modelId: storedModel.modelID || "default",
       variant: storedModel.variant || "default",
       promptLength: text.length,
+      fileCount: fileParts.length,
     };
 
-    logger.info(`[Bot] Calling session.prompt (fire-and-forget) with agent=${currentAgent}...`);
+    logger.info(
+      `[Bot] Calling session.prompt (fire-and-forget) with agent=${currentAgent}, fileCount=${fileParts.length}...`,
+    );
 
     // CRITICAL: DO NOT wait for session.prompt to complete.
     // If we wait, the handler will not finish and grammY will not call getUpdates,
diff --git a/src/bot/index.ts b/src/bot/index.ts
@@ -49,6 +49,10 @@ import { pinnedMessageManager } from "../pinned/manager.js";
 import { t } from "../i18n/index.js";
 import { processUserPrompt } from "./handlers/prompt.js";
 import { handleVoiceMessage } from "./handlers/voice.js";
+import { downloadTelegramFile, toDataUri } from "./utils/file-download.js";
+import { getModelCapabilities, supportsInput } from "../model/capabilities.js";
+import { getStoredModel } from "../model/manager.js";
+import type { FilePartInput } from "@opencode-ai/sdk/v2";
 
 let botInstance: Bot<Context> | null = null;
 let chatIdInstance: number | null = null;
@@ -695,6 +699,70 @@ export function createBot(): Bot<Context> {
     await handleVoiceMessage(ctx, voicePromptDeps);
   });
 
+  // Photo message handler: forwards the largest photo size, together with any
+  // caption, to OpenCode; falls back to caption-only when the selected model
+  // is not reported as accepting image input.
+  bot.on("message:photo", async (ctx) => {
+    logger.debug(`[Bot] Received photo message, chatId=${ctx.chat.id}`);
+
+    const photoSizes = ctx.message?.photo;
+    if (!photoSizes || photoSizes.length === 0) {
+      return;
+    }
+
+    const caption = ctx.message.caption || "";
+
+    // Record the active bot and chat right before a prompt is dispatched.
+    const rememberActiveChat = (): void => {
+      botInstance = bot;
+      chatIdInstance = ctx.chat.id;
+    };
+
+    try {
+      // The last entry of the photo array is used as the largest size
+      const biggest = photoSizes[photoSizes.length - 1];
+
+      // Ask OpenCode what the currently stored model can accept
+      const model = getStoredModel();
+      const modelCapabilities = await getModelCapabilities(model.providerID, model.modelID);
+
+      if (supportsInput(modelCapabilities, "image")) {
+        // Download photo
+        await ctx.reply(t("bot.photo_downloading"));
+        const downloaded = await downloadTelegramFile(ctx.api, biggest.file_id);
+
+        // Inline the bytes as a JPEG data URI and wrap them in a file part
+        const photoPart: FilePartInput = {
+          type: "file",
+          mime: "image/jpeg",
+          filename: "photo.jpg",
+          url: toDataUri(downloaded.buffer, "image/jpeg"),
+        };
+
+        logger.info(`[Bot] Sending photo (${downloaded.buffer.length} bytes) with prompt`);
+
+        rememberActiveChat();
+
+        // Send via processUserPrompt with file part
+        await processUserPrompt(ctx, caption, { bot, ensureEventSubscription }, [photoPart]);
+        return;
+      }
+
+      // Image input is not available: tell the user, then fall back to the caption alone
+      logger.warn(
+        `[Bot] Model ${model.providerID}/${model.modelID} doesn't support image input`,
+      );
+      await ctx.reply(t("bot.photo_model_no_image"));
+
+      if (caption.trim().length > 0) {
+        rememberActiveChat();
+        await processUserPrompt(ctx, caption, { bot, ensureEventSubscription });
+      }
+    } catch (err) {
+      logger.error("[Bot] Error handling photo message:", err);
+      await ctx.reply(t("bot.photo_download_error"));
+    }
+  });
+
+
   bot.on("message:text", async (ctx) => {
     const text = ctx.message?.text;
     if (!text) {
diff --git a/src/bot/utils/file-download.ts b/src/bot/utils/file-download.ts
@@ -0,0 +1,99 @@
+import type { Api } from "grammy";
+import { config } from "../../config.js";
+import { logger } from "../../utils/logger.js";
+
+// Prefix of the Telegram file-download URL; the bot token and the file path are appended to it.
+const TELEGRAM_FILE_URL_BASE = "https://api.telegram.org/file/bot";
+// Upper bound on the size of a file this module will accept.
+const MAX_FILE_SIZE_BYTES = 20 * 1024 * 1024; // 20MB Telegram limit
+
+/**
+ * Result of downloading a file from Telegram.
+ */
+export interface DownloadedFile {
+  // Raw bytes of the downloaded file
+  buffer: Buffer;
+  // `file_path` reported by Telegram's getFile for this file
+  filePath: string;
+  // Optional MIME type; downloadTelegramFile in this module does not set it
+  mimeType?: string;
+}
+
+/**
+ * Download a file (photo or any other attachment) from Telegram servers.
+ *
+ * The size limit is enforced twice: once against the size reported by getFile
+ * (when Telegram provides it) and once against the bytes actually received,
+ * so a file with an unreported size cannot slip past the limit.
+ *
+ * @param api Grammy API instance
+ * @param fileId Telegram file_id
+ * @returns Downloaded file bytes and the Telegram file path
+ * @throws Error if Telegram returns no file path, the file exceeds the size
+ *   limit, or the HTTP download does not succeed
+ */
+export async function downloadTelegramFile(api: Api, fileId: string): Promise<DownloadedFile> {
+  logger.debug(`[FileDownload] Getting file info for fileId=${fileId}`);
+
+  const file = await api.getFile(fileId);
+
+  if (!file.file_path) {
+    throw new Error("File path not available from Telegram");
+  }
+
+  // Reject early when Telegram already tells us the file is too big
+  if (file.file_size && file.file_size > MAX_FILE_SIZE_BYTES) {
+    const sizeMb = (file.file_size / (1024 * 1024)).toFixed(2);
+    throw new Error(`File too large: ${sizeMb}MB (max 20MB)`);
+  }
+
+  const fileUrl = `${TELEGRAM_FILE_URL_BASE}${config.telegram.token}/${file.file_path}`;
+  // The bot token is part of the URL, so mask it before logging
+  logger.debug(`[FileDownload] Downloading from ${fileUrl.replace(config.telegram.token, "***")}`);
+
+  const fetchOptions: RequestInit & { agent?: unknown } = {};
+
+  // Use proxy if configured
+  // NOTE(review): Node's built-in fetch is configured through `dispatcher`
+  // rather than `agent` — confirm that this proxy setting is actually applied.
+  if (config.telegram.proxyUrl) {
+    const { HttpsProxyAgent } = await import("https-proxy-agent");
+    fetchOptions.agent = new HttpsProxyAgent(config.telegram.proxyUrl);
+  }
+
+  const response = await fetch(fileUrl, fetchOptions);
+
+  if (!response.ok) {
+    throw new Error(`Failed to download file: ${response.status} ${response.statusText}`);
+  }
+
+  const arrayBuffer = await response.arrayBuffer();
+  const buffer = Buffer.from(arrayBuffer);
+
+  // file_size is optional in getFile's response, so re-check the real payload size
+  if (buffer.length > MAX_FILE_SIZE_BYTES) {
+    const sizeMb = (buffer.length / (1024 * 1024)).toFixed(2);
+    throw new Error(`File too large: ${sizeMb}MB (max 20MB)`);
+  }
+
+  logger.debug(`[FileDownload] Downloaded ${buffer.length} bytes`);
+
+  return {
+    buffer,
+    filePath: file.file_path,
+  };
+}
+
+/**
+ * Encode a buffer as a base64 `data:` URI.
+ * @param buffer Bytes to embed
+ * @param mimeType MIME type placed in the URI header (e.g., "image/jpeg")
+ * @returns String of the form `data:<mimeType>;base64,<payload>`
+ */
+export function toDataUri(buffer: Buffer, mimeType: string): string {
+  const header = `data:${mimeType};base64,`;
+  return header + buffer.toString("base64");
+}
+
+/**
+ * Decide whether a file of the given size fits under a limit expressed in KB.
+ * @param fileSize File size in bytes; a missing (or otherwise falsy) value is treated as unknown
+ * @param maxSizeKb Maximum size in KB
+ * @returns true if the size is unknown or does not exceed the limit
+ */
+export function isFileSizeAllowed(fileSize: number | undefined, maxSizeKb: number): boolean {
+  if (fileSize) {
+    // Known size: compare in bytes
+    return fileSize <= maxSizeKb * 1024;
+  }
+
+  // Unknown size: allow here and leave the decision to the download step
+  return true;
+}
+
+/**
+ * Render a byte count as a short human-readable string.
+ *
+ * Values below 1024 are shown as whole bytes ("B"); values below 1024 * 1024
+ * as kilobytes with one decimal ("KB"); everything else as megabytes with
+ * one decimal ("MB").
+ */
+export function formatFileSize(bytes: number): string {
+  const kib = 1024;
+  const mib = kib * kib;
+
+  return bytes < kib
+    ? `${bytes}B`
+    : bytes < mib
+      ? `${(bytes / kib).toFixed(1)}KB`
+      : `${(bytes / mib).toFixed(1)}MB`;
+}
diff --git a/src/i18n/en.ts b/src/i18n/en.ts
@@ -64,6 +64,11 @@ export const en = {
   "bot.prompt_send_error": "Failed to send request to OpenCode.",
   "bot.session_error": "🔴 OpenCode returned an error: {message}",
   "bot.unknown_command": "⚠️ Unknown command: {command}. Use /help to see available commands.",
+  "bot.photo_downloading": "⏳ Downloading photo...",
+  "bot.photo_too_large": "⚠️ Photo is too large (max {maxSizeMb}MB)",
+  "bot.photo_model_no_image": "⚠️ Current model doesn't support image input. Sending text only.",
+  "bot.photo_download_error": "🔴 Failed to download photo",
+  "bot.photo_no_caption": "💡 Tip: Add a caption to describe what you want to do with this photo.",
 
   "status.header_running": "🟢 **OpenCode Server is running**",
   "status.health.healthy": "Healthy",
diff --git a/src/i18n/ru.ts b/src/i18n/ru.ts
@@ -63,6 +63,12 @@ export const ru: I18nDictionary = {
   "bot.prompt_send_error": "Не удалось отправить запрос в OpenCode.",
   "bot.session_error": "🔴 OpenCode вернул ошибку: {message}",
   "bot.unknown_command": "⚠️ Неизвестная команда: {command}. Используйте /help для списка команд.",
+  "bot.photo_downloading": "⏳ Скачиваю фото...",
+  "bot.photo_too_large": "⚠️ Фото слишком большое (макс. {maxSizeMb}МБ)",
+  "bot.photo_model_no_image":
+    "⚠️ Текущая модель не поддерживает изображения. Отправляю только текст.",
+  "bot.photo_download_error": "🔴 Не удалось скачать фото",
+  "bot.photo_no_caption": "💡 Совет: Добавьте подпись, чтобы описать, что делать с этим фото.",
 
   "status.header_running": "🟢 **OpenCode Server запущен**",
   "status.health.healthy": "Healthy",
diff --git a/src/interaction/guard.ts b/src/interaction/guard.ts
@@ -42,6 +42,11 @@ function classifyIncomingInput(ctx: Context): {
     return { inputType: "text" };
   }
 
+  // Photo, voice, audio, and other non-text messages are classified as "other"
+  if (ctx.message?.photo) {
+    return { inputType: "other" };
+  }
+
   return { inputType: "other" };
 }
 
diff --git a/src/model/capabilities.ts b/src/model/capabilities.ts
@@ -0,0 +1,86 @@
+import { opencodeClient } from "../opencode/client.js";
+import { logger } from "../utils/logger.js";
+import type { Model } from "@opencode-ai/sdk/v2";
+
+// Shape of the in-memory capabilities cache: keys are `providerID/modelID` strings.
+// A `null` entry marks a key that was looked up and cached as having no capabilities.
+interface ModelCapabilitiesCache {
+  [key: string]: Model["capabilities"] | null;
+}
+
+// Module-level cache; entries are added by getModelCapabilities.
+const capabilitiesCache: ModelCapabilitiesCache = {};
+
+/**
+ * Look up a model's capabilities through the OpenCode providers API.
+ *
+ * Definitive answers are cached in memory under `providerID/modelID`: either
+ * the capabilities object, or `null` when a successful response does not list
+ * the provider or the model. Transient failures (an error response or a
+ * thrown exception) are deliberately NOT cached, so a single failed request
+ * cannot disable capability detection until the process restarts.
+ *
+ * @param providerID - Provider identifier to search for
+ * @param modelID - Key of the model inside the provider's `models` map
+ * @returns The model's capabilities, or `null` if they could not be determined
+ */
+export async function getModelCapabilities(
+  providerID: string,
+  modelID: string,
+): Promise<Model["capabilities"] | null> {
+  const cacheKey = `${providerID}/${modelID}`;
+
+  const cached = capabilitiesCache[cacheKey];
+  if (cached !== undefined) {
+    logger.debug(`[ModelCapabilities] Cache hit for ${cacheKey}`);
+    return cached;
+  }
+
+  try {
+    logger.debug(`[ModelCapabilities] Fetching capabilities for ${cacheKey}`);
+    const response = await opencodeClient.config.providers();
+
+    if (response.error || !response.data) {
+      // Transient failure: report "unknown" but leave the cache empty so the next call retries
+      logger.error("[ModelCapabilities] API returned error:", response.error);
+      return null;
+    }
+
+    const providers = response.data.providers;
+    const provider = providers.find((p) => p.id === providerID);
+
+    if (!provider) {
+      logger.warn(`[ModelCapabilities] Provider ${providerID} not found`);
+      capabilitiesCache[cacheKey] = null;
+      return null;
+    }
+
+    const model = provider.models[modelID];
+
+    if (!model) {
+      logger.warn(`[ModelCapabilities] Model ${cacheKey} not found in provider`);
+      capabilitiesCache[cacheKey] = null;
+      return null;
+    }
+
+    logger.debug(`[ModelCapabilities] Found capabilities for ${cacheKey}`);
+    capabilitiesCache[cacheKey] = model.capabilities;
+    return model.capabilities;
+  } catch (error) {
+    // Exceptions are treated as transient too, so nothing is cached here
+    logger.error("[ModelCapabilities] Failed to fetch providers:", error);
+    return null;
+  }
+}
+
+/**
+ * Tell whether the given capabilities mark an input kind as supported.
+ *
+ * Missing capabilities count as "not supported"; otherwise the flag for the
+ * requested kind must be exactly `true`.
+ */
+export function supportsInput(
+  capabilities: Model["capabilities"] | null,
+  inputType: "image" | "pdf" | "audio" | "video",
+): boolean {
+  return capabilities ? capabilities.input[inputType] === true : false;
+}
+
+/**
+ * Tell whether the given capabilities have the general `attachment` flag set.
+ *
+ * Missing capabilities count as "not supported"; otherwise the flag must be
+ * exactly `true`.
+ */
+export function supportsAttachment(capabilities: Model["capabilities"] | null): boolean {
+  return capabilities ? capabilities.attachment === true : false;
+}
diff --git a/tests/bot/utils/file-download.test.ts b/tests/bot/utils/file-download.test.ts
diff --git a/tests/interaction/guard.test.ts b/tests/interaction/guard.test.ts
diff --git a/tests/model/capabilities.test.ts b/tests/model/capabilities.test.ts

Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,11 @@ function classifyIncomingInput(ctx: Context): {`
`42`	`42`	`return { inputType: "text" };`
`43`	`43`	`}`
`44`	`44`
	`45`	`+ // Photo, voice, audio, and other non-text messages are classified as "other"`
	`46`	`+ if (ctx.message?.photo) {`
	`47`	`+ return { inputType: "other" };`
	`48`	`+ }`
	`49`	`+`
`45`	`50`	`return { inputType: "other" };`
`46`	`51`	`}`
`47`	`52`