Skip to content

Commit 3733151

Browse files
authored
Merge pull request #15 from Philipp0205/phkurrle-photo-support
feat: add photo support
2 parents 90c4a6f + 554473d commit 3733151

11 files changed

Lines changed: 520 additions & 6 deletions

File tree

PRODUCT.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ Open tasks for upcoming iterations:
139139
- [ ] Add server crash notifications in Telegram
140140
- [ ] Add periodic health checks and optional auto-restart for OpenCode server
141141
- [ ] Improve Telegram-compatible message formatting for richer outputs
142-
- [ ] Support sending files from Telegram to OpenCode (screenshots, documents)
142+
- [x] Support sending photos from Telegram to OpenCode (screenshots, images)
143143
- [ ] Provide a Docker image and basic container deployment guide
144144
- [x] Add voice transcription
145145

src/bot/handlers/prompt.ts

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { Bot, Context } from "grammy";
2+
import type { FilePartInput, TextPartInput } from "@opencode-ai/sdk/v2";
23
import { opencodeClient } from "../../opencode/client.js";
34
import { clearSession, getCurrentSession, setCurrentSession } from "../../session/manager.js";
45
import { ingestSessionInfoForCache } from "../../session/cache-manager.js";
@@ -77,14 +78,19 @@ export interface ProcessPromptDeps {
7778

7879
/**
7980
* Processes a user prompt: ensures project/session, subscribes to events, and sends
80-
* the prompt to OpenCode. Used by both text and voice message handlers.
81+
* the prompt to OpenCode. Used by text, voice, and photo message handlers.
8182
*
83+
* @param ctx - Grammy context
84+
* @param text - Text content of the prompt
85+
* @param deps - Dependencies (bot and event subscription)
86+
* @param fileParts - Optional file parts (for photo/document attachments)
8287
* @returns true if the prompt was dispatched, false if it was blocked/failed early.
8388
*/
8489
export async function processUserPrompt(
8590
ctx: Context,
8691
text: string,
8792
deps: ProcessPromptDeps,
93+
fileParts: FilePartInput[] = [],
8894
): Promise<boolean> {
8995
const { bot, ensureEventSubscription } = deps;
9096

@@ -193,17 +199,36 @@ export async function processUserPrompt(
193199
const currentAgent = getStoredAgent();
194200
const storedModel = getStoredModel();
195201

202+
// Build parts array with text and files
203+
const parts: Array<TextPartInput | FilePartInput> = [];
204+
205+
// Add text part if present
206+
if (text.trim().length > 0) {
207+
parts.push({ type: "text", text });
208+
}
209+
210+
// Add file parts
211+
parts.push(...fileParts);
212+
213+
// If no text and files exist, use a placeholder
214+
if (parts.length === 0 || (parts.length > 0 && parts.every((p) => p.type === "file"))) {
215+
if (fileParts.length > 0) {
216+
// Files without text - add a minimal system prompt
217+
parts.unshift({ type: "text", text: "See attached file" });
218+
}
219+
}
220+
196221
const promptOptions: {
197222
sessionID: string;
198223
directory: string;
199-
parts: Array<{ type: "text"; text: string }>;
224+
parts: Array<TextPartInput | FilePartInput>;
200225
model?: { providerID: string; modelID: string };
201226
agent?: string;
202227
variant?: string;
203228
} = {
204229
sessionID: currentSession.id,
205230
directory: currentSession.directory,
206-
parts: [{ type: "text", text }],
231+
parts,
207232
agent: currentAgent,
208233
};
209234

@@ -228,9 +253,12 @@ export async function processUserPrompt(
228253
modelId: storedModel.modelID || "default",
229254
variant: storedModel.variant || "default",
230255
promptLength: text.length,
256+
fileCount: fileParts.length,
231257
};
232258

233-
logger.info(`[Bot] Calling session.prompt (fire-and-forget) with agent=${currentAgent}...`);
259+
logger.info(
260+
`[Bot] Calling session.prompt (fire-and-forget) with agent=${currentAgent}, fileCount=${fileParts.length}...`,
261+
);
234262

235263
// CRITICAL: DO NOT wait for session.prompt to complete.
236264
// If we wait, the handler will not finish and grammY will not call getUpdates,

src/bot/index.ts

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ import { pinnedMessageManager } from "../pinned/manager.js";
4949
import { t } from "../i18n/index.js";
5050
import { processUserPrompt } from "./handlers/prompt.js";
5151
import { handleVoiceMessage } from "./handlers/voice.js";
52+
import { downloadTelegramFile, toDataUri } from "./utils/file-download.js";
53+
import { getModelCapabilities, supportsInput } from "../model/capabilities.js";
54+
import { getStoredModel } from "../model/manager.js";
55+
import type { FilePartInput } from "@opencode-ai/sdk/v2";
5256

5357
let botInstance: Bot<Context> | null = null;
5458
let chatIdInstance: number | null = null;
@@ -695,6 +699,70 @@ export function createBot(): Bot<Context> {
695699
await handleVoiceMessage(ctx, voicePromptDeps);
696700
});
697701

702+
// Photo message handler: forwards an incoming photo (and its caption) to OpenCode.
bot.on("message:photo", async (ctx) => {
  logger.debug(`[Bot] Received photo message, chatId=${ctx.chat.id}`);

  const photoSizes = ctx.message?.photo;
  if (!photoSizes?.length) {
    return;
  }

  const caption = ctx.message.caption || "";

  // Remembers the active bot/chat, then hands the caption (plus any file parts)
  // to the shared prompt pipeline. Leaving `fileParts` undefined lets
  // processUserPrompt fall back to its own default.
  const forwardPrompt = async (fileParts?: FilePartInput[]): Promise<void> => {
    botInstance = bot;
    chatIdInstance = ctx.chat.id;
    await processUserPrompt(ctx, caption, { bot, ensureEventSubscription }, fileParts);
  };

  try {
    // Ask OpenCode whether the currently selected model accepts images.
    const { providerID, modelID } = getStoredModel();
    const capabilities = await getModelCapabilities(providerID, modelID);

    if (!supportsInput(capabilities, "image")) {
      logger.warn(`[Bot] Model ${providerID}/${modelID} doesn't support image input`);
      await ctx.reply(t("bot.photo_model_no_image"));

      // The image is dropped; a non-blank caption is still sent as a text-only prompt.
      if (caption.trim().length > 0) {
        await forwardPrompt();
      }
      return;
    }

    await ctx.reply(t("bot.photo_downloading"));

    // The last entry of the photo array is used as the largest available size.
    const largest = photoSizes[photoSizes.length - 1];
    const { buffer } = await downloadTelegramFile(ctx.api, largest.file_id);

    // The payload is embedded as a JPEG data URI (Telegram always converts photos to JPEG).
    const filePart: FilePartInput = {
      type: "file",
      mime: "image/jpeg",
      filename: "photo.jpg",
      url: toDataUri(buffer, "image/jpeg"),
    };

    logger.info(`[Bot] Sending photo (${buffer.length} bytes) with prompt`);

    await forwardPrompt([filePart]);
  } catch (err) {
    // Any failure in the steps above (capability lookup, download, dispatch)
    // is reported to the user with the same generic message.
    logger.error("[Bot] Error handling photo message:", err);
    await ctx.reply(t("bot.photo_download_error"));
  }
});
765+
698766
bot.on("message:text", async (ctx) => {
699767
const text = ctx.message?.text;
700768
if (!text) {

src/bot/utils/file-download.ts

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import type { Api } from "grammy";
2+
import { config } from "../../config.js";
3+
import { logger } from "../../utils/logger.js";
4+
5+
/** URL prefix for Telegram file downloads; the bot token and file path are appended to it. */
const TELEGRAM_FILE_URL_BASE = "https://api.telegram.org/file/bot";

/** Largest file this module will download, in bytes (20MB Telegram limit). */
const MAX_FILE_SIZE_BYTES = 20 * 1024 ** 2;
7+
8+
/**
 * In-memory result of downloading a file from Telegram.
 */
export interface DownloadedFile {
  /** Raw bytes of the downloaded file. */
  buffer: Buffer;

  /** The `file_path` value Telegram returned for the file. */
  filePath: string;

  /** MIME type of the file, when known (optional). */
  mimeType?: string;
}
13+
14+
/**
 * Downloads a file from Telegram's servers into memory.
 *
 * The `fileId` is first resolved to a server-side path via `getFile`, then the
 * bytes are fetched over HTTPS. The size limit (MAX_FILE_SIZE_BYTES) is enforced
 * at every point where a size is known: the size reported by `getFile`, the
 * response's Content-Length header, and finally the number of bytes actually
 * received. The last two checks cover the case where Telegram does not report
 * `file_size`.
 *
 * @param api - Grammy API instance used to resolve the file path
 * @param fileId - Telegram file_id
 * @returns The downloaded bytes and the Telegram file path (`mimeType` is not set)
 * @throws Error if Telegram returns no file path, the file exceeds the size
 *   limit, or the HTTP download responds with a non-2xx status
 */
export async function downloadTelegramFile(api: Api, fileId: string): Promise<DownloadedFile> {
  const assertWithinLimit = (sizeBytes: number): void => {
    if (sizeBytes > MAX_FILE_SIZE_BYTES) {
      const sizeMb = (sizeBytes / (1024 * 1024)).toFixed(2);
      throw new Error(`File too large: ${sizeMb}MB (max 20MB)`);
    }
  };

  logger.debug(`[FileDownload] Getting file info for fileId=${fileId}`);

  const file = await api.getFile(fileId);

  if (!file.file_path) {
    throw new Error("File path not available from Telegram");
  }

  // Cheap pre-check; `file_size` is optional, so it may be skipped entirely.
  if (file.file_size) {
    assertWithinLimit(file.file_size);
  }

  const fileUrl = `${TELEGRAM_FILE_URL_BASE}${config.telegram.token}/${file.file_path}`;
  // Build the logged URL without the token instead of masking it afterwards,
  // so the secret can never reach the log output.
  logger.debug(`[FileDownload] Downloading from ${TELEGRAM_FILE_URL_BASE}***/${file.file_path}`);

  const fetchOptions: RequestInit & { agent?: unknown } = {};

  // Use proxy if configured
  // NOTE(review): Node's built-in fetch appears to ignore `agent` and expects an
  // undici `dispatcher` instead — confirm the proxy is actually applied in the
  // target runtime.
  if (config.telegram.proxyUrl) {
    const { HttpsProxyAgent } = await import("https-proxy-agent");
    fetchOptions.agent = new HttpsProxyAgent(config.telegram.proxyUrl);
  }

  const response = await fetch(fileUrl, fetchOptions);

  if (!response.ok) {
    throw new Error(`Failed to download file: ${response.status} ${response.statusText}`);
  }

  // Reject oversized bodies before buffering them when the server declares a length.
  const declaredLength = Number(response.headers.get("content-length"));
  if (Number.isFinite(declaredLength)) {
    assertWithinLimit(declaredLength);
  }

  const arrayBuffer = await response.arrayBuffer();
  const buffer = Buffer.from(arrayBuffer);

  // Final check on what was actually received (covers a missing or wrong declared size).
  assertWithinLimit(buffer.length);

  logger.debug(`[FileDownload] Downloaded ${buffer.length} bytes`);

  return {
    buffer,
    filePath: file.file_path,
  };
}
61+
62+
/**
 * Encodes a buffer as a base64 `data:` URI.
 *
 * @param buffer - Bytes to embed
 * @param mimeType - MIME type placed in the URI header (e.g., "image/jpeg")
 * @returns A string of the form `data:<mimeType>;base64,<payload>`
 */
export function toDataUri(buffer: Buffer, mimeType: string): string {
  const header = `data:${mimeType};base64,`;
  return header + buffer.toString("base64");
}
72+
73+
/**
 * Tells whether a file size fits under a limit expressed in kilobytes.
 *
 * @param fileSize - Size in bytes; a missing (or zero) size is treated as unknown
 * @param maxSizeKb - Maximum allowed size in KB (1 KB = 1024 bytes)
 * @returns true when the size is unknown or does not exceed the limit
 */
export function isFileSizeAllowed(fileSize: number | undefined, maxSizeKb: number): boolean {
  // An unknown size is let through; the caller is expected to re-check on download.
  return !fileSize || fileSize <= maxSizeKb * 1024;
}
87+
88+
/**
 * Formats a byte count as a short human-readable string.
 *
 * Values below 1024 are shown in bytes, values below 1 MiB in KB with one
 * decimal, and everything else in MB with one decimal. A value that would
 * round up to "1024.0KB" is promoted to MB, so the output never shows 1024 of
 * a smaller unit.
 *
 * @param bytes - Size in bytes
 * @returns Formatted size, e.g. "512B", "1.5KB", "3.2MB"
 */
export function formatFileSize(bytes: number): string {
  const bytesPerKb = 1024;
  const bytesPerMb = bytesPerKb * bytesPerKb;

  if (bytes < bytesPerKb) {
    return `${bytes}B`;
  }

  if (bytes < bytesPerMb) {
    const kbText = (bytes / bytesPerKb).toFixed(1);
    // Just under 1 MiB rounds to "1024.0"; fall through to MB in that case.
    if (Number(kbText) < bytesPerKb) {
      return `${kbText}KB`;
    }
  }

  return `${(bytes / bytesPerMb).toFixed(1)}MB`;
}

src/i18n/en.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ export const en = {
6464
"bot.prompt_send_error": "Failed to send request to OpenCode.",
6565
"bot.session_error": "🔴 OpenCode returned an error: {message}",
6666
"bot.unknown_command": "⚠️ Unknown command: {command}. Use /help to see available commands.",
67+
"bot.photo_downloading": "⏳ Downloading photo...",
68+
"bot.photo_too_large": "⚠️ Photo is too large (max {maxSizeMb}MB)",
69+
"bot.photo_model_no_image": "⚠️ Current model doesn't support image input. Sending text only.",
70+
"bot.photo_download_error": "🔴 Failed to download photo",
71+
"bot.photo_no_caption": "💡 Tip: Add a caption to describe what you want to do with this photo.",
6772

6873
"status.header_running": "🟢 **OpenCode Server is running**",
6974
"status.health.healthy": "Healthy",

src/i18n/ru.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,12 @@ export const ru: I18nDictionary = {
6363
"bot.prompt_send_error": "Не удалось отправить запрос в OpenCode.",
6464
"bot.session_error": "🔴 OpenCode вернул ошибку: {message}",
6565
"bot.unknown_command": "⚠️ Неизвестная команда: {command}. Используйте /help для списка команд.",
66+
"bot.photo_downloading": "⏳ Скачиваю фото...",
67+
"bot.photo_too_large": "⚠️ Фото слишком большое (макс. {maxSizeMb}МБ)",
68+
"bot.photo_model_no_image":
69+
"⚠️ Текущая модель не поддерживает изображения. Отправляю только текст.",
70+
"bot.photo_download_error": "🔴 Не удалось скачать фото",
71+
"bot.photo_no_caption": "💡 Совет: Добавьте подпись, чтобы описать, что делать с этим фото.",
6672

6773
"status.header_running": "🟢 **OpenCode Server запущен**",
6874
"status.health.healthy": "Healthy",

src/interaction/guard.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ function classifyIncomingInput(ctx: Context): {
4242
return { inputType: "text" };
4343
}
4444

45+
// Photo, voice, audio, and other non-text messages are classified as "other"
46+
if (ctx.message?.photo) {
47+
return { inputType: "other" };
48+
}
49+
4550
return { inputType: "other" };
4651
}
4752

src/model/capabilities.ts

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import { opencodeClient } from "../opencode/client.js";
2+
import { logger } from "../utils/logger.js";
3+
import type { Model } from "@opencode-ai/sdk/v2";
4+
5+
/**
 * Lookup table from a model key to its capabilities.
 * A `null` entry records that a lookup yielded no capabilities.
 */
type ModelCapabilitiesCache = Record<string, Model["capabilities"] | null>;

/** In-memory cache of capability lookups. */
const capabilitiesCache: ModelCapabilitiesCache = {};
10+
11+
/**
 * Looks up a model's capabilities through the OpenCode providers API.
 *
 * Definitive answers are cached in memory under `"<providerID>/<modelID>"`:
 * the capabilities themselves, or `null` when the provider or model is not
 * present in the provider list. Transient failures (an API error response or a
 * thrown exception) are NOT cached, so a single failed request does not
 * permanently mark the model as having no capabilities; the next call retries.
 *
 * @param providerID - Provider identifier to search for
 * @param modelID - Model key within that provider's `models` map
 * @returns The model's capabilities, or `null` if they could not be determined
 */
export async function getModelCapabilities(
  providerID: string,
  modelID: string,
): Promise<Model["capabilities"] | null> {
  const cacheKey = `${providerID}/${modelID}`;

  const cached = capabilitiesCache[cacheKey];
  if (cached !== undefined) {
    logger.debug(`[ModelCapabilities] Cache hit for ${cacheKey}`);
    return cached;
  }

  try {
    logger.debug(`[ModelCapabilities] Fetching capabilities for ${cacheKey}`);
    const response = await opencodeClient.config.providers();

    if (response.error || !response.data) {
      // Transient failure: report it but leave the cache untouched so we retry later.
      logger.error("[ModelCapabilities] API returned error:", response.error);
      return null;
    }

    const provider = response.data.providers.find((p) => p.id === providerID);

    if (!provider) {
      logger.warn(`[ModelCapabilities] Provider ${providerID} not found`);
      capabilitiesCache[cacheKey] = null;
      return null;
    }

    const model = provider.models[modelID];

    if (!model) {
      logger.warn(`[ModelCapabilities] Model ${cacheKey} not found in provider`);
      capabilitiesCache[cacheKey] = null;
      return null;
    }

    logger.debug(`[ModelCapabilities] Found capabilities for ${cacheKey}`);
    capabilitiesCache[cacheKey] = model.capabilities;
    return model.capabilities;
  } catch (error) {
    // Transient failure (e.g. the request itself threw): do not cache.
    logger.error("[ModelCapabilities] Failed to fetch providers:", error);
    return null;
  }
}
62+
63+
/**
 * Reports whether the given capabilities explicitly enable an input modality.
 *
 * @param capabilities - Model capabilities, or `null` when unavailable
 * @param inputType - Input modality to check
 * @returns true only when `capabilities.input[inputType]` is exactly `true`;
 *   false for `null` capabilities
 */
export function supportsInput(
  capabilities: Model["capabilities"] | null,
  inputType: "image" | "pdf" | "audio" | "video",
): boolean {
  return capabilities?.input[inputType] === true;
}
76+
77+
/**
 * Reports whether the given capabilities explicitly enable attachments.
 *
 * @param capabilities - Model capabilities, or `null` when unavailable
 * @returns true only when `capabilities.attachment` is exactly `true`;
 *   false for `null` capabilities
 */
export function supportsAttachment(capabilities: Model["capabilities"] | null): boolean {
  return capabilities?.attachment === true;
}

0 commit comments

Comments
 (0)