import { generateObject } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { z } from 'zod';
import { imageToBase64, type ExtractedFrame } from './frame-extractor.ts';
import type { ProductFrame } from './types.ts';

// Pass 1: quick filter — discard frames that clearly have no product
const FilterSchema = z.object({
  keep: z.boolean(),
  reason: z.enum([
    'product_visible',
    'content_only',
    'hands_only',
    'blur',
    'transition',
    'background_only',
  ]),
});

// Pass 2: comparative ranking across all candidates
const RankingSchema = z.object({
  bestFrameIndex: z.number().int(),
  description: z.string(),
  reasoning: z.string(),
  // normalized 0-1 relative to image dimensions: [x1, y1, x2, y2]
  boundingBox: z.tuple([z.number(), z.number(), z.number(), z.number()]),
});

const FILTER_PROMPT = `You are filtering frames from a TikTok/Douyin ecommerce product video.
Keep a frame (keep=true) ONLY if the HERO PRODUCT (the main item being sold) is at least partially visible as a recognizable object.
Discard (keep=false) if: only hands/texture/contents visible, motion blur, black/transition frame, or no product at all.
reason options: product_visible | content_only | hands_only | blur | transition | background_only`;

const RANKING_PROMPT = (count: number) =>
  `You are selecting the single best product image from ${count} video frames for ecommerce image search.
The frames are numbered 0 to ${count - 1} in the order shown.

Pick the ONE frame where the HERO PRODUCT is:
1. Cleanest — fewest distractions, no hands blocking it, no clutter in foreground
2. Most complete — full product silhouette visible, no edges cropped
3. Most isolated — product stands out from background clearly
4. Empty/minimal load preferred — a product without contents (e.g. an empty rack) beats one stuffed with items if both show the full structure equally

Return:
- bestFrameIndex: 0-based index of chosen frame
- description: concise search query under 12 words (product type + material + color + key feature)
- reasoning: one sentence explaining why this frame was chosen
- boundingBox: tight bounding box of the HERO PRODUCT ONLY in the chosen frame as [x1, y1, x2, y2] normalized 0.0–1.0 (top-left origin). Exclude hands, background, and unrelated objects. The product is assumed to be near the center.`;

function createVisionModel() {
  const apiKey = process.env.VISION_API_KEY;
  if (!apiKey) throw new Error('VISION_API_KEY not set');
  const provider = createOpenAI({
    apiKey,
    baseURL: process.env.VISION_API_BASE,
  });
  return provider(process.env.VISION_MODEL ?? 'gpt-4o-mini');
}
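// Example environment, assuming an OpenAI-compatible endpoint (values are
// illustrative, not part of this module). VISION_API_BASE and VISION_MODEL
// are optional; the provider default and 'gpt-4o-mini' apply when unset.
//   VISION_API_KEY=sk-...
//   VISION_API_BASE=https://api.openai.com/v1
//   VISION_MODEL=gpt-4o-mini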
// Pass 1 worker: classify a single frame as keep/discard.
async function filterFrame(
  frame: ExtractedFrame,
  model: ReturnType<typeof createVisionModel>,
): Promise<boolean> {
  const base64Image = imageToBase64(frame.imagePath);
  const { object } = await generateObject({
    model,
    schema: FilterSchema,
    messages: [{
      role: 'user',
      content: [
        { type: 'image', image: `data:image/jpeg;base64,${base64Image}` },
        { type: 'text', text: FILTER_PROMPT },
      ],
    }],
  });
  return object.keep;
}

// Pass 2 worker: one comparative call with every surviving frame in context.
async function rankCandidates(
  candidates: ExtractedFrame[],
  model: ReturnType<typeof createVisionModel>,
): Promise<{
  bestFrame: ExtractedFrame;
  description: string;
  reasoning: string;
  boundingBox: [number, number, number, number];
}> {
  const imageContent = candidates.map((f) => ({
    type: 'image' as const,
    image: `data:image/jpeg;base64,${imageToBase64(f.imagePath)}`,
  }));
  const { object } = await generateObject({
    model,
    schema: RankingSchema,
    mode: 'json',
    messages: [{
      role: 'user',
      content: [
        ...imageContent,
        { type: 'text', text: RANKING_PROMPT(candidates.length) },
      ],
    }],
  });
  // Clamp the model's index in case it returns an out-of-range value.
  const idx = Math.max(0, Math.min(object.bestFrameIndex, candidates.length - 1));
  return {
    bestFrame: candidates[idx],
    description: object.description,
    reasoning: object.reasoning,
    boundingBox: object.boundingBox,
  };
}

export async function cropProduct(
  imagePath: string,
  boundingBox: [number, number, number, number],
  outputPath: string,
  paddingFactor = 0.05,
): Promise<string> {
  const sharp = (await import('sharp')).default;
  const meta = await sharp(imagePath).metadata();
  const W = meta.width!;
  const H = meta.height!;
  let [x1, y1, x2, y2] = boundingBox;
  // Add padding around the box, clamped to the normalized image bounds.
  const pw = (x2 - x1) * paddingFactor;
  const ph = (y2 - y1) * paddingFactor;
  x1 = Math.max(0, x1 - pw);
  y1 = Math.max(0, y1 - ph);
  x2 = Math.min(1, x2 + pw);
  y2 = Math.min(1, y2 + ph);
  const left = Math.round(x1 * W);
  const top = Math.round(y1 * H);
  // Keep the crop at least 1px and inside the image so sharp's extract() does not throw.
  const width = Math.max(1, Math.min(W - left, Math.round((x2 - x1) * W)));
  const height = Math.max(1, Math.min(H - top, Math.round((y2 - y1) * H)));
  await sharp(imagePath)
    .extract({ left, top, width, height })
    .jpeg({ quality: 95 })
    .toFile(outputPath);
  return outputPath;
}

export async function detectProductFrames(
  frames: ExtractedFrame[],
  minConfidence: number,
  concurrency: number = 5,
): Promise<ProductFrame[]> {
  const model = createVisionModel();

  // Pass 1: parallel filter — discard junk frames. A failed call counts as discard.
  const keepFlags: boolean[] = [];
  for (let i = 0; i < frames.length; i += concurrency) {
    const chunk = frames.slice(i, i + concurrency);
    const flags = await Promise.all(
      chunk.map((f) => filterFrame(f, model).catch(() => false)),
    );
    keepFlags.push(...flags);
  }
  const candidates = frames.filter((_, i) => keepFlags[i]);
  if (candidates.length === 0) return [];

  // Pass 2: single comparative call — model sees all candidates at once
  const { bestFrame, description, reasoning, boundingBox } =
    await rankCandidates(candidates, model);

  const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg');
  await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);

  return [{
    frameIndex: bestFrame.frameIndex,
    timestampSeconds: bestFrame.timestampSeconds,
    imagePath: bestFrame.imagePath,
    croppedImagePath: croppedPath,
    // Fixed confidence: the comparative pass picks one winner rather than
    // scoring frames, so minConfidence is not applied here.
    confidence: 0.95,
    description,
    boundingHint: reasoning,
  }];
}
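// Usage sketch (assumes frames were produced upstream, e.g. by a hypothetical
// extractFrames() helper in frame-extractor.ts; adapt to the real entry point):
//
//   const frames = await extractFrames('video.mp4');
//   const [best] = await detectProductFrames(frames, 0.8);
//   if (best) console.log(best.description, '->', best.croppedImagePath);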