import { generateObject } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { z } from 'zod';
import type { ExtractedFrame } from './frame-extractor.ts';
import type { ProductFrame } from './types.ts';
import { imageToBase64 } from './frame-extractor.ts';
import type { VisionConfig } from './index.ts';

/** Bounding box as `[x1, y1, x2, y2]`, normalized 0-1 relative to image dimensions. */
type BoundingBox = [number, number, number, number];

// Pass 1: quick filter — discard frames that clearly have no product
const FilterSchema = z.object({
  keep: z.boolean(),
  reason: z.enum([
    'product_visible',
    'content_only',
    'hands_only',
    'blur',
    'transition',
    'background_only',
  ]),
});

// Pass 2: comparative ranking across all candidates
const RankingSchema = z.object({
  bestFrameIndex: z.number().int(),
  description: z.string(),
  reasoning: z.string(),
  // normalized 0-1 relative to image dimensions: [x1, y1, x2, y2]
  boundingBox: z.tuple([z.number(), z.number(), z.number(), z.number()]),
});

/** Prompt sent with each single frame during the Pass 1 keep/discard filter. */
const FILTER_PROMPT = [
  'You are filtering frames from a TikTok/Douyin ecommerce product video.',
  'Keep a frame (keep=true) ONLY if the HERO PRODUCT (the main item being sold) is at least partially visible as a recognizable object.',
  'Discard (keep=false) if: only hands/texture/contents visible, motion blur, black/transition frame, or no product at all.',
  'reason options: product_visible | content_only | hands_only | blur | transition | background_only',
].join(' ');

/**
 * Builds the Pass 2 prompt that asks the model to choose one frame out of `count`.
 *
 * @param count Number of frames attached to the request; used to state the valid index range.
 */
const RANKING_PROMPT = (count: number) =>
  [
    `You are selecting the single best product frame from ${count} video frames for ecommerce search.`,
    `Frames are numbered 0 to ${count - 1} in order shown.`,
    'IMPORTANT: You MUST pick ONE frame — even if product visibility is imperfect or no frame looks ideal. Always make your best guess.',
    'Pick the frame where the MAIN SELLING PRODUCT is:',
    '1. Most recognizable — clearest view of the item being sold',
    '2. Most complete — full product silhouette visible, not cropped at edges',
    '3. Cleanest — minimal obstruction (hands, clutter, motion blur, labels)',
    '4. Best lit and in focus',
    'Return:',
    '- bestFrameIndex: 0-based index',
    '- description: concise search query under 12 words (product type + material + color + key features), in Chinese',
    '- reasoning: one sentence explaining choice',
    '- boundingBox: tight box of the PRODUCT ONLY as [x1, y1, x2, y2] normalized 0.0–1.0, top-left origin. Exclude hands, background, and unrelated objects. The product is near the center of the frame.',
  ].join(' ');

/** Creates the model handle from the configured API key, base URL and model name. */
function createVisionModel(config: VisionConfig) {
  const provider = createOpenAI({ apiKey: config.apiKey, baseURL: config.baseURL });
  return provider(config.model);
}

/** Model handle type shared by the helpers below. */
type VisionModel = ReturnType<typeof createVisionModel>;

/**
 * Pass 1: asks the model whether a single frame shows the product.
 *
 * @returns The model's `keep` flag. The accompanying `reason` is not returned.
 */
async function filterFrame(frame: ExtractedFrame, model: VisionModel): Promise<boolean> {
  // imageToBase64 is defined in ./frame-extractor.ts; presumably returns the
  // file's base64 payload, since it is embedded in a data URL here.
  const base64Image = imageToBase64(frame.imagePath);
  const { object } = await generateObject({
    model,
    schema: FilterSchema,
    messages: [
      {
        role: 'user',
        content: [
          { type: 'image', image: `data:image/jpeg;base64,${base64Image}` },
          { type: 'text', text: FILTER_PROMPT },
        ],
      },
    ],
  });
  return object.keep;
}

/**
 * Pass 2: sends every candidate in one request and lets the model pick the best.
 *
 * @param candidates Frames to compare; must not be empty.
 * @returns The chosen frame plus the model's description, reasoning and bounding box.
 * @throws If `candidates` is empty, or if the model call fails.
 */
async function rankCandidates(
  candidates: ExtractedFrame[],
  model: VisionModel,
): Promise<{
  bestFrame: ExtractedFrame;
  description: string;
  reasoning: string;
  boundingBox: BoundingBox;
}> {
  if (candidates.length === 0) {
    throw new Error('rankCandidates requires at least one candidate frame');
  }

  const imageContent = candidates.map((f) => ({
    type: 'image' as const,
    image: `data:image/jpeg;base64,${imageToBase64(f.imagePath)}`,
  }));

  const { object } = await generateObject({
    model,
    schema: RankingSchema,
    mode: 'json',
    messages: [
      {
        role: 'user',
        content: [...imageContent, { type: 'text', text: RANKING_PROMPT(candidates.length) }],
      },
    ],
  });

  // The schema only guarantees an integer, so clamp it into the valid range.
  const idx = Math.max(0, Math.min(object.bestFrameIndex, candidates.length - 1));
  return {
    bestFrame: candidates[idx],
    description: object.description,
    reasoning: object.reasoning,
    boundingBox: object.boundingBox,
  };
}

/**
 * Crops an image to a normalized bounding box and writes the result as a JPEG.
 *
 * The box is reordered so that x1 <= x2 and y1 <= y2, clamped to [0, 1], then
 * grown on every side by `paddingFactor` times its own width/height.
 *
 * @param imagePath Source image to read.
 * @param boundingBox `[x1, y1, x2, y2]`, normalized 0-1.
 * @param outputPath Destination file for the cropped JPEG.
 * @param paddingFactor Fraction of the box size added as margin on each side.
 * @returns `outputPath`.
 * @throws If the image dimensions cannot be read, a coordinate is not finite,
 *   or the box is narrower or shorter than 0.5% of the image after normalization.
 */
export async function cropProduct(
  imagePath: string,
  boundingBox: [number, number, number, number],
  outputPath: string,
  paddingFactor = 0.05,
): Promise<string> {
  const sharp = (await import('sharp')).default;
  const meta = await sharp(imagePath).metadata();
  const W = meta.width;
  const H = meta.height;
  if (!W || !H) {
    throw new Error('unable to read image dimensions');
  }

  let [x1, y1, x2, y2] = boundingBox;
  if (![x1, y1, x2, y2].every((v) => Number.isFinite(v))) {
    // NaN would slip through the comparisons below and only fail inside sharp.
    throw new Error('bounding box contains non-finite coordinates');
  }

  // Normalize coords: ensure x1 <= x2 and y1 <= y2
  if (x1 > x2) [x1, x2] = [x2, x1];
  if (y1 > y2) [y1, y2] = [y2, y1];

  // Clamp to [0, 1]
  const clamp01 = (v: number) => Math.max(0, Math.min(1, v));
  x1 = clamp01(x1);
  y1 = clamp01(y1);
  x2 = clamp01(x2);
  y2 = clamp01(y2);

  // Add padding
  const pw = (x2 - x1) * paddingFactor;
  const ph = (y2 - y1) * paddingFactor;
  x1 = Math.max(0, x1 - pw);
  y1 = Math.max(0, y1 - ph);
  x2 = Math.min(1, x2 + pw);
  y2 = Math.min(1, y2 + ph);

  // Validate minimum area
  if (x2 - x1 < 0.005 || y2 - y1 < 0.005) {
    throw new Error('bounding box too small after normalization');
  }

  // Offset and size are rounded independently, so their sum can overshoot the
  // image by a pixel (e.g. W=3, x1=0.5, x2=1 gives left=2, width=2). Keep the
  // rectangle inside the image and at least one pixel in each dimension.
  const left = Math.min(W - 1, Math.round(x1 * W));
  const top = Math.min(H - 1, Math.round(y1 * H));
  const width = Math.max(1, Math.min(W - left, Math.round((x2 - x1) * W)));
  const height = Math.max(1, Math.min(H - top, Math.round((y2 - y1) * H)));

  await sharp(imagePath)
    .extract({ left, top, width, height })
    .jpeg({ quality: 95 })
    .toFile(outputPath);
  return outputPath;
}

/**
 * Runs async task factories with at most `limit` in flight at a time.
 *
 * @param tasks Zero-argument functions, each starting one unit of work.
 * @param limit Maximum simultaneous tasks; values below 1 (or NaN) are treated as 1.
 * @returns Results in the same order as `tasks`.
 */
async function withConcurrency<T>(tasks: (() => Promise<T>)[], limit: number): Promise<T[]> {
  const results: T[] = new Array(tasks.length);
  let next = 0;

  async function worker(): Promise<void> {
    while (next < tasks.length) {
      // Claim an index before awaiting so no two workers run the same task.
      const i = next++;
      results[i] = await tasks[i]();
    }
  }

  // With zero workers nothing would run and the result array would stay empty.
  const workerCount = Math.min(tasks.length, limit >= 1 ? Math.floor(limit) : 1);
  await Promise.all(Array.from({ length: workerCount }, worker));
  return results;
}

// ── Frame quality pre-filtering ──────────────────────────────────────

/** Grayscale statistics for one frame plus the pass/fail verdict derived from them. */
interface FrameQuality {
  valid: boolean;
  meanBrightness: number;
  variance: number;
}

/**
 * Computes the mean and variance of a frame's grayscale pixel values.
 *
 * A frame is `valid` when its mean is strictly between 15 and 240 and its
 * variance exceeds 50.
 */
async function assessFrameQuality(imagePath: string): Promise<FrameQuality> {
  const sharp = (await import('sharp')).default;
  const pixels = await sharp(imagePath).grayscale().raw().toBuffer();

  let sum = 0;
  let sumSq = 0;
  for (let i = 0; i < pixels.length; i++) {
    sum += pixels[i];
    sumSq += pixels[i] * pixels[i];
  }
  const mean = sum / pixels.length;
  const variance = sumSq / pixels.length - mean * mean;

  // Skip near-black, near-white, or very low variance (blurry/blank/transition)
  const valid = mean > 15 && mean < 240 && variance > 50;
  return { valid, meanBrightness: mean, variance };
}

/**
 * Drops frames that fail `assessFrameQuality`.
 *
 * A frame whose assessment throws is kept. If every frame is rejected, the
 * original list is returned unchanged so callers always have something to rank.
 */
async function filterQualityFrames(frames: ExtractedFrame[]): Promise<ExtractedFrame[]> {
  const results = await Promise.all(
    frames.map(async (frame) => {
      try {
        const q = await assessFrameQuality(frame.imagePath);
        return { frame, valid: q.valid };
      } catch {
        // Unreadable frame: keep it rather than discard on an analysis failure.
        return { frame, valid: true };
      }
    }),
  );
  const valid = results.filter((r) => r.valid).map((r) => r.frame);
  return valid.length > 0 ? valid : frames;
}

/**
 * Checks that a box lies within [0, 1], is correctly ordered, and covers more
 * than 0.5% of the image area.
 */
function isValidBoundingBox(bbox: [number, number, number, number]): boolean {
  const [x1, y1, x2, y2] = bbox;
  return (
    x1 >= 0 && x1 <= 1 &&
    y1 >= 0 && y1 <= 1 &&
    x2 >= 0 && x2 <= 1 &&
    y2 >= 0 && y2 <= 1 &&
    x1 < x2 &&
    y1 < y2 &&
    (x2 - x1) * (y2 - y1) > 0.005
  );
}

/**
 * Derives the output path for a frame's cropped copy.
 *
 * A trailing `.jpg` becomes `_cropped.jpg`. Any other path gets `_cropped.jpg`
 * appended, so the crop is never written over its own input file.
 */
function croppedPathFor(imagePath: string): string {
  const derived = imagePath.replace(/\.jpg$/, '_cropped.jpg');
  return derived === imagePath ? `${imagePath}_cropped.jpg` : derived;
}

/**
 * Ranks the candidates, then crops the winning frame to the model's bounding box.
 *
 * @returns The winning frame with confidence 0.95, or `undefined` when the
 *   model's bounding box fails `isValidBoundingBox`. `croppedImagePath` is
 *   present only when the crop file was actually written.
 * @throws Whatever `rankCandidates` throws; cropping errors are swallowed.
 */
async function rankAndCrop(
  candidates: ExtractedFrame[],
  model: VisionModel,
): Promise<ProductFrame | undefined> {
  const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(
    candidates,
    model,
  );
  if (!isValidBoundingBox(boundingBox)) return undefined;

  let croppedImagePath: string | undefined;
  try {
    croppedImagePath = await cropProduct(
      bestFrame.imagePath,
      boundingBox,
      croppedPathFor(bestFrame.imagePath),
    );
  } catch {
    // cropping is optional — keep original frame
  }

  return {
    frameIndex: bestFrame.frameIndex,
    timestampSeconds: bestFrame.timestampSeconds,
    imagePath: bestFrame.imagePath,
    ...(croppedImagePath !== undefined ? { croppedImagePath } : {}),
    confidence: 0.95,
    description,
    boundingHint: reasoning,
  };
}

/**
 * Picks one best frame, falling back to a local heuristic if the model is unusable.
 *
 * Skips the Pass 1 filter entirely — ranks all frames and always returns a
 * frame for non-empty input. Evenly samples down to `maxCandidates` when there
 * are too many frames.
 *
 * @param frames Frames to choose from.
 * @param visionConfig Settings used to create the model handle.
 * @param maxCandidates Upper bound on frames sent for ranking; values below 1 are treated as 1.
 * @returns `null` for empty input. Otherwise the model's pick (confidence 0.95),
 *   or the highest-variance candidate (confidence 0.5) when ranking throws or
 *   returns an invalid bounding box.
 */
export async function detectBestFrame(
  frames: ExtractedFrame[],
  visionConfig: VisionConfig,
  maxCandidates: number = 20,
): Promise<ProductFrame | null> {
  if (frames.length === 0) return null;

  // 1. Filter out obviously bad frames (black, white, blurry)
  let candidates = await filterQualityFrames(frames);

  // 2. Sample if too many. The cap is forced to at least 1 so the candidate
  //    list can never become empty.
  const cap = maxCandidates >= 1 ? Math.floor(maxCandidates) : 1;
  if (candidates.length > cap) {
    const pool = candidates;
    const step = pool.length / cap;
    candidates = Array.from({ length: cap }, (_, i) => pool[Math.floor(i * step)]);
  }

  const model = createVisionModel(visionConfig);

  // 3. Try Vision ranking with error isolation
  try {
    const ranked = await rankAndCrop(candidates, model);
    if (ranked) return ranked;
  } catch {
    // Vision ranking failed — fall through to fallback
  }

  // 4. Fallback: rank by frame quality (variance) and return the sharpest
  const withQuality = await Promise.all(
    candidates.map(async (f) => {
      try {
        const q = await assessFrameQuality(f.imagePath);
        return { frame: f, score: q.variance };
      } catch {
        return { frame: f, score: 0 };
      }
    }),
  );
  withQuality.sort((a, b) => b.score - a.score);
  const best = withQuality[0].frame;

  return {
    frameIndex: best.frameIndex,
    timestampSeconds: best.timestampSeconds,
    imagePath: best.imagePath,
    confidence: 0.5,
    description: 'product frame (auto-selected)',
    boundingHint: 'picked by frame quality analysis (Vision ranking failed)',
  };
}

/**
 * Two-pass detection: filter every frame individually, then rank the survivors.
 *
 * @param frames Frames to analyze.
 * @param minConfidence Not read by this implementation; kept so the signature
 *   stays compatible with existing callers.
 * @param concurrency Maximum number of simultaneous Pass 1 requests.
 * @param visionConfig Settings used to create the model handle.
 * @returns A one-element array holding the best frame, or an empty array when
 *   no frame passes Pass 1, ranking throws, or the bounding box is invalid.
 */
export async function detectProductFrames(
  frames: ExtractedFrame[],
  minConfidence: number,
  concurrency: number = 10,
  visionConfig: VisionConfig,
): Promise<ProductFrame[]> {
  const model = createVisionModel(visionConfig);

  // Pass 1: all frames in parallel, bounded by concurrency.
  // A frame whose filter request fails is treated as "discard".
  const keepFlags = await withConcurrency(
    frames.map((f) => () => filterFrame(f, model).catch(() => false)),
    concurrency,
  );

  const candidates = frames.filter((_, i) => keepFlags[i]);
  if (candidates.length === 0) return [];

  // Pass 2: single comparative call — model sees all candidates at once
  try {
    const bestSnapshot = await rankAndCrop(candidates, model);
    return bestSnapshot ? [bestSnapshot] : [];
  } catch {
    // ranking failed
    return [];
  }
}