video-product-finder/src/product-detector.ts


import { generateObject } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { z } from 'zod';
import type { ExtractedFrame } from './frame-extractor.ts';
import type { ProductFrame } from './types.ts';
import { imageToBase64 } from './frame-extractor.ts';
import type { VisionConfig } from './index.ts';
// Pass 1: quick filter — discard frames that clearly have no product
const FilterSchema = z.object({
  keep: z.boolean(),
  reason: z.enum(['product_visible', 'content_only', 'hands_only', 'blur', 'transition', 'background_only']),
});
// Pass 2: comparative ranking across all candidates
const RankingSchema = z.object({
  bestFrameIndex: z.number().int(),
  description: z.string(),
  reasoning: z.string(),
  // normalized 0-1 relative to image dimensions: [x1, y1, x2, y2]
  boundingBox: z.tuple([z.number(), z.number(), z.number(), z.number()]),
});
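// Illustrative shape of a parsed ranking response (all values below are
// hypothetical, not taken from a real run):
// {
//   bestFrameIndex: 2,
//   description: 'black metal garment rack with wheels',
//   reasoning: 'Frame 2 shows the full rack unobstructed against a plain wall.',
//   boundingBox: [0.18, 0.05, 0.82, 0.97],
// }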
const FILTER_PROMPT = `You are filtering frames from a TikTok/Douyin ecommerce product video.
Keep a frame (keep=true) ONLY if the HERO PRODUCT (the main item being sold) is at least partially visible as a recognizable object.
Discard (keep=false) if: only hands/texture/contents visible, motion blur, black/transition frame, or no product at all.
reason options: product_visible | content_only | hands_only | blur | transition | background_only`;
const RANKING_PROMPT = (count: number) => `You are selecting the single best product image from ${count} video frames for ecommerce image search.
The frames are numbered 0 to ${count - 1} in the order shown.
Pick the ONE frame where the HERO PRODUCT is:
1. Cleanest: fewest distractions, no hands blocking it, no clutter in the foreground
2. Most complete: full product silhouette visible, no edges cropped
3. Most isolated: product stands out from the background clearly
4. Empty/minimal load preferred: a product without contents (e.g. an empty rack) beats one stuffed with items if both show the full structure equally
Return:
- bestFrameIndex: 0-based index of the chosen frame
- description: concise search query under 12 words (product type + material + color + key feature)
- reasoning: one sentence explaining why this frame was chosen
- boundingBox: tight bounding box of the HERO PRODUCT ONLY in the chosen frame as [x1, y1, x2, y2] normalized 0.0-1.0 (top-left origin). Exclude hands, background, and unrelated objects. The product is assumed to be near the center.`;
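/**
 * Builds the vision model from the runtime config. A minimal config sketch,
 * assuming an OpenAI-compatible endpoint (the values below are placeholders):
 *
 *   createVisionModel({
 *     apiKey: process.env.OPENAI_API_KEY!,
 *     baseURL: 'https://api.openai.com/v1',
 *     model: 'gpt-4o-mini',
 *   });
 */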
function createVisionModel(config: VisionConfig) {
  const provider = createOpenAI({ apiKey: config.apiKey, baseURL: config.baseURL });
  return provider(config.model);
}
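/**
 * Pass 1: sends a single frame to the vision model and returns whether the
 * hero product is recognizably visible. Errors are handled by the caller,
 * which treats a failed call as keep=false.
 */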
async function filterFrame(
  frame: ExtractedFrame,
  model: ReturnType<ReturnType<typeof createOpenAI>>,
): Promise<boolean> {
  const base64Image = imageToBase64(frame.imagePath);
  const { object } = await generateObject({
    model,
    schema: FilterSchema,
    messages: [{
      role: 'user',
      content: [
        { type: 'image', image: `data:image/jpeg;base64,${base64Image}` },
        { type: 'text', text: FILTER_PROMPT },
      ],
    }],
  });
  return object.keep;
}
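/**
 * Pass 2: shows ALL surviving frames to the model in a single call so it can
 * compare them directly, then returns the winning frame plus a search
 * description and a normalized bounding box. The returned index is clamped
 * to the candidate range in case the model answers out of bounds.
 */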
async function rankCandidates(
  candidates: ExtractedFrame[],
  model: ReturnType<ReturnType<typeof createOpenAI>>,
): Promise<{ bestFrame: ExtractedFrame; description: string; reasoning: string; boundingBox: [number, number, number, number] }> {
  const imageContent = candidates.map((f) => ({
    type: 'image' as const,
    image: `data:image/jpeg;base64,${imageToBase64(f.imagePath)}`,
  }));
  const { object } = await generateObject({
    model,
    schema: RankingSchema,
    mode: 'json',
    messages: [{
      role: 'user',
      content: [
        ...imageContent,
        { type: 'text', text: RANKING_PROMPT(candidates.length) },
      ],
    }],
  });
  const idx = Math.max(0, Math.min(object.bestFrameIndex, candidates.length - 1));
  return {
    bestFrame: candidates[idx],
    description: object.description,
    reasoning: object.reasoning,
    boundingBox: object.boundingBox,
  };
}
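/**
 * Crops the hero product out of a frame using the model's normalized
 * [x1, y1, x2, y2] bounding box, with a small padding margin. Usage sketch
 * (the paths are hypothetical):
 *
 *   await cropProduct(
 *     'frames/frame_0007.jpg',
 *     [0.2, 0.1, 0.8, 0.9],
 *     'frames/frame_0007_cropped.jpg',
 *   );
 */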
export async function cropProduct(
  imagePath: string,
  boundingBox: [number, number, number, number],
  outputPath: string,
  paddingFactor = 0.05,
): Promise<string> {
  const sharp = (await import('sharp')).default;
  const meta = await sharp(imagePath).metadata();
  const W = meta.width!;
  const H = meta.height!;
  let [x1, y1, x2, y2] = boundingBox;
  // add padding around the box, clamped to the normalized image bounds
  const pw = (x2 - x1) * paddingFactor;
  const ph = (y2 - y1) * paddingFactor;
  x1 = Math.max(0, x1 - pw);
  y1 = Math.max(0, y1 - ph);
  x2 = Math.min(1, x2 + pw);
  y2 = Math.min(1, y2 + ph);
  const left = Math.round(x1 * W);
  const top = Math.round(y1 * H);
  // clamp so rounding never pushes the extract region outside the image,
  // which would make sharp throw a "bad extract area" error
  const width = Math.max(1, Math.min(W - left, Math.round((x2 - x1) * W)));
  const height = Math.max(1, Math.min(H - top, Math.round((y2 - y1) * H)));
  await sharp(imagePath)
    .extract({ left, top, width, height })
    .jpeg({ quality: 95 })
    .toFile(outputPath);
  return outputPath;
}
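/**
 * Orchestrates both passes: filters frames in parallel batches of
 * `concurrency`, then runs one comparative ranking call over the survivors
 * and crops the winner. Returns at most one ProductFrame. Note that
 * `minConfidence` is currently unused: the ranking pass picks a single
 * winner rather than scoring frames, so confidence on the result is fixed.
 */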
export async function detectProductFrames(
  frames: ExtractedFrame[],
  minConfidence: number,
  concurrency: number = 5,
  visionConfig: VisionConfig,
): Promise<ProductFrame[]> {
  const model = createVisionModel(visionConfig);
  // Pass 1: parallel filter — discard junk frames
  const keepFlags: boolean[] = [];
  for (let i = 0; i < frames.length; i += concurrency) {
    const chunk = frames.slice(i, i + concurrency);
    const flags = await Promise.all(
      chunk.map((f) => filterFrame(f, model).catch(() => false)),
    );
    keepFlags.push(...flags);
  }
  const candidates = frames.filter((_, i) => keepFlags[i]);
  if (candidates.length === 0) return [];
  // Pass 2: single comparative call — model sees all candidates at once
  const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
  const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg');
  await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
  return [{
    frameIndex: bestFrame.frameIndex,
    timestampSeconds: bestFrame.timestampSeconds,
    imagePath: bestFrame.imagePath,
    croppedImagePath: croppedPath,
    confidence: 0.95,
    description,
    boundingHint: reasoning,
  }];
}
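
// End-to-end usage sketch. This assumes a frame-extraction helper in
// './frame-extractor.ts' that yields ExtractedFrame[]; the function name
// `extractFrames` and all config values here are placeholders, not part of
// this module:
//
//   const frames = await extractFrames('video.mp4', '/tmp/frames');
//   const [product] = await detectProductFrames(frames, 0.5, 5, {
//     apiKey: process.env.OPENAI_API_KEY!,
//     baseURL: 'https://api.openai.com/v1',
//     model: 'gpt-4o-mini',
//   });
//   if (product) console.log(product.description, product.croppedImagePath);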