180 lines
6.0 KiB
TypeScript
180 lines
6.0 KiB
TypeScript
import { generateObject } from 'ai';
|
||
import { createOpenAI } from '@ai-sdk/openai';
|
||
import { z } from 'zod';
|
||
import type { ExtractedFrame } from './frame-extractor.ts';
|
||
import type { ProductFrame } from './types.ts';
|
||
import { imageToBase64 } from './frame-extractor.ts';
|
||
|
||
// Pass 1: quick filter — discard frames that clearly have no product

/** Every classification label the filter pass may attach to a frame. */
const FILTER_REASONS = [
  'product_visible',
  'content_only',
  'hands_only',
  'blur',
  'transition',
  'background_only',
] as const;

/**
 * Structured output for the per-frame filter call: a keep/discard flag plus
 * the label explaining the decision.
 */
const FilterSchema = z.object({
  keep: z.boolean(),
  reason: z.enum(FILTER_REASONS),
});
|
||
|
||
// Pass 2: comparative ranking across all candidates

/** One bounding-box coordinate; the schema only requires it to be a number. */
const boxCoordinate = z.number();

/**
 * Structured output for the ranking call: which candidate won, a short search
 * description, the model's justification, and the product's bounding box.
 */
const RankingSchema = z.object({
  bestFrameIndex: z.number().int(),
  description: z.string(),
  reasoning: z.string(),
  // normalized 0-1 relative to image dimensions: [x1, y1, x2, y2]
  boundingBox: z.tuple([boxCoordinate, boxCoordinate, boxCoordinate, boxCoordinate]),
});
|
||
|
||
/**
 * Instruction text sent alongside each frame in the pass-1 filter call.
 * The listed reason options match the six labels accepted by the filter
 * schema's `reason` enum.
 */
const FILTER_PROMPT = `You are filtering frames from a TikTok/Douyin ecommerce product video.

Keep a frame (keep=true) ONLY if the HERO PRODUCT (the main item being sold) is at least partially visible as a recognizable object.
Discard (keep=false) if: only hands/texture/contents visible, motion blur, black/transition frame, or no product at all.

reason options: product_visible | content_only | hands_only | blur | transition | background_only`;
|
||
|
||
/**
 * Builds the instruction text for the pass-2 ranking call.
 *
 * @param count Number of candidate frames attached to the request; it is
 *   interpolated into the prompt so the model knows the valid 0-based index
 *   range (0 to count - 1).
 * @returns The complete prompt string.
 */
const RANKING_PROMPT = (count: number) => `You are selecting the single best product image from ${count} video frames for ecommerce image search.

The frames are numbered 0 to ${count - 1} in the order shown.

Pick the ONE frame where the HERO PRODUCT is:
1. Cleanest — fewest distractions, no hands blocking it, no clutter in foreground
2. Most complete — full product silhouette visible, no edges cropped
3. Most isolated — product stands out from background clearly
4. Empty/minimal load preferred — a product without contents (e.g. an empty rack) beats one stuffed with items if both show the full structure equally

Return:
- bestFrameIndex: 0-based index of chosen frame
- description: concise search query under 12 words (product type + material + color + key feature)
- reasoning: one sentence explaining why this frame was chosen
- boundingBox: tight bounding box of the HERO PRODUCT ONLY in the chosen frame as [x1, y1, x2, y2] normalized 0.0–1.0 (top-left origin). Exclude hands, background, and unrelated objects. The product is assumed to be near the center.`;
|
||
|
||
/**
 * Creates the vision model handle used by both detection passes.
 *
 * Reads `VISION_API_KEY` (required), `VISION_API_BASE` (passed through as the
 * provider's `baseURL`, may be undefined) and `VISION_MODEL` (falls back to
 * `'gpt-4o-mini'` when null/undefined) from the environment.
 *
 * @throws Error when `VISION_API_KEY` is missing or empty.
 */
function createVisionModel() {
  const { VISION_API_KEY, VISION_API_BASE, VISION_MODEL } = process.env;

  if (!VISION_API_KEY) {
    throw new Error('VISION_API_KEY not set');
  }

  const openai = createOpenAI({
    apiKey: VISION_API_KEY,
    baseURL: VISION_API_BASE,
  });

  const modelId = VISION_MODEL ?? 'gpt-4o-mini';
  return openai(modelId);
}
|
||
|
||
/**
 * Pass-1 check for a single frame: sends the frame image together with
 * FILTER_PROMPT to the model and asks for a FilterSchema-shaped answer.
 *
 * @param frame Frame whose `imagePath` is encoded and attached to the request.
 * @param model Model handle to run the request against.
 * @returns The `keep` flag from the model's structured answer.
 */
async function filterFrame(
  frame: ExtractedFrame,
  model: ReturnType<ReturnType<typeof createOpenAI>>,
): Promise<boolean> {
  // The image is embedded as a JPEG data URL.
  const dataUrl = `data:image/jpeg;base64,${imageToBase64(frame.imagePath)}`;

  const imagePart = { type: 'image' as const, image: dataUrl };
  const textPart = { type: 'text' as const, text: FILTER_PROMPT };

  const verdict = await generateObject({
    model,
    schema: FilterSchema,
    messages: [{ role: 'user', content: [imagePart, textPart] }],
  });

  return verdict.object.keep;
}
|
||
|
||
/**
 * Pass-2 comparative ranking: attaches every candidate image to one request
 * (JSON mode, RankingSchema output) and lets the model pick the best frame.
 *
 * @param candidates Frames that survived the filter pass; must not be empty.
 * @param model Model handle to run the request against.
 * @returns The chosen frame together with the model's description, reasoning
 *   and bounding box for it.
 * @throws Error when `candidates` is empty — there is nothing to rank, and the
 *   prompt would otherwise describe an invalid index range ("0 to -1").
 */
async function rankCandidates(
  candidates: ExtractedFrame[],
  model: ReturnType<ReturnType<typeof createOpenAI>>,
): Promise<{ bestFrame: ExtractedFrame; description: string; reasoning: string; boundingBox: [number, number, number, number] }> {
  if (candidates.length === 0) {
    throw new Error('rankCandidates requires at least one candidate frame');
  }

  // Images are attached in array order so the model's 0-based index maps
  // directly onto `candidates`.
  const imageContent = candidates.map((f) => ({
    type: 'image' as const,
    image: `data:image/jpeg;base64,${imageToBase64(f.imagePath)}`,
  }));

  const { object } = await generateObject({
    model,
    schema: RankingSchema,
    mode: 'json',
    messages: [{
      role: 'user',
      content: [
        ...imageContent,
        { type: 'text', text: RANKING_PROMPT(candidates.length) },
      ],
    }],
  });

  // The schema only guarantees an integer, so clamp it into the valid range
  // instead of trusting the model.
  const lastIndex = candidates.length - 1;
  const idx = Math.max(0, Math.min(object.bestFrameIndex, lastIndex));
  const bestFrame = candidates[idx];
  if (bestFrame === undefined) {
    // Unreachable after the guards above; keeps the return type honest.
    throw new Error(`No candidate frame at index ${idx}`);
  }

  return {
    bestFrame,
    description: object.description,
    reasoning: object.reasoning,
    boundingBox: object.boundingBox,
  };
}
|
||
|
||
/**
 * Converts a normalized bounding box into a pixel rectangle that is always a
 * valid extract region for an image of the given size.
 *
 * - Corners given in the wrong order are swapped rather than rejected.
 * - Padding is a fraction of the box's own width/height, added on every side.
 * - Coordinates are clamped to [0, 1] after padding.
 * - Edges (not sizes) are rounded, so `left + width <= imageWidth` and
 *   `top + height <= imageHeight` always hold.
 * - The result is at least 1x1 pixel.
 *
 * @throws RangeError when a coordinate or the padding factor is not finite,
 *   or when the image dimensions are not positive.
 */
function computeCropRegion(
  boundingBox: readonly [number, number, number, number],
  imageWidth: number,
  imageHeight: number,
  paddingFactor: number,
): { left: number; top: number; width: number; height: number } {
  if (![...boundingBox, paddingFactor].every((v) => Number.isFinite(v))) {
    throw new RangeError(`Invalid bounding box or padding: [${boundingBox.join(', ')}], ${paddingFactor}`);
  }
  if (!(imageWidth >= 1) || !(imageHeight >= 1)) {
    throw new RangeError(`Invalid image dimensions: ${imageWidth}x${imageHeight}`);
  }

  const clamp01 = (v: number): number => Math.min(1, Math.max(0, v));

  const [bx1, by1, bx2, by2] = boundingBox;
  const minX = Math.min(bx1, bx2);
  const maxX = Math.max(bx1, bx2);
  const minY = Math.min(by1, by2);
  const maxY = Math.max(by1, by2);

  const padX = (maxX - minX) * paddingFactor;
  const padY = (maxY - minY) * paddingFactor;

  const x1 = clamp01(minX - padX);
  const y1 = clamp01(minY - padY);
  const x2 = clamp01(maxX + padX);
  const y2 = clamp01(maxY + padY);

  // Keep the origin inside the image so a 1px region always fits.
  const left = Math.min(Math.round(x1 * imageWidth), imageWidth - 1);
  const top = Math.min(Math.round(y1 * imageHeight), imageHeight - 1);
  // Round the far edges, cap them at the image border, and force >= 1px.
  const right = Math.max(left + 1, Math.min(imageWidth, Math.round(x2 * imageWidth)));
  const bottom = Math.max(top + 1, Math.min(imageHeight, Math.round(y2 * imageHeight)));

  return { left, top, width: right - left, height: bottom - top };
}

/**
 * Crops the product region out of an image and writes it as a JPEG
 * (quality 95).
 *
 * @param imagePath Path of the source image.
 * @param boundingBox Product box as [x1, y1, x2, y2], normalized to 0-1 with
 *   a top-left origin.
 * @param outputPath Where the cropped JPEG is written.
 * @param paddingFactor Extra margin added on each side, as a fraction of the
 *   box's width/height. Defaults to 0.05.
 * @returns `outputPath`, once the file has been written.
 * @throws Error when the image dimensions cannot be read from its metadata.
 * @throws RangeError when the bounding box or padding is not finite.
 */
export async function cropProduct(
  imagePath: string,
  boundingBox: [number, number, number, number],
  outputPath: string,
  paddingFactor = 0.05,
): Promise<string> {
  // Loaded lazily so the native dependency is only required when cropping.
  const sharp = (await import('sharp')).default;

  const { width: imageWidth, height: imageHeight } = await sharp(imagePath).metadata();
  if (!imageWidth || !imageHeight) {
    throw new Error(`Cannot determine image dimensions for ${imagePath}`);
  }

  const region = computeCropRegion(boundingBox, imageWidth, imageHeight, paddingFactor);

  await sharp(imagePath)
    .extract(region)
    .jpeg({ quality: 95 })
    .toFile(outputPath);

  return outputPath;
}
|
||
|
||
/**
 * Turns a requested concurrency into a usable batch size: a whole number of
 * at least 1. Zero, negative and NaN values fall back to 1 so the batching
 * loop always advances.
 */
function toBatchSize(concurrency: number): number {
  // NaN fails the comparison as well, so it also yields 1.
  return concurrency >= 1 ? Math.floor(concurrency) : 1;
}

/**
 * Derives the output path for a cropped frame: the final extension (if any)
 * is replaced by `_cropped.jpg`. This never returns the input path, so the
 * crop cannot be written over its own source file.
 */
function croppedPathFor(imagePath: string): string {
  return imagePath.replace(/(\.[^./\\]*)?$/, '_cropped.jpg');
}

/**
 * Two-pass product detection over extracted video frames.
 *
 * Pass 1 runs the per-frame filter in batches of `concurrency` parallel
 * requests; a frame whose filter call fails is logged and treated as
 * discarded. Pass 2 sends all surviving frames to one ranking call, then the
 * winning frame is cropped to the returned bounding box.
 *
 * @param frames Frames to inspect.
 * @param minConfidence NOTE(review): currently not consulted — the single
 *   result always reports a fixed confidence of 0.95. Confirm the intended
 *   semantics before relying on it.
 * @param concurrency Maximum number of parallel filter requests (default 5);
 *   values below 1 are treated as 1.
 * @returns An empty array when no frame passes the filter, otherwise a
 *   one-element array describing the best frame and its cropped image.
 */
export async function detectProductFrames(
  frames: ExtractedFrame[],
  minConfidence: number,
  concurrency: number = 5,
): Promise<ProductFrame[]> {
  const model = createVisionModel();
  const batchSize = toBatchSize(concurrency);

  // Pass 1: parallel filter — discard junk frames
  const keepFlags: boolean[] = [];
  for (let start = 0; start < frames.length; start += batchSize) {
    const batch = frames.slice(start, start + batchSize);
    const flags = await Promise.all(
      batch.map((frame) =>
        filterFrame(frame, model).catch((err: unknown) => {
          // Best effort: one failed request must not abort the whole run, but
          // it should be visible rather than look like "no product".
          const detail = err instanceof Error ? err.message : String(err);
          console.warn(`Frame filter failed for ${frame.imagePath}: ${detail}`);
          return false;
        }),
      ),
    );
    keepFlags.push(...flags);
  }

  const candidates = frames.filter((_, i) => keepFlags[i] === true);
  if (candidates.length === 0) return [];

  // Pass 2: single comparative call — model sees all candidates at once
  const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);

  const croppedPath = croppedPathFor(bestFrame.imagePath);
  await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);

  return [{
    frameIndex: bestFrame.frameIndex,
    timestampSeconds: bestFrame.timestampSeconds,
    imagePath: bestFrame.imagePath,
    croppedImagePath: croppedPath,
    confidence: 0.95,
    description,
    boundingHint: reasoning,
  }];
}
|