video-product-finder/src/product-detector.ts

345 lines
11 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { generateObject } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { z } from 'zod';
import type { ExtractedFrame } from './frame-extractor.ts';
import type { ProductFrame } from './types.ts';
import { imageToBase64 } from './frame-extractor.ts';
import type { VisionConfig } from './index.ts';
// Pass 1: quick per-frame filter that discards frames which clearly show no product.
// The allowed `reason` values are the same ones enumerated at the end of FILTER_PROMPT.
const FILTER_REASONS = [
  'product_visible',
  'content_only',
  'hands_only',
  'blur',
  'transition',
  'background_only',
] as const;

/** Structured keep/discard verdict requested from the vision model for one frame. */
const FilterSchema = z.object({
  keep: z.boolean(),
  reason: z.enum(FILTER_REASONS),
});
// Pass 2: comparative ranking across all candidate frames in a single call.

/**
 * Box corners as [x1, y1, x2, y2], intended to be normalized 0-1 relative to the
 * image dimensions. The schema only checks for four numbers; range and ordering
 * are not enforced here.
 */
const NormalizedBoxSchema = z.tuple([z.number(), z.number(), z.number(), z.number()]);

/** Structured answer requested from the vision model when ranking candidates. */
const RankingSchema = z.object({
  bestFrameIndex: z.number().int(),
  description: z.string(),
  reasoning: z.string(),
  boundingBox: NormalizedBoxSchema,
});
/** Instruction text sent together with each frame in the Pass 1 keep/discard call. */
const FILTER_PROMPT = [
  'You are filtering frames from a TikTok/Douyin ecommerce product video.',
  'Keep a frame (keep=true) ONLY if the HERO PRODUCT (the main item being sold) is at least partially visible as a recognizable object.',
  'Discard (keep=false) if: only hands/texture/contents visible, motion blur, black/transition frame, or no product at all.',
  'reason options: product_visible | content_only | hands_only | blur | transition | background_only',
].join('\n');
/**
 * Builds the instruction text for the Pass 2 comparative ranking call.
 *
 * @param count - Number of frames attached to the request; the prompt tells the
 *   model that frames are indexed 0 to `count - 1`.
 * @returns The prompt describing the selection criteria and the fields to return.
 */
const RANKING_PROMPT = (count: number): string => `You are selecting the single best product frame from ${count} video frames for ecommerce search.
Frames are numbered 0 to ${count - 1} in order shown.
IMPORTANT: You MUST pick ONE frame — even if product visibility is imperfect or no frame looks ideal. Always make your best guess.
Pick the frame where the MAIN SELLING PRODUCT is:
1. Most recognizable — clearest view of the item being sold
2. Most complete — full product silhouette visible, not cropped at edges
3. Cleanest — minimal obstruction (hands, clutter, motion blur, labels)
4. Best lit and in focus
Return:
- bestFrameIndex: 0-based index
- description: concise search query under 12 words (product type + material + color + key features), in Chinese
- reasoning: one sentence explaining choice
- boundingBox: tight box of the PRODUCT ONLY as [x1, y1, x2, y2] normalized 0.0-1.0, top-left origin. Exclude hands, background, and unrelated objects. The product is near the center of the frame.`;
/**
 * Builds the vision model handle used by the frame analysis calls in this module.
 *
 * A provider is created with `createOpenAI` from the API key and base URL in
 * `config`, then asked for the model named by `config.model`.
 */
function createVisionModel(config: VisionConfig) {
  const { apiKey, baseURL, model } = config;
  return createOpenAI({ apiKey, baseURL })(model);
}
/**
 * Pass 1 check for a single frame: asks the vision model whether to keep it.
 *
 * @param frame - Frame whose `imagePath` is loaded via `imageToBase64` and sent
 *   as a JPEG data URL.
 * @param model - Vision model handle as produced by a `createOpenAI` provider.
 * @returns The model's `keep` verdict; the accompanying `reason` is not returned.
 */
async function filterFrame(
  frame: ExtractedFrame,
  model: ReturnType<ReturnType<typeof createOpenAI>>,
): Promise<boolean> {
  const dataUrl = `data:image/jpeg;base64,${imageToBase64(frame.imagePath)}`;
  const verdict = await generateObject({
    model,
    schema: FilterSchema,
    messages: [
      {
        role: 'user',
        content: [
          { type: 'image', image: dataUrl },
          { type: 'text', text: FILTER_PROMPT },
        ],
      },
    ],
  });
  return verdict.object.keep;
}
/**
 * Pass 2: sends every candidate frame to the vision model in one request and
 * asks it to pick the best product frame.
 *
 * @param candidates - Frames to compare; must contain at least one frame. Each
 *   frame's `imagePath` is loaded via `imageToBase64` and sent as a JPEG data URL.
 * @param model - Vision model handle as produced by a `createOpenAI` provider.
 * @returns The chosen frame plus the model's description, reasoning and bounding
 *   box. The bounding box is passed through as returned by the model and is not
 *   validated here.
 * @throws Error if `candidates` is empty; errors from `generateObject` propagate.
 */
async function rankCandidates(
  candidates: ExtractedFrame[],
  model: ReturnType<ReturnType<typeof createOpenAI>>,
): Promise<{ bestFrame: ExtractedFrame; description: string; reasoning: string; boundingBox: [number, number, number, number] }> {
  // Without this guard the index clamp below yields 0 and `bestFrame` would be
  // undefined despite the declared return type, after a pointless model call.
  if (candidates.length === 0) {
    throw new Error('rankCandidates requires at least one candidate frame');
  }

  const imageContent = candidates.map((f) => ({
    type: 'image' as const,
    image: `data:image/jpeg;base64,${imageToBase64(f.imagePath)}`,
  }));

  const { object } = await generateObject({
    model,
    schema: RankingSchema,
    mode: 'json',
    messages: [{
      role: 'user',
      content: [
        ...imageContent,
        { type: 'text', text: RANKING_PROMPT(candidates.length) },
      ],
    }],
  });

  // The model may answer with an out-of-range index; clamp it into the valid range.
  const idx = Math.max(0, Math.min(object.bestFrameIndex, candidates.length - 1));
  return {
    bestFrame: candidates[idx],
    description: object.description,
    reasoning: object.reasoning,
    boundingBox: object.boundingBox,
  };
}
/**
 * Crops an image to a normalized bounding box (plus padding) and writes the
 * result as a JPEG.
 *
 * @param imagePath - Path of the source image.
 * @param boundingBox - [x1, y1, x2, y2] in normalized 0-1 coordinates. Swapped
 *   corners are reordered and out-of-range values are clamped to [0, 1].
 * @param outputPath - Destination path for the cropped JPEG.
 * @param paddingFactor - Fraction of the box width/height added on each side.
 * @returns `outputPath`, once the file has been written.
 * @throws Error if a coordinate is not finite, if the image dimensions cannot be
 *   read, or if the padded box is narrower or shorter than 0.005 of the image.
 */
export async function cropProduct(
  imagePath: string,
  boundingBox: [number, number, number, number],
  outputPath: string,
  paddingFactor = 0.05,
): Promise<string> {
  // NaN slips through the clamping and size comparisons below, so reject it up front.
  if (!boundingBox.every((v) => Number.isFinite(v))) {
    throw new Error('bounding box contains non-finite coordinates');
  }

  const sharp = (await import('sharp')).default;
  const meta = await sharp(imagePath).metadata();
  const imgW = meta.width;
  const imgH = meta.height;
  if (!imgW || !imgH) {
    throw new Error(`cannot determine image dimensions for ${imagePath}`);
  }

  let [x1, y1, x2, y2] = boundingBox;
  // Normalize coords: ensure x1<x2 and y1<y2
  if (x1 > x2) [x1, x2] = [x2, x1];
  if (y1 > y2) [y1, y2] = [y2, y1];
  // Clamp to [0, 1]
  x1 = Math.max(0, Math.min(1, x1));
  y1 = Math.max(0, Math.min(1, y1));
  x2 = Math.max(0, Math.min(1, x2));
  y2 = Math.max(0, Math.min(1, y2));
  // Add padding
  const pw = (x2 - x1) * paddingFactor;
  const ph = (y2 - y1) * paddingFactor;
  x1 = Math.max(0, x1 - pw);
  y1 = Math.max(0, y1 - ph);
  x2 = Math.min(1, x2 + pw);
  y2 = Math.min(1, y2 + ph);
  // Validate minimum area
  if (x2 - x1 < 0.005 || y2 - y1 < 0.005) {
    throw new Error('bounding box too small after normalization');
  }

  // Round the edges, then derive the size from them. Rounding `left` and `width`
  // independently can push `left + width` past the image edge (or give a zero
  // width on small images), which makes sharp's extract() reject the region.
  const left = Math.min(Math.round(x1 * imgW), imgW - 1);
  const top = Math.min(Math.round(y1 * imgH), imgH - 1);
  const right = Math.min(imgW, Math.max(left + 1, Math.round(x2 * imgW)));
  const bottom = Math.min(imgH, Math.max(top + 1, Math.round(y2 * imgH)));

  await sharp(imagePath)
    .extract({ left, top, width: right - left, height: bottom - top })
    .jpeg({ quality: 95 })
    .toFile(outputPath);
  return outputPath;
}
/**
 * Runs async tasks with at most `limit` of them in flight at once.
 *
 * @param tasks - Task factories; each is invoked once, in index order.
 * @param limit - Maximum number of concurrently running tasks. Values below 1
 *   (or NaN) are treated as 1 so that every task still runs.
 * @returns Results in the same order as `tasks`.
 *   Rejects with the first task error; tasks that already started are not cancelled.
 */
async function withConcurrency<T>(
  tasks: (() => Promise<T>)[],
  limit: number,
): Promise<T[]> {
  const results: T[] = new Array<T>(tasks.length);
  // A non-positive limit used to spawn zero workers, returning an array of
  // holes without running anything. Always keep at least one worker.
  const workerCount = Math.min(tasks.length, Math.max(1, Math.floor(limit) || 1));
  let next = 0;

  // Each worker repeatedly claims the next unclaimed index until none are left.
  async function worker(): Promise<void> {
    while (next < tasks.length) {
      const i = next++;
      results[i] = await tasks[i]();
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
// ── Frame quality pre-filtering ──────────────────────────────────────
/** Brightness statistics for one frame, plus the usable/unusable verdict derived from them. */
interface FrameQuality {
  /** True when the frame passes the brightness and variance thresholds. */
  valid: boolean;

  /** Mean of the frame's grayscale pixel values. */
  meanBrightness: number;

  /** Variance of the frame's grayscale pixel values. */
  variance: number;
}
/**
 * Computes simple brightness statistics for a frame and decides whether it is
 * worth sending on for analysis.
 *
 * The image is decoded to raw grayscale bytes with sharp; mean and variance are
 * taken over every byte of that buffer.
 *
 * @param imagePath - Path of the frame image to inspect.
 * @returns The mean, the variance and a `valid` flag that is true only when
 *   15 < mean < 240 and variance > 50.
 */
async function assessFrameQuality(imagePath: string): Promise<FrameQuality> {
  const { default: sharp } = await import('sharp');
  const decoded = await sharp(imagePath)
    .grayscale()
    .raw()
    .toBuffer({ resolveWithObject: true });

  const samples = new Uint8Array(decoded.data);
  const sampleCount = samples.length;

  let total = 0;
  let totalOfSquares = 0;
  for (const value of samples) {
    total += value;
    totalOfSquares += value * value;
  }

  const meanBrightness = total / sampleCount;
  // Var(X) = E[X^2] - E[X]^2
  const variance = totalOfSquares / sampleCount - meanBrightness * meanBrightness;

  // Reject near-black, near-white, or very low variance frames (blank/transition).
  // Positive comparisons are used on purpose: a NaN statistic yields valid=false.
  const brightnessInRange = meanBrightness > 15 && meanBrightness < 240;
  const hasEnoughVariance = variance > 50;

  return { valid: brightnessInRange && hasEnoughVariance, meanBrightness, variance };
}
/**
 * Drops frames that fail the brightness/variance check in `assessFrameQuality`.
 *
 * All frames are assessed in parallel. A frame whose assessment throws is kept,
 * and if no frame passes the original list is returned unchanged, so a non-empty
 * input never produces an empty output.
 *
 * @param frames - Frames to screen.
 * @returns The frames judged valid, in their original order, or `frames` itself
 *   when none qualify.
 */
async function filterQualityFrames(frames: ExtractedFrame[]): Promise<ExtractedFrame[]> {
  const isUsable = async (frame: ExtractedFrame): Promise<boolean> => {
    try {
      const quality = await assessFrameQuality(frame.imagePath);
      return quality.valid;
    } catch {
      // An unreadable frame is given the benefit of the doubt.
      return true;
    }
  };

  const verdicts = await Promise.all(frames.map((frame) => isUsable(frame)));
  const usable = frames.filter((_, i) => verdicts[i]);

  if (usable.length === 0) {
    return frames;
  }
  return usable;
}
/**
 * Checks that a normalized bounding box is usable for cropping.
 *
 * @param bbox - [x1, y1, x2, y2] in normalized coordinates.
 * @returns True only when every coordinate lies in [0, 1], x1 < x2, y1 < y2,
 *   and the box covers more than 0.005 of the unit square.
 */
function isValidBoundingBox(bbox: [number, number, number, number]): boolean {
  const [x1, y1, x2, y2] = bbox;
  const withinUnit = (value: number): boolean => value >= 0 && value <= 1;

  if (![x1, y1, x2, y2].every((value) => withinUnit(value))) {
    return false;
  }
  if (!(x1 < x2)) {
    return false;
  }
  if (!(y1 < y2)) {
    return false;
  }

  const area = (x2 - x1) * (y2 - y1);
  return area > 0.005;
}
/**
 * Picks a single best product frame, always returning one for non-empty input.
 *
 * Skips the Pass 1 vision filter entirely. Frames are first screened with the
 * local brightness/variance check, evenly sampled down to `maxCandidates`, and
 * then ranked by the vision model in one call. If ranking throws, or the model's
 * bounding box is not valid, the candidate with the highest brightness variance
 * is returned instead.
 *
 * @param frames - Extracted video frames to choose from.
 * @param visionConfig - API key, base URL and model name for the vision model.
 * @param maxCandidates - Upper bound on frames sent to the model. Values below 1
 *   are treated as 1; non-finite values disable sampling.
 * @returns The selected frame, or `null` when `frames` is empty.
 *   `croppedImagePath` is present only when the crop file was actually written.
 */
export async function detectBestFrame(
  frames: ExtractedFrame[],
  visionConfig: VisionConfig,
  maxCandidates: number = 20,
): Promise<ProductFrame | null> {
  if (frames.length === 0) return null;

  // 1. Filter out obviously bad frames (near-black, near-white, low variance)
  let candidates = await filterQualityFrames(frames);

  // 2. Sample if too many. The cap is floored at 1: a cap of 0 or less would
  //    empty `candidates` and crash the fallback below.
  const cap = Number.isFinite(maxCandidates)
    ? Math.max(1, Math.floor(maxCandidates))
    : candidates.length;
  if (candidates.length > cap) {
    const pool = candidates;
    const step = pool.length / cap;
    candidates = Array.from({ length: cap }, (_, i) => pool[Math.floor(i * step)]);
  }

  const model = createVisionModel(visionConfig);

  // 3. Try Vision ranking with error isolation
  try {
    const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
    if (isValidBoundingBox(boundingBox)) {
      // Strip whatever extension the frame has so the crop never targets the
      // source file itself (the old `.jpg`-only rewrite left other paths unchanged).
      const croppedPath = `${bestFrame.imagePath.replace(/\.[^./\\]+$/, '')}_cropped.jpg`;
      let croppedImagePath: string | undefined;
      try {
        croppedImagePath = await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
      } catch {
        // Cropping is optional — keep the original frame and do not advertise
        // a cropped file that was never written.
      }
      return {
        frameIndex: bestFrame.frameIndex,
        timestampSeconds: bestFrame.timestampSeconds,
        imagePath: bestFrame.imagePath,
        ...(croppedImagePath ? { croppedImagePath } : {}),
        confidence: 0.95,
        description,
        boundingHint: reasoning,
      };
    }
  } catch {
    // Vision ranking failed — fall through to fallback
  }

  // 4. Fallback: rank by frame quality (variance) and return the highest-variance frame
  const withQuality = await Promise.all(
    candidates.map(async (f) => {
      try {
        const q = await assessFrameQuality(f.imagePath);
        return { frame: f, score: q.variance };
      } catch {
        return { frame: f, score: 0 };
      }
    }),
  );
  withQuality.sort((a, b) => b.score - a.score);
  const best = withQuality[0].frame;
  return {
    frameIndex: best.frameIndex,
    timestampSeconds: best.timestampSeconds,
    imagePath: best.imagePath,
    confidence: 0.5,
    description: 'product frame (auto-selected)',
    boundingHint: 'picked by frame quality analysis (Vision ranking failed)',
  };
}
/**
 * Two-pass product frame detection.
 *
 * Pass 1 asks the vision model, per frame, whether the product is visible.
 * Pass 2 sends all kept frames in one call and lets the model pick the best.
 *
 * @param frames - Extracted video frames to analyse.
 * @param minConfidence - Currently unused: the returned frame always carries a
 *   fixed confidence of 0.95. NOTE(review): confirm whether results should be
 *   filtered against this threshold.
 * @param concurrency - Maximum number of Pass 1 requests in flight at once.
 * @param visionConfig - API key, base URL and model name for the vision model.
 * @returns An array holding the single best frame, or an empty array when no
 *   frame survives Pass 1, ranking throws, or the model's bounding box is invalid.
 *   `croppedImagePath` is present only when the crop file was actually written.
 */
export async function detectProductFrames(
  frames: ExtractedFrame[],
  minConfidence: number,
  concurrency: number = 10,
  visionConfig: VisionConfig,
): Promise<ProductFrame[]> {
  const model = createVisionModel(visionConfig);

  // Pass 1: all frames in parallel, bounded by concurrency.
  // A frame whose filter call fails is treated as "discard".
  const keepFlags = await withConcurrency(
    frames.map((f) => () => filterFrame(f, model).catch(() => false)),
    concurrency,
  );
  const candidates = frames.filter((_, i) => keepFlags[i]);
  if (candidates.length === 0) return [];

  // Pass 2: single comparative call — model sees all candidates at once
  try {
    const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
    if (!isValidBoundingBox(boundingBox)) return [];

    // Strip whatever extension the frame has so the crop never targets the
    // source file itself (the old `.jpg`-only rewrite left other paths unchanged).
    const croppedPath = `${bestFrame.imagePath.replace(/\.[^./\\]+$/, '')}_cropped.jpg`;
    let croppedImagePath: string | undefined;
    try {
      croppedImagePath = await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
    } catch {
      // Cropping is optional — return the frame without a cropped path so
      // callers are not pointed at a file that was never written.
    }

    return [
      {
        frameIndex: bestFrame.frameIndex,
        timestampSeconds: bestFrame.timestampSeconds,
        imagePath: bestFrame.imagePath,
        ...(croppedImagePath ? { croppedImagePath } : {}),
        confidence: 0.95,
        description,
        boundingHint: reasoning,
      },
    ];
  } catch {
    // ranking failed
    return [];
  }
}