2026-04-19 23:24:28 +00:00
|
|
|
|
import { generateObject } from 'ai';
|
|
|
|
|
|
import { createOpenAI } from '@ai-sdk/openai';
|
|
|
|
|
|
import { z } from 'zod';
|
|
|
|
|
|
import type { ExtractedFrame } from './frame-extractor.ts';
|
|
|
|
|
|
import type { ProductFrame } from './types.ts';
|
|
|
|
|
|
import { imageToBase64 } from './frame-extractor.ts';
|
2026-04-20 04:14:43 +00:00
|
|
|
|
import type { VisionConfig } from './index.ts';
|
2026-04-19 23:24:28 +00:00
|
|
|
|
|
|
|
|
|
|
// Pass 1: quick filter — discard frames that clearly have no product.
// The model answers with a keep/discard flag plus the category that justified it.
const FILTER_REASONS = [
  'product_visible',
  'content_only',
  'hands_only',
  'blur',
  'transition',
  'background_only',
] as const;

const FilterSchema = z.object({
  // true when the frame should go on to the ranking pass
  keep: z.boolean(),
  // one of the categories listed in FILTER_REASONS
  reason: z.enum(FILTER_REASONS),
});
|
|
|
|
|
|
|
|
|
|
|
|
// Pass 2: comparative ranking across all candidates.
// Each tuple slot gets its own schema instance, exactly as in a hand-written tuple.
const boxCoordinate = () => z.number();

const RankingSchema = z.object({
  // position of the chosen frame within the list of images sent to the model
  bestFrameIndex: z.number().int(),
  // short search query describing the product
  description: z.string(),
  // the model's justification for its pick
  reasoning: z.string(),
  // normalized 0-1 relative to image dimensions: [x1, y1, x2, y2]
  // (the range is requested in the prompt; the schema itself accepts any number)
  boundingBox: z.tuple([boxCoordinate(), boxCoordinate(), boxCoordinate(), boxCoordinate()]),
});
|
|
|
|
|
|
|
|
|
|
|
|
// Instruction text sent with every single-frame keep/discard request.
// The reason list at the end mirrors the enum accepted by FilterSchema.
const FILTER_PROMPT = [
  'You are filtering frames from a TikTok/Douyin ecommerce product video.',
  '',
  'Keep a frame (keep=true) ONLY if the HERO PRODUCT (the main item being sold) is at least partially visible as a recognizable object.',
  'Discard (keep=false) if: only hands/texture/contents visible, motion blur, black/transition frame, or no product at all.',
  '',
  'reason options: product_visible | content_only | hands_only | blur | transition | background_only',
].join('\n');
|
|
|
|
|
|
|
2026-04-25 08:30:01 +00:00
|
|
|
|
// Builds the instruction text for the comparative ranking request.
// `count` is the number of images attached to the message; it is interpolated
// into the frame total and the highest valid 0-based index.
const RANKING_PROMPT = (count: number) => {
  const lastIndex = count - 1;
  const lines = [
    `You are selecting the single best product frame from ${count} video frames for ecommerce search.`,
    '',
    `Frames are numbered 0 to ${lastIndex} in order shown.`,
    '',
    'IMPORTANT: You MUST pick ONE frame — even if product visibility is imperfect or no frame looks ideal. Always make your best guess.',
    '',
    'Pick the frame where the MAIN SELLING PRODUCT is:',
    '1. Most recognizable — clearest view of the item being sold',
    '2. Most complete — full product silhouette visible, not cropped at edges',
    '3. Cleanest — minimal obstruction (hands, clutter, motion blur, labels)',
    '4. Best lit and in focus',
    '',
    'Return:',
    '- bestFrameIndex: 0-based index',
    '- description: concise search query under 12 words (product type + material + color + key features), in Chinese',
    '- reasoning: one sentence explaining choice',
    '- boundingBox: tight box of the PRODUCT ONLY as [x1, y1, x2, y2] normalized 0.0–1.0, top-left origin. Exclude hands, background, and unrelated objects. The product is near the center of the frame.',
  ];
  return lines.join('\n');
};
|
2026-04-19 23:24:28 +00:00
|
|
|
|
|
2026-04-20 04:14:43 +00:00
|
|
|
|
/**
 * Builds the model handle used for every vision request in this module.
 *
 * A provider is created with `createOpenAI` from the config's `apiKey` and
 * `baseURL`, and the model named by `config.model` is requested from it.
 *
 * @param config - Credentials, endpoint and model id for the vision backend.
 * @returns The model object to pass as `model` to `generateObject`.
 */
function createVisionModel(config: VisionConfig) {
  const { apiKey, baseURL, model } = config;
  return createOpenAI({ apiKey, baseURL })(model);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Pass 1 for a single frame: asks the vision model whether to keep it.
 *
 * The frame image is embedded as a base64 JPEG data URL, sent together with
 * FILTER_PROMPT, and the reply is parsed against FilterSchema.
 *
 * @param frame - Frame whose `imagePath` is read and sent to the model.
 * @param model - Model handle, as produced by `createVisionModel`.
 * @returns The `keep` flag from the model's structured answer.
 */
async function filterFrame(
  frame: ExtractedFrame,
  model: ReturnType<ReturnType<typeof createOpenAI>>,
): Promise<boolean> {
  const imageDataUrl = `data:image/jpeg;base64,${imageToBase64(frame.imagePath)}`;

  const verdict = await generateObject({
    model,
    schema: FilterSchema,
    messages: [
      {
        role: 'user',
        content: [
          { type: 'image', image: imageDataUrl },
          { type: 'text', text: FILTER_PROMPT },
        ],
      },
    ],
  });

  return verdict.object.keep;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Pass 2: shows every candidate frame to the vision model in one request and
 * asks it to pick the best product frame.
 *
 * All candidates are embedded as base64 JPEG data URLs, in array order,
 * followed by RANKING_PROMPT. The reply is parsed against RankingSchema.
 *
 * @param candidates - Frames to compare; must contain at least one frame.
 * @param model - Model handle, as produced by `createVisionModel`.
 * @returns The chosen frame plus the model's description, reasoning and
 *   bounding box. The bounding box is returned exactly as the model gave it;
 *   it is not validated here.
 * @throws Error when `candidates` is empty, and whatever `generateObject`
 *   or `imageToBase64` throw.
 */
async function rankCandidates(
  candidates: ExtractedFrame[],
  model: ReturnType<ReturnType<typeof createOpenAI>>,
): Promise<{ bestFrame: ExtractedFrame; description: string; reasoning: string; boundingBox: [number, number, number, number] }> {
  // Without this guard an empty list would still cost a model call and then
  // return `candidates[0]` (undefined) typed as a real frame.
  if (candidates.length === 0) {
    throw new Error('rankCandidates requires at least one candidate frame');
  }

  const imageContent = candidates.map((f) => ({
    type: 'image' as const,
    image: `data:image/jpeg;base64,${imageToBase64(f.imagePath)}`,
  }));

  const { object } = await generateObject({
    model,
    schema: RankingSchema,
    mode: 'json',
    messages: [{
      role: 'user',
      content: [
        ...imageContent,
        { type: 'text', text: RANKING_PROMPT(candidates.length) },
      ],
    }],
  });

  // The model may answer with an index outside the list; clamp it into range
  // rather than failing the whole ranking.
  const idx = Math.max(0, Math.min(object.bestFrameIndex, candidates.length - 1));
  return {
    bestFrame: candidates[idx],
    description: object.description,
    reasoning: object.reasoning,
    boundingBox: object.boundingBox,
  };
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Crops the product region out of an image and writes it as a JPEG (quality 95).
 *
 * The bounding box is given in normalized coordinates `[x1, y1, x2, y2]`
 * relative to the image size. Swapped corners are reordered, coordinates are
 * clamped to [0, 1], and the box is grown on every side by `paddingFactor`
 * times its own width/height before being converted to pixels.
 *
 * @param imagePath - Source image to read.
 * @param boundingBox - Normalized `[x1, y1, x2, y2]` box of the product.
 * @param outputPath - Destination file for the cropped JPEG.
 * @param paddingFactor - Fraction of the box size added as margin on each side.
 * @returns `outputPath`, once the file has been written.
 * @throws Error when the image dimensions cannot be read, when a coordinate or
 *   the padding is not a finite number, or when the padded box is narrower or
 *   shorter than 0.5% of the image. Errors from sharp propagate unchanged.
 */
export async function cropProduct(
  imagePath: string,
  boundingBox: [number, number, number, number],
  outputPath: string,
  paddingFactor = 0.05,
): Promise<string> {
  const sharp = (await import('sharp')).default;
  const { width: imageWidth, height: imageHeight } = await sharp(imagePath).metadata();
  if (!imageWidth || !imageHeight) {
    throw new Error(`cannot determine image dimensions for ${imagePath}`);
  }

  const region = computeCropRegion(boundingBox, imageWidth, imageHeight, paddingFactor);

  await sharp(imagePath)
    .extract(region)
    .jpeg({ quality: 95 })
    .toFile(outputPath);

  return outputPath;
}

/** Converts a normalized bounding box into an in-bounds pixel rectangle for `extract`. */
function computeCropRegion(
  boundingBox: [number, number, number, number],
  imageWidth: number,
  imageHeight: number,
  paddingFactor: number,
): { left: number; top: number; width: number; height: number } {
  // NaN would slip through the min-size comparison below (NaN < x is false).
  if (!boundingBox.every((v) => Number.isFinite(v)) || !Number.isFinite(paddingFactor)) {
    throw new Error('bounding box and padding must be finite numbers');
  }

  let [x1, y1, x2, y2] = boundingBox;

  // Normalize coords: ensure x1<x2 and y1<y2
  if (x1 > x2) [x1, x2] = [x2, x1];
  if (y1 > y2) [y1, y2] = [y2, y1];

  // Clamp to [0, 1]
  const clamp01 = (v: number) => Math.max(0, Math.min(1, v));
  x1 = clamp01(x1);
  y1 = clamp01(y1);
  x2 = clamp01(x2);
  y2 = clamp01(y2);

  // Add padding, proportional to the box itself, without leaving the image
  const pw = (x2 - x1) * paddingFactor;
  const ph = (y2 - y1) * paddingFactor;
  x1 = Math.max(0, x1 - pw);
  y1 = Math.max(0, y1 - ph);
  x2 = Math.min(1, x2 + pw);
  y2 = Math.min(1, y2 + ph);

  // Validate minimum area
  if (x2 - x1 < 0.005 || y2 - y1 < 0.005) {
    throw new Error('bounding box too small after normalization');
  }

  // Origin and size are rounded independently, so their sum can exceed the
  // image by a pixel (e.g. width 101, box 0.5..1 -> left 51 + width 51).
  // Keep the origin inside the image and cap the size to what remains; also
  // never let a tiny image round down to an empty region.
  const left = Math.min(Math.round(x1 * imageWidth), imageWidth - 1);
  const top = Math.min(Math.round(y1 * imageHeight), imageHeight - 1);
  const width = Math.max(1, Math.min(Math.round((x2 - x1) * imageWidth), imageWidth - left));
  const height = Math.max(1, Math.min(Math.round((y2 - y1) * imageHeight), imageHeight - top));

  return { left, top, width, height };
}
|
|
|
|
|
|
|
2026-04-21 00:20:37 +00:00
|
|
|
|
/**
 * Runs async task factories with at most `limit` of them in flight at once.
 *
 * Results are stored at the index of the task that produced them, so the
 * returned array lines up with `tasks` regardless of completion order.
 *
 * @param tasks - Zero-argument functions that each start one unit of work.
 * @param limit - Maximum number of tasks running at the same time. Values
 *   below 1 (including 0, negatives and NaN) are treated as 1 so that every
 *   task still runs; fractional values are rounded down.
 * @returns The task results in input order.
 * @throws Rejects with the first task rejection. Tasks already started by
 *   other workers are not cancelled.
 */
async function withConcurrency<T>(
  tasks: (() => Promise<T>)[],
  limit: number,
): Promise<T[]> {
  const results: T[] = new Array(tasks.length);
  if (tasks.length === 0) return results;

  // A non-positive limit used to start zero workers, which returned an array
  // of holes without ever invoking a task.
  const workerCount = Math.min(tasks.length, Math.max(1, Math.floor(limit) || 1));

  let next = 0;
  async function worker(): Promise<void> {
    // `next++` happens synchronously between awaits, so each index is claimed
    // by exactly one worker.
    while (next < tasks.length) {
      const i = next++;
      results[i] = await tasks[i]();
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
|
|
|
|
|
|
|
2026-04-25 08:30:01 +00:00
|
|
|
|
// ── Frame quality pre-filtering ──────────────────────────────────────

/** Brightness statistics of one frame, computed on its grayscale pixels. */
interface FrameQuality {
  /** False for near-black, near-white or nearly uniform frames. */
  valid: boolean;
  /** Mean grayscale value, on the 0-255 scale of the raw 8-bit samples. */
  meanBrightness: number;
  /** Population variance of the grayscale values. */
  variance: number;
}

/**
 * Measures mean brightness and brightness variance of an image.
 *
 * The image is decoded with sharp, converted to grayscale and read as raw
 * 8-bit samples. A frame is considered valid when its mean lies strictly
 * between 15 and 240 and its variance exceeds 50.
 *
 * @param imagePath - Image file to analyse.
 * @returns The statistics and the derived validity flag. An image that
 *   decodes to zero pixels is reported as invalid with both statistics at 0.
 * @throws Whatever sharp throws when the file cannot be read or decoded.
 */
async function assessFrameQuality(imagePath: string): Promise<FrameQuality> {
  const sharp = (await import('sharp')).default;
  const { data, info } = await sharp(imagePath)
    .grayscale()
    .raw()
    .toBuffer({ resolveWithObject: true });

  // If the raw output carries more than one sample per pixel (e.g. an alpha
  // plane next to the gray value), only the first sample of each pixel is the
  // brightness; stepping by the channel count keeps the rest out of the stats.
  const stride = Math.max(1, info.channels);
  const pixelCount = Math.floor(data.length / stride);
  if (pixelCount === 0) {
    // Avoid dividing by zero and handing NaN scores to callers that sort on them.
    return { valid: false, meanBrightness: 0, variance: 0 };
  }

  let sum = 0;
  let sumSq = 0;
  const end = pixelCount * stride;
  for (let offset = 0; offset < end; offset += stride) {
    const luma = data[offset];
    sum += luma;
    sumSq += luma * luma;
  }

  const mean = sum / pixelCount;
  // E[x^2] - E[x]^2 can dip a hair below zero through rounding on flat images.
  const variance = Math.max(0, sumSq / pixelCount - mean * mean);

  // Skip near-black, near-white, or very low variance (blurry/blank/transition)
  const valid = mean > 15 && mean < 240 && variance > 50;
  return { valid, meanBrightness: mean, variance };
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Drops frames that `assessFrameQuality` marks as invalid.
 *
 * Every frame is assessed in parallel. A frame whose assessment throws is
 * kept. If no frame at all passes, the original list is returned unchanged
 * so the caller always has something to work with.
 *
 * @param frames - Frames to screen.
 * @returns The frames that passed, in their original order, or `frames`
 *   itself when none passed.
 */
async function filterQualityFrames(frames: ExtractedFrame[]): Promise<ExtractedFrame[]> {
  const passed = await Promise.all(
    frames.map(async (frame) => {
      try {
        const quality = await assessFrameQuality(frame.imagePath);
        return quality.valid;
      } catch {
        // an unreadable frame is given the benefit of the doubt
        return true;
      }
    }),
  );

  const usable = frames.filter((_, position) => passed[position]);
  if (usable.length === 0) {
    return frames;
  }
  return usable;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Checks that a normalized bounding box is usable for cropping.
 *
 * A box `[x1, y1, x2, y2]` is valid when all four coordinates lie in [0, 1],
 * the first corner is strictly above-left of the second (`x1 < x2`,
 * `y1 < y2`), and the enclosed area is strictly greater than `minArea`.
 * NaN coordinates fail the range check and therefore yield `false`.
 *
 * @param bbox - Normalized `[x1, y1, x2, y2]` coordinates.
 * @param minArea - Smallest accepted area as a fraction of the whole image;
 *   defaults to 0.005 (0.5%).
 * @returns Whether the box passes all checks.
 */
function isValidBoundingBox(bbox: [number, number, number, number], minArea = 0.005): boolean {
  const [x1, y1, x2, y2] = bbox;

  const inUnitRange = (v: number) => v >= 0 && v <= 1;
  if (!inUnitRange(x1) || !inUnitRange(y1) || !inUnitRange(x2) || !inUnitRange(y2)) {
    return false;
  }

  if (!(x1 < x2 && y1 < y2)) {
    return false;
  }

  return (x2 - x1) * (y2 - y1) > minArea;
}
|
|
|
|
|
|
|
2026-04-25 07:13:07 +00:00
|
|
|
|
/**
 * Picks one best product frame and always returns a result for non-empty input.
 *
 * Skips the per-frame Pass 1 model filter entirely. Frames are screened
 * locally by brightness statistics, evenly sampled down to `maxCandidates`
 * when there are too many, and then ranked by the vision model in one call.
 * If ranking throws or yields an unusable bounding box, the candidate with
 * the highest brightness variance is returned as a lower-confidence fallback.
 *
 * @param frames - Frames to choose from.
 * @param visionConfig - Settings for the vision model.
 * @param maxCandidates - Upper bound on frames sent to the model. Finite
 *   values below 1 are treated as 1; non-finite values disable sampling.
 * @returns `null` only when `frames` is empty. Otherwise a frame with
 *   confidence 0.95 (model pick) or 0.5 (fallback). `croppedImagePath` is set
 *   only when the crop file was actually written.
 */
export async function detectBestFrame(
  frames: ExtractedFrame[],
  visionConfig: VisionConfig,
  maxCandidates: number = 20,
): Promise<ProductFrame | null> {
  if (frames.length === 0) return null;

  // 1. Filter out obviously bad frames (black, white, blurry)
  let candidates = await filterQualityFrames(frames);

  // 2. Sample if too many. A limit of zero or less would leave nothing to
  //    rank and nothing for the fallback to pick from, so at least one frame
  //    is always kept.
  const limit = Number.isFinite(maxCandidates)
    ? Math.max(1, Math.floor(maxCandidates))
    : candidates.length;
  if (candidates.length > limit) {
    const pool = candidates;
    const step = pool.length / limit;
    candidates = Array.from({ length: limit }, (_, i) => pool[Math.floor(i * step)]);
  }

  const model = createVisionModel(visionConfig);

  // 3. Try Vision ranking with error isolation
  try {
    const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);

    if (isValidBoundingBox(boundingBox)) {
      // Swap whatever extension the frame has for `_cropped.jpg`; for `.jpg`
      // inputs this is the same path as before, and for other inputs it no
      // longer collides with the source file.
      const croppedTarget = `${bestFrame.imagePath.replace(/\.[^./\\]+$/, '')}_cropped.jpg`;

      let croppedImagePath: string | undefined;
      try {
        await cropProduct(bestFrame.imagePath, boundingBox, croppedTarget);
        croppedImagePath = croppedTarget;
      } catch {
        // cropping is optional — keep original frame and report no crop path
      }

      return {
        frameIndex: bestFrame.frameIndex,
        timestampSeconds: bestFrame.timestampSeconds,
        imagePath: bestFrame.imagePath,
        // Only advertise a crop that exists on disk.
        ...(croppedImagePath !== undefined ? { croppedImagePath } : {}),
        confidence: 0.95,
        description,
        boundingHint: reasoning,
      };
    }
    // An unusable bounding box is treated like a failed ranking.
  } catch {
    // Vision ranking failed — fall through to fallback
  }

  // 4. Fallback: rank by frame quality (variance) and return the sharpest
  const scored = await Promise.all(
    candidates.map(async (f) => {
      try {
        const q = await assessFrameQuality(f.imagePath);
        return { frame: f, score: q.variance };
      } catch {
        return { frame: f, score: 0 };
      }
    }),
  );

  // Highest score wins; on ties the earliest candidate is kept. A strict `>`
  // also means a NaN score can never displace a real one.
  const best = scored.reduce((top, entry) => (entry.score > top.score ? entry : top)).frame;

  return {
    frameIndex: best.frameIndex,
    timestampSeconds: best.timestampSeconds,
    imagePath: best.imagePath,
    confidence: 0.5,
    description: 'product frame (auto-selected)',
    boundingHint: 'picked by frame quality analysis (Vision ranking failed)',
  };
}
|
|
|
|
|
|
|
2026-04-19 23:24:28 +00:00
|
|
|
|
/**
 * Two-pass product frame detection using the vision model.
 *
 * Pass 1 asks the model about every frame individually (bounded by
 * `concurrency`) and keeps the ones it flags; a frame whose request fails is
 * discarded. Pass 2 sends all kept frames in a single request and lets the
 * model pick the best one and locate the product.
 *
 * @param frames - Frames to analyse.
 * @param minConfidence - Currently not consulted by this function; results
 *   always carry a fixed confidence of 0.95.
 * @param concurrency - Maximum number of simultaneous Pass 1 requests.
 * @param visionConfig - Settings for the vision model.
 * @returns An array with the single best frame, or an empty array when no
 *   frame survives Pass 1, when ranking fails, or when the returned bounding
 *   box is unusable. `croppedImagePath` is set only when the crop file was
 *   actually written.
 */
export async function detectProductFrames(
  frames: ExtractedFrame[],
  minConfidence: number,
  concurrency: number = 10,
  visionConfig: VisionConfig,
): Promise<ProductFrame[]> {
  const model = createVisionModel(visionConfig);

  // Pass 1: all frames in parallel, bounded by concurrency
  const keepFlags = await withConcurrency(
    frames.map((f) => () => filterFrame(f, model).catch(() => false)),
    concurrency,
  );

  const candidates = frames.filter((_, i) => keepFlags[i]);
  if (candidates.length === 0) return [];

  // Pass 2: single comparative call — model sees all candidates at once
  let bestSnapshot: ProductFrame | undefined;
  try {
    const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);

    if (isValidBoundingBox(boundingBox)) {
      // Swap whatever extension the frame has for `_cropped.jpg`; for `.jpg`
      // inputs this is the same path as before, and for other inputs it no
      // longer collides with the source file.
      const croppedTarget = `${bestFrame.imagePath.replace(/\.[^./\\]+$/, '')}_cropped.jpg`;

      let croppedImagePath: string | undefined;
      try {
        await cropProduct(bestFrame.imagePath, boundingBox, croppedTarget);
        croppedImagePath = croppedTarget;
      } catch {
        // cropping is optional — keep the original frame and report no crop path
      }

      bestSnapshot = {
        frameIndex: bestFrame.frameIndex,
        timestampSeconds: bestFrame.timestampSeconds,
        imagePath: bestFrame.imagePath,
        // Only advertise a crop that exists on disk.
        ...(croppedImagePath !== undefined ? { croppedImagePath } : {}),
        confidence: 0.95,
        description,
        boundingHint: reasoning,
      };
    }
  } catch {
    // ranking failed — reported to the caller as "no product frame"
  }

  return bestSnapshot ? [bestSnapshot] : [];
}
|