// video-product-finder/src/product-detector.ts

import { generateObject } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { z } from 'zod';
import type { ExtractedFrame } from './frame-extractor.ts';
import type { ProductFrame } from './types.ts';
import { imageToBase64 } from './frame-extractor.ts';
import type { VisionConfig } from './index.ts';

// Pass 1 (quick filter): per-frame verdict used to drop frames that clearly
// show no product. `reason` is a closed set of labels explaining the verdict.
const FILTER_REASONS = [
  'product_visible',
  'content_only',
  'hands_only',
  'blur',
  'transition',
  'background_only',
] as const;

const FilterSchema = z.object({
  keep: z.boolean(),
  reason: z.enum(FILTER_REASONS),
});

// Pass 2: comparative ranking across all candidates.
// Shape of the structured answer requested from the model in rankCandidates.
const RankingSchema = z.object({
  // 0-based position within the candidate list; only integer-ness is validated
  // here — rankCandidates clamps the value into the valid index range.
  bestFrameIndex: z.number().int(),
  // Search query describing the product (the ranking prompt asks for Chinese text).
  description: z.string(),
  // Short justification for the chosen frame.
  reasoning: z.string(),
  // normalized 0-1 relative to image dimensions: [x1, y1, x2, y2]
  // The 0-1 range is NOT enforced by the schema; callers check it with
  // isValidBoundingBox before cropping.
  boundingBox: z.tuple([z.number(), z.number(), z.number(), z.number()]),
});

// Instruction text sent alongside each single frame in filterFrame (Pass 1).
// The reason options listed on the last line mirror the enum in FilterSchema
// and must be kept in sync with it.
const FILTER_PROMPT = `You are filtering frames from a TikTok/Douyin ecommerce product video.

Keep a frame (keep=true) ONLY if the HERO PRODUCT (the main item being sold) is at least partially visible as a recognizable object.
Discard (keep=false) if: only hands/texture/contents visible, motion blur, black/transition frame, or no product at all.

reason options: product_visible | content_only | hands_only | blur | transition | background_only`;

// Builds the Pass 2 instruction text for a batch of `count` frames.
// The frames are attached before this text in rankCandidates, in the same
// order the 0-based numbering below refers to. The requested fields mirror
// RankingSchema (bestFrameIndex, description, reasoning, boundingBox).
const RANKING_PROMPT = (count: number) => `You are selecting the single best product frame from ${count} video frames for ecommerce search.

Frames are numbered 0 to ${count - 1} in order shown.

IMPORTANT: You MUST pick ONE frame — even if product visibility is imperfect or no frame looks ideal. Always make your best guess.

Pick the frame where the MAIN SELLING PRODUCT is:
1. Most recognizable — clearest view of the item being sold
2. Most complete — full product silhouette visible, not cropped at edges
3. Cleanest — minimal obstruction (hands, clutter, motion blur, labels)
4. Best lit and in focus

Return:
- bestFrameIndex: 0-based index
- description: concise search query under 12 words (product type + material + color + key features), in Chinese
- reasoning: one sentence explaining choice
- boundingBox: tight box of the PRODUCT ONLY as [x1, y1, x2, y2] normalized 0.0–1.0, top-left origin. Exclude hands, background, and unrelated objects. The product is near the center of the frame.`;

/**
 * Builds the vision model handle used by both passes.
 *
 * @param config - API key, base URL and model id for the OpenAI-compatible endpoint.
 * @returns The model object produced by the configured provider.
 */
function createVisionModel(config: VisionConfig) {
  const { apiKey, baseURL, model: modelId } = config;
  const openai = createOpenAI({ apiKey, baseURL });
  return openai(modelId);
}

/**
 * Pass 1: asks the vision model whether a single frame shows the hero product.
 *
 * @param frame - Frame whose image file is sent to the model.
 * @param model - Vision model handle from createVisionModel.
 * @returns The model's `keep` verdict for this frame.
 */
async function filterFrame(
  frame: ExtractedFrame,
  model: ReturnType<ReturnType<typeof createOpenAI>>,
): Promise<boolean> {
  // The frame is inlined into the request as a JPEG data URL.
  const dataUrl = `data:image/jpeg;base64,${imageToBase64(frame.imagePath)}`;

  const verdict = await generateObject({
    model,
    schema: FilterSchema,
    messages: [
      {
        role: 'user',
        content: [
          { type: 'image', image: dataUrl },
          { type: 'text', text: FILTER_PROMPT },
        ],
      },
    ],
  });

  return verdict.object.keep;
}

/**
 * Pass 2: sends every candidate frame to the vision model in one request and
 * asks it to pick the single best product frame.
 *
 * @param candidates - Frames to compare; must contain at least one frame.
 * @param model - Vision model handle from createVisionModel.
 * @returns The chosen frame plus the model's description, reasoning and
 *   bounding box. The bounding box is returned as-is (not range-checked).
 * @throws Error if `candidates` is empty, or whatever generateObject throws
 *   when the request or schema validation fails.
 */
async function rankCandidates(
  candidates: ExtractedFrame[],
  model: ReturnType<ReturnType<typeof createOpenAI>>,
): Promise<{ bestFrame: ExtractedFrame; description: string; reasoning: string; boundingBox: [number, number, number, number] }> {
  // Fail fast: with no candidates the index clamp below would yield 0 and
  // `bestFrame` would be undefined despite the declared return type.
  if (candidates.length === 0) {
    throw new Error('rankCandidates requires at least one candidate frame');
  }

  const imageContent = candidates.map((f) => ({
    type: 'image' as const,
    image: `data:image/jpeg;base64,${imageToBase64(f.imagePath)}`,
  }));

  const { object } = await generateObject({
    model,
    schema: RankingSchema,
    mode: 'json',
    messages: [{
      role: 'user',
      content: [
        ...imageContent,
        { type: 'text', text: RANKING_PROMPT(candidates.length) },
      ],
    }],
  });

  // The schema only guarantees an integer, so clamp into [0, length - 1].
  const idx = Math.max(0, Math.min(object.bestFrameIndex, candidates.length - 1));
  const bestFrame = candidates[idx];
  if (!bestFrame) {
    throw new Error(`model returned an unusable frame index: ${object.bestFrameIndex}`);
  }

  return {
    bestFrame,
    description: object.description,
    reasoning: object.reasoning,
    boundingBox: object.boundingBox,
  };
}

/**
 * Crops the product region out of an image and writes it as a JPEG.
 *
 * The box is given in normalized coordinates ([x1, y1, x2, y2], 0–1, top-left
 * origin). Swapped corners are reordered, coordinates are clamped to the image
 * and the box is grown by `paddingFactor` of its own size on every side.
 *
 * @param imagePath - Source image to crop.
 * @param boundingBox - Normalized [x1, y1, x2, y2] box of the product.
 * @param outputPath - Destination file for the cropped JPEG.
 * @param paddingFactor - Fraction of the box width/height added on each side.
 * @returns `outputPath`, once the file has been written.
 * @throws Error if a coordinate or the padding is not a finite number, if the
 *   image dimensions cannot be read, or if the box is narrower/shorter than
 *   0.5% of the image after normalization. Errors from sharp propagate as-is.
 */
export async function cropProduct(
  imagePath: string,
  boundingBox: [number, number, number, number],
  outputPath: string,
  paddingFactor = 0.05,
): Promise<string> {
  // NaN would slip through every comparison below and only fail inside sharp
  // with an unrelated message, so reject it up front.
  if (![...boundingBox, paddingFactor].every((value) => Number.isFinite(value))) {
    throw new Error('bounding box and padding must be finite numbers');
  }

  const sharp = (await import('sharp')).default;
  const { width: imageWidth, height: imageHeight } = await sharp(imagePath).metadata();
  if (!imageWidth || !imageHeight) {
    throw new Error(`unable to read image dimensions: ${imagePath}`);
  }

  const clamp01 = (value: number): number => Math.max(0, Math.min(1, value));
  const [ax, ay, bx, by] = boundingBox;

  // Normalize coords: ensure x1<=x2 and y1<=y2, then clamp to [0, 1]
  let x1 = clamp01(Math.min(ax, bx));
  let y1 = clamp01(Math.min(ay, by));
  let x2 = clamp01(Math.max(ax, bx));
  let y2 = clamp01(Math.max(ay, by));

  // Add padding proportional to the box size, staying inside the image
  const pw = (x2 - x1) * paddingFactor;
  const ph = (y2 - y1) * paddingFactor;
  x1 = Math.max(0, x1 - pw);
  y1 = Math.max(0, y1 - ph);
  x2 = Math.min(1, x2 + pw);
  y2 = Math.min(1, y2 + ph);

  // Validate minimum size
  if (x2 - x1 < 0.005 || y2 - y1 < 0.005) {
    throw new Error('bounding box too small after normalization');
  }

  // Convert to pixels by rounding the edges outward and deriving the size from
  // them. Rounding offset and size independently can make left + width exceed
  // the image width by one pixel, which sharp rejects as a bad extract area.
  const left = Math.min(imageWidth - 1, Math.floor(x1 * imageWidth));
  const top = Math.min(imageHeight - 1, Math.floor(y1 * imageHeight));
  const right = Math.min(imageWidth, Math.max(left + 1, Math.ceil(x2 * imageWidth)));
  const bottom = Math.min(imageHeight, Math.max(top + 1, Math.ceil(y2 * imageHeight)));

  await sharp(imagePath)
    .extract({ left, top, width: right - left, height: bottom - top })
    .jpeg({ quality: 95 })
    .toFile(outputPath);

  return outputPath;
}

/**
 * Runs async task factories with at most `limit` of them in flight at once.
 *
 * @param tasks - Zero-argument functions, each starting one unit of work.
 * @param limit - Maximum number of tasks running concurrently. Values below 1
 *   (and NaN) are treated as 1 so that every task still runs.
 * @returns The task results, in the same order as `tasks`.
 * @throws Rejects with the first task error; no further tasks are started
 *   after a failure (tasks already in flight are not cancelled).
 */
async function withConcurrency<T>(
  tasks: (() => Promise<T>)[],
  limit: number,
): Promise<T[]> {
  const results = new Array<T>(tasks.length);

  // A limit of 0, a negative number or NaN would otherwise spawn no workers
  // and silently resolve to an array of holes.
  const requested = Number.isNaN(limit) ? 1 : Math.max(1, Math.floor(limit));
  const workerCount = Math.min(requested, tasks.length);

  let next = 0;
  let failed = false;

  async function worker(): Promise<void> {
    while (!failed && next < tasks.length) {
      // Claiming the index is synchronous, so two workers never share a task.
      const i = next++;
      try {
        results[i] = await tasks[i]();
      } catch (error) {
        failed = true;
        throw error;
      }
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}

// ── Frame quality pre-filtering ──────────────────────────────────────

/** Cheap, model-free quality statistics for one frame (see assessFrameQuality). */
interface FrameQuality {
  // True when the frame passes the brightness and variance thresholds.
  valid: boolean;
  // Mean greyscale pixel value of the frame.
  meanBrightness: number;
  // Variance of the greyscale pixel values; also used as the fallback ranking score.
  variance: number;
}

/**
 * Computes brightness statistics for a frame to weed out unusable images
 * without calling the vision model.
 *
 * @param imagePath - Image file to analyse.
 * @returns Mean and variance of the greyscale pixel values, plus `valid`,
 *   which is false for near-black, near-white or very flat frames. An image
 *   that decodes to zero pixels is reported as invalid with zeroed statistics.
 * @throws Whatever sharp throws if the file cannot be read or decoded.
 */
async function assessFrameQuality(imagePath: string): Promise<FrameQuality> {
  const sharp = (await import('sharp')).default;
  const { data, info } = await sharp(imagePath)
    .grayscale()
    .raw()
    .toBuffer({ resolveWithObject: true });

  // Read only the first channel of each pixel so that any extra channel in the
  // raw output (e.g. alpha) cannot skew the statistics. The buffer is indexed
  // directly — copying it into a new Uint8Array is unnecessary.
  const stride = Math.max(1, info.channels);
  const sampleCount = Math.floor(data.length / stride);
  if (sampleCount === 0) {
    return { valid: false, meanBrightness: 0, variance: 0 };
  }

  let sum = 0;
  let sumSq = 0;
  for (let sample = 0; sample < sampleCount; sample++) {
    const value = data[sample * stride];
    sum += value;
    sumSq += value * value;
  }
  const mean = sum / sampleCount;
  // E[x^2] - E[x]^2; clamp the tiny negative values floating-point can produce.
  const variance = Math.max(0, sumSq / sampleCount - mean * mean);

  // Skip near-black, near-white, or very low variance (blank/flat/transition)
  const valid = mean > 15 && mean < 240 && variance > 50;
  return { valid, meanBrightness: mean, variance };
}

/**
 * Drops frames that fail the cheap quality check.
 *
 * A frame whose quality cannot be assessed is kept, and if every frame would
 * be dropped the original list is returned unchanged.
 *
 * @param frames - Frames to screen.
 * @returns The frames judged usable, in their original order.
 */
async function filterQualityFrames(frames: ExtractedFrame[]): Promise<ExtractedFrame[]> {
  const verdicts = await Promise.all(
    frames.map(async (candidate) => {
      try {
        const quality = await assessFrameQuality(candidate.imagePath);
        return quality.valid;
      } catch {
        // Unreadable frame: give it the benefit of the doubt.
        return true;
      }
    }),
  );

  const usable = frames.filter((_, position) => verdicts[position]);
  if (usable.length === 0) {
    return frames;
  }
  return usable;
}

/**
 * Checks that a normalized [x1, y1, x2, y2] box is usable for cropping.
 *
 * @param bbox - Box corners, expected in the 0–1 range.
 * @returns True when every coordinate lies in [0, 1], the corners are strictly
 *   ordered (x1 < x2, y1 < y2) and the box covers more than 0.5% of the image.
 */
function isValidBoundingBox(bbox: [number, number, number, number]): boolean {
  const withinUnitRange = bbox.every((coord) => coord >= 0 && coord <= 1);
  if (!withinUnitRange) {
    return false;
  }

  const [left, top, right, bottom] = bbox;
  if (!(left < right) || !(top < bottom)) {
    return false;
  }

  const area = (right - left) * (bottom - top);
  return area > 0.005;
}

/**
 * Picks the single best product frame without the per-frame Pass 1 filter.
 *
 * Frames are screened with the cheap quality check, evenly sampled down to
 * `maxCandidates`, and ranked by the vision model in one request. If ranking
 * fails or returns an unusable bounding box, the frame with the highest
 * greyscale variance is returned instead (confidence 0.5, no crop).
 *
 * @param frames - Extracted video frames.
 * @param visionConfig - Vision model connection settings.
 * @param maxCandidates - Upper bound on frames sent to the model; values below
 *   1 are treated as 1.
 * @returns The chosen frame, or null when `frames` is empty.
 *   `croppedImagePath` is set only when the crop file was actually written.
 */
export async function detectBestFrame(
  frames: ExtractedFrame[],
  visionConfig: VisionConfig,
  maxCandidates: number = 20,
): Promise<ProductFrame | null> {
  if (frames.length === 0) return null;

  // 1. Filter out obviously bad frames (black, white, flat)
  let candidates = await filterQualityFrames(frames);

  // 2. Sample if too many. A non-positive limit would leave no candidates and
  //    crash the fallback below, so at least one frame is always kept.
  const limit = Number.isFinite(maxCandidates)
    ? Math.max(1, Math.floor(maxCandidates))
    : candidates.length;
  if (candidates.length > limit) {
    const pool = candidates;
    const step = pool.length / limit;
    candidates = Array.from({ length: limit }, (_, i) => pool[Math.floor(i * step)]);
  }

  const model = createVisionModel(visionConfig);

  // 3. Try Vision ranking with error isolation
  try {
    const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);

    if (isValidBoundingBox(boundingBox)) {
      // Strip whatever extension the frame has so the crop never targets the
      // source file itself ("a.jpg" -> "a_cropped.jpg", "a.png" -> "a_cropped.jpg").
      const croppedPath = `${bestFrame.imagePath.replace(/\.[^./\\]+$/, '')}_cropped.jpg`;
      let croppedImagePath: string | undefined;
      try {
        croppedImagePath = await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
      } catch (error) {
        // cropping is optional — keep original frame, but do not advertise a
        // crop file that was never written
        console.warn(
          `detectBestFrame: cropping failed for ${bestFrame.imagePath}: ${error instanceof Error ? error.message : String(error)}`,
        );
      }
      return {
        frameIndex: bestFrame.frameIndex,
        timestampSeconds: bestFrame.timestampSeconds,
        imagePath: bestFrame.imagePath,
        ...(croppedImagePath ? { croppedImagePath } : {}),
        confidence: 0.95,
        description,
        boundingHint: reasoning,
      };
    }
    // Unusable bounding box: treated like a failed ranking, fall through.
  } catch (error) {
    // Vision ranking failed — fall through to fallback
    console.warn(
      `detectBestFrame: vision ranking failed, using quality fallback: ${error instanceof Error ? error.message : String(error)}`,
    );
  }

  // 4. Fallback: rank by frame quality (variance) and return the highest-scoring frame
  const scored = await Promise.all(
    candidates.map(async (f) => {
      try {
        const q = await assessFrameQuality(f.imagePath);
        return { frame: f, score: q.variance };
      } catch {
        return { frame: f, score: 0 };
      }
    }),
  );

  // First frame wins ties, matching a stable descending sort.
  let top = scored[0];
  for (const entry of scored) {
    if (top === undefined || entry.score > top.score) top = entry;
  }
  if (top === undefined) return null;
  const best = top.frame;

  return {
    frameIndex: best.frameIndex,
    timestampSeconds: best.timestampSeconds,
    imagePath: best.imagePath,
    confidence: 0.5,
    description: 'product frame (auto-selected)',
    boundingHint: 'picked by frame quality analysis (Vision ranking failed)',
  };
}

/**
 * Two-pass product detection: filter every frame individually, then let the
 * vision model pick the best of the survivors in one comparative request.
 *
 * @param frames - Extracted video frames.
 * @param minConfidence - NOTE(review): currently unused; the only result is
 *   emitted with a fixed confidence of 0.95. Confirm whether it should gate the result.
 * @param concurrency - Maximum parallel Pass 1 requests; values below 1 are
 *   treated as 1.
 * @param visionConfig - Vision model connection settings.
 * @returns An array with the single best frame, or an empty array when no
 *   frame passes Pass 1, ranking fails, or the bounding box is unusable.
 *   `croppedImagePath` is set only when the crop file was actually written.
 */
export async function detectProductFrames(
  frames: ExtractedFrame[],
  minConfidence: number,
  concurrency: number = 10,
  visionConfig: VisionConfig,
): Promise<ProductFrame[]> {
  const model = createVisionModel(visionConfig);

  // A non-positive worker count would run no Pass 1 task at all and silently
  // discard every frame.
  const workers = Number.isNaN(concurrency) ? 1 : Math.max(1, Math.floor(concurrency));

  // Pass 1: all frames in parallel, bounded by concurrency.
  // A frame whose filter request fails is treated as "discard".
  const keepFlags = await withConcurrency(
    frames.map((f) => () => filterFrame(f, model).catch(() => false)),
    workers,
  );

  const candidates = frames.filter((_, i) => keepFlags[i]);
  if (candidates.length === 0) return [];

  // Pass 2: single comparative call — model sees all candidates at once
  try {
    const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);

    if (!isValidBoundingBox(boundingBox)) {
      return [];
    }

    // Strip whatever extension the frame has so the crop never targets the
    // source file itself ("a.jpg" -> "a_cropped.jpg", "a.png" -> "a_cropped.jpg").
    const croppedPath = `${bestFrame.imagePath.replace(/\.[^./\\]+$/, '')}_cropped.jpg`;
    let croppedImagePath: string | undefined;
    try {
      croppedImagePath = await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
    } catch (error) {
      // Cropping is best-effort: keep the original frame, but do not advertise
      // a crop file that was never written.
      console.warn(
        `detectProductFrames: cropping failed for ${bestFrame.imagePath}: ${error instanceof Error ? error.message : String(error)}`,
      );
    }

    return [
      {
        frameIndex: bestFrame.frameIndex,
        timestampSeconds: bestFrame.timestampSeconds,
        imagePath: bestFrame.imagePath,
        ...(croppedImagePath ? { croppedImagePath } : {}),
        confidence: 0.95,
        description,
        boundingHint: reasoning,
      },
    ];
  } catch (error) {
    // ranking failed
    console.warn(
      `detectProductFrames: vision ranking failed: ${error instanceof Error ? error.message : String(error)}`,
    );
    return [];
  }
}