feat: add detect-video command using direct video upload + API analysis
- New detect-video / detect-video-and-search commands: upload video to get public URL, analyze via LiteLLM (video_url), generate keyword, search 1688
- New src/video-analyzer.ts: upload via direct HTTP (bypasses auth-rt CLI arg length limit), analyze via Chat Completions with video_url content
- Frame-based pipeline robustness: quality pre-filtering (skip black/blurry frames), bounding box normalization/validation, crop failure tolerance, Vision ranking fallback to sharpness-based selection
- Improve ranking prompt: force pick one frame, Chinese description
- Update docs to recommend detect-video-and-search as primary command

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Parent: 91a623751d · Commit: db4735e54e

SKILL.md (28 lines changed)
@@ -17,12 +17,30 @@ bun dist/run.js <command> [args] [--dry-run]
 | Command | Use case |
 |------|---------|
-| `detect-best-and-search <video>` | **Default command for video input.** Always picks the best frame (regardless of confidence), then searches by image. |
-| `detect-best <video>` | Extracts the best frame only; no search. |
+| `detect-video-and-search <video>` | **Recommended.** Uploads the video directly to the API to identify the main product, then runs a 1688 keyword search. Skips local frame extraction; no Vision API needed. |
+| `detect-best-and-search <video>` | Legacy. Frame extraction + Vision ranking + image search. Requires a Vision API key. |
+| `detect-video <video>` | Identifies the product description and generates a keyword only; no search. |
+| `detect-best <video>` | Legacy. Extracts the best frame only; no search. |
 | `search <image-path>` | You already have a product screenshot; skip detection and search by image directly. |
-| `detect-and-search <video>` | Legacy. Filtering may be too strict and return no results. Prefer `detect-best-and-search`. |
+| `detect-and-search <video>` | Legacy. **Not recommended.** |
 | `session` | Get the current auth session token. |
+
+## `detect-video` / `detect-video-and-search`
+
+Uploads the video to the API to identify the main product directly, without local frame extraction.
+
+Flow:
+1. Upload the video → get a public URL (reuses the existing upload endpoint)
+2. Call LiteLLM (Chat Completions + `video_url`) to analyze the video content
+3. Identify the product name, material, color, and function
+4. Generate a Chinese search keyword
+5. Run a 1688 keyword search (`detect-video-and-search`)
+
+Dependencies:
+- `auth-rt` client key (automatic; no extra configuration)
+- A LiteLLM proxy that supports the `video_url` content type
+- An upload endpoint that returns a public URL
 
 ## `detect-best` / `detect-best-and-search` options
 
 | Option | Default | Description |
@@ -75,7 +93,7 @@ After the CLI finishes, format `rerank.results` as a markdown table, **each
 
 ### Video commands (slow; run via a sub-agent)
 
-Commands involved: `detect-best-and-search`, `detect-best`, `detect-and-search`
+Commands involved: `detect-video-and-search`, `detect-best-and-search`, `detect-best`, `detect-and-search`, `detect-video`
 
 Use `sessions_spawn` to run these in a sub-agent; **do not run them directly**.
 
@@ -96,6 +114,6 @@ sessions_spawn(
 
 ### General rules
 
-1. **Video input → always use `detect-best-and-search`.** Do not use `detect-and-search`.
+1. **Video input → prefer `detect-video-and-search`.** It is more reliable than the frame-extraction pipeline. If no video model is configured, fall back to `detect-best-and-search`. Do not use `detect-and-search`.
 2. **Do not retry.** If a command fails, just report the error.
 3. **Trust the tool output.** The CLI already handles session management and error formatting.
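For orientation, steps 1-3 of the documented flow map directly onto this commit's exports. A minimal sketch, not part of the commit: `describeProduct` is a hypothetical helper composing the real `uploadVideo` / `analyzeVideo` from src/video-analyzer.ts, while keyword generation and the 1688 search (steps 4-5) stay inside `runDetectVideoAndSearch` in src/index.ts.

```ts
// Sketch only: steps 1-3 of the documented flow, using this commit's exports.
// VisionConfig is the type that video-analyzer.ts imports from src/index.ts.
import { uploadVideo, analyzeVideo } from './src/video-analyzer.ts';
import type { VisionConfig } from './src/index.ts';

async function describeProduct(videoPath: string, config: VisionConfig) {
  const videoUrl = await uploadVideo(videoPath);                // 1. upload → public URL
  const { description } = await analyzeVideo(videoUrl, config); // 2-3. analyze + describe
  return { videoUrl, description };                             // 4-5 happen in runDetectVideoAndSearch
}
```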
@@ -43,6 +43,12 @@ function printUsage(): void {
   detect-and-search <video-path> [options]
     Detect the best product frame → image search → keyword rerank
 
+  detect-video <video-path>
+    Upload the video directly to the API to identify the main product; outputs a product description and search keyword
+
+  detect-video-and-search <video-path>
+    Upload the video to identify the product → 1688 keyword search → rerank
+
   rerank --image-results=<json> [--description=<text>] [--keyword=<text>] [--top=<n>]
     Filter search results by keyword intersection/union
src/index.ts (68 lines changed)
@@ -1,10 +1,11 @@
 import * as fs from 'fs';
 import * as path from 'path';
-import type { Command, DetectOptions, DetectResult, SearchResult, OutputResult, SearchItem } from './types.ts';
+import type { Command, DetectOptions, DetectResult, SearchResult, OutputResult, SearchItem, DetectVideoResult } from './types.ts';
 import { createSkillClient } from './auth-cli.ts';
 import { extractFrames } from './frame-extractor.ts';
 import { detectProductFrames, detectBestFrame } from './product-detector.ts';
 import { imageToBase64 } from './frame-extractor.ts';
+import { uploadVideo, analyzeVideo } from './video-analyzer.ts';
 import { generateText } from 'ai';
 import { createOpenAI } from '@ai-sdk/openai';
 
@@ -43,6 +44,10 @@ export async function run(
       return runDetectBest(args, dryRun);
     case 'detect-best-and-search':
       return runDetectBestAndSearch(args, dryRun);
+    case 'detect-video':
+      return runDetectVideo(args, dryRun);
+    case 'detect-video-and-search':
+      return runDetectVideoAndSearch(args, dryRun);
     case 'rerank':
       return runRerank(args, dryRun);
     default:
 
@@ -153,7 +158,7 @@ async function runDetectBest(args: string[], dryRun: boolean): Promise<DetectRes
     return { status: 'failed', command: 'detect-best', dryRun, videoPath, error: 'no frames extracted from video' };
   }
 
-  const best = await detectBestFrame(frames, 10, visionConfig);
+  const best = await detectBestFrame(frames, visionConfig, 20);
 
   return {
     status: 'success',
 
@@ -245,6 +250,65 @@ async function runDetectAndSearch(args: string[], dryRun: boolean): Promise<Outp
   } as any;
 }
 
+async function runDetectVideo(args: string[], dryRun: boolean): Promise<DetectVideoResult> {
+  const videoPath = args[0];
+  if (!videoPath) return { status: 'failed', command: 'detect-video', dryRun, error: 'detect-video requires <video-path>' };
+  if (!fs.existsSync(videoPath)) return { status: 'failed', command: 'detect-video', dryRun, error: `video not found: ${videoPath}` };
+
+  if (dryRun) {
+    return { status: 'success', command: 'detect-video', dryRun, videoPath };
+  }
+
+  const client = createSkillClient();
+  const visionConfig = await loadVisionConfig(client);
+
+  // 1. Upload video to get public URL
+  const videoUrl = await uploadVideo(videoPath);
+
+  // 2. Analyze video via LLM
+  const { description } = await analyzeVideo(videoUrl, visionConfig);
+
+  // 3. Generate Chinese search keyword
+  const keyword = await generateChineseKeyword(description, visionConfig);
+
+  return {
+    status: 'success',
+    command: 'detect-video',
+    dryRun,
+    videoPath,
+    videoUrl,
+    description,
+    keyword,
+  };
+}
+
+async function runDetectVideoAndSearch(args: string[], dryRun: boolean): Promise<DetectVideoResult> {
+  const result = await runDetectVideo(args, dryRun) as DetectVideoResult;
+  if (result.status === 'failed') return result;
+
+  if (dryRun) return { ...result, command: 'detect-video-and-search' };
+
+  const client = createSkillClient();
+
+  // Search 1688 with keyword directly (no rerank — image-based rerank doesn't apply to text search)
+  let searchResults: SearchItem[] = [];
+  if (result.keyword) {
+    try {
+      const items = await keywordSearch(client, result.keyword);
+      // Sort by sales descending
+      searchResults = items.sort((a, b) => (b.sales ?? 0) - (a.sales ?? 0));
+    } catch (e: any) {
+      return { ...result, command: 'detect-video-and-search', status: 'failed', error: `keyword search failed: ${e.message}` };
+    }
+  }
+
+  return {
+    ...result,
+    command: 'detect-video-and-search',
+    searchResults,
+  };
+}
+
 function parseDetectOptions(videoPath: string, args: string[]): DetectOptions {
   const outputDir = getFlag(args, '--output-dir') || path.join(
     path.dirname(videoPath),
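The keyword-search branch above orders results by sales with a nullish fallback. A tiny self-contained illustration of that comparator (sample data invented):

```ts
type Item = { num_iid: number; sales?: number };

const items: Item[] = [
  { num_iid: 1, sales: 120 },
  { num_iid: 2 },               // missing sales counts as 0
  { num_iid: 3, sales: 4500 },
];

// Same comparator as runDetectVideoAndSearch: sales descending.
const sorted = [...items].sort((a, b) => (b.sales ?? 0) - (a.sales ?? 0));
console.log(sorted.map(i => i.num_iid)); // [3, 1, 2]
```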
src/product-detector.ts

@@ -28,21 +28,23 @@ Discard (keep=false) if: only hands/texture/contents visible, motion blur, black
 
 reason options: product_visible | content_only | hands_only | blur | transition | background_only`;
 
-const RANKING_PROMPT = (count: number) => `You are selecting the single best product image from ${count} video frames for ecommerce image search.
+const RANKING_PROMPT = (count: number) => `You are selecting the single best product frame from ${count} video frames for ecommerce search.
 
-The frames are numbered 0 to ${count - 1} in the order shown.
+Frames are numbered 0 to ${count - 1} in order shown.
 
-Pick the ONE frame where the HERO PRODUCT is:
-1. Cleanest — fewest distractions, no hands blocking it, no clutter in foreground
-2. Most complete — full product silhouette visible, no edges cropped
-3. Most isolated — product stands out from background clearly
-4. Empty/minimal load preferred — a product without contents (e.g. an empty rack) beats one stuffed with items if both show the full structure equally
+IMPORTANT: You MUST pick ONE frame — even if product visibility is imperfect or no frame looks ideal. Always make your best guess.
+
+Pick the frame where the MAIN SELLING PRODUCT is:
+1. Most recognizable — clearest view of the item being sold
+2. Most complete — full product silhouette visible, not cropped at edges
+3. Cleanest — minimal obstruction (hands, clutter, motion blur, labels)
+4. Best lit and in focus
 
 Return:
-- bestFrameIndex: 0-based index of chosen frame
-- description: concise search query under 12 words (product type + material + color + key feature)
-- reasoning: one sentence explaining why this frame was chosen
-- boundingBox: tight bounding box of the HERO PRODUCT ONLY in the chosen frame as [x1, y1, x2, y2] normalized 0.0–1.0 (top-left origin). Exclude hands, background, and unrelated objects. The product is assumed to be near the center.`;
+- bestFrameIndex: 0-based index
+- description: concise search query under 12 words (product type + material + color + key features), in Chinese
+- reasoning: one sentence explaining choice
+- boundingBox: tight box of the PRODUCT ONLY as [x1, y1, x2, y2] normalized 0.0–1.0, top-left origin. Exclude hands, background, and unrelated objects. The product is near the center of the frame.`;
 
 function createVisionModel(config: VisionConfig) {
   const provider = createOpenAI({ apiKey: config.apiKey, baseURL: config.baseURL });
@@ -114,7 +116,17 @@ export async function cropProduct(
 
   let [x1, y1, x2, y2] = boundingBox;
 
-  // add padding
+  // Normalize coords: ensure x1<x2 and y1<y2
+  if (x1 > x2) [x1, x2] = [x2, x1];
+  if (y1 > y2) [y1, y2] = [y2, y1];
+
+  // Clamp to [0, 1]
+  x1 = Math.max(0, Math.min(1, x1));
+  y1 = Math.max(0, Math.min(1, y1));
+  x2 = Math.max(0, Math.min(1, x2));
+  y2 = Math.max(0, Math.min(1, y2));
+
+  // Add padding
   const pw = (x2 - x1) * paddingFactor;
   const ph = (y2 - y1) * paddingFactor;
   x1 = Math.max(0, x1 - pw);
 
@@ -122,6 +134,11 @@ export async function cropProduct(
   x2 = Math.min(1, x2 + pw);
   y2 = Math.min(1, y2 + ph);
 
+  // Validate minimum area
+  if (x2 - x1 < 0.005 || y2 - y1 < 0.005) {
+    throw new Error('bounding box too small after normalization');
+  }
+
   const left = Math.round(x1 * W);
   const top = Math.round(y1 * H);
   const width = Math.round((x2 - x1) * W);
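A worked example of the new normalization, with invented coordinates:

```ts
// Hypothetical model output: y corners swapped and out of range.
let [x1, y1, x2, y2] = [0.82, 1.05, 0.31, 0.40];

if (x1 > x2) [x1, x2] = [x2, x1]; // x: [0.31, 0.82]
if (y1 > y2) [y1, y2] = [y2, y1]; // y: [0.40, 1.05]

y2 = Math.max(0, Math.min(1, y2)); // clamp: y2 = 1.0
// Box is now [0.31, 0.40, 0.82, 1.00]: ordered, in range, and wide/tall
// enough to pass the 0.005 minimum-size check before padding and crop.
```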
@@ -151,38 +168,131 @@ async function withConcurrency<T>(
   return results;
 }
 
+// ── Frame quality pre-filtering ──────────────────────────────────────
+
+interface FrameQuality {
+  valid: boolean;
+  meanBrightness: number;
+  variance: number;
+}
+
+async function assessFrameQuality(imagePath: string): Promise<FrameQuality> {
+  const sharp = (await import('sharp')).default;
+  const { data, info } = await sharp(imagePath)
+    .grayscale()
+    .raw()
+    .toBuffer({ resolveWithObject: true });
+
+  const pixels = new Uint8Array(data);
+  let sum = 0;
+  let sumSq = 0;
+  for (let i = 0; i < pixels.length; i++) {
+    sum += pixels[i];
+    sumSq += pixels[i] * pixels[i];
+  }
+  const mean = sum / pixels.length;
+  const variance = sumSq / pixels.length - mean * mean;
+
+  // Skip near-black, near-white, or very low variance (blurry/blank/transition)
+  const valid = mean > 15 && mean < 240 && variance > 50;
+  return { valid, meanBrightness: mean, variance };
+}
+
+async function filterQualityFrames(frames: ExtractedFrame[]): Promise<ExtractedFrame[]> {
+  const results = await Promise.all(
+    frames.map(async (frame) => {
+      try {
+        const q = await assessFrameQuality(frame.imagePath);
+        return { frame, valid: q.valid };
+      } catch {
+        return { frame, valid: true };
+      }
+    }),
+  );
+  const valid = results.filter(r => r.valid).map(r => r.frame);
+  return valid.length > 0 ? valid : frames;
+}
+
+function isValidBoundingBox(bbox: [number, number, number, number]): boolean {
+  const [x1, y1, x2, y2] = bbox;
+  return (
+    x1 >= 0 && x1 <= 1 &&
+    y1 >= 0 && y1 <= 1 &&
+    x2 >= 0 && x2 <= 1 &&
+    y2 >= 0 && y2 <= 1 &&
+    x1 < x2 &&
+    y1 < y2 &&
+    (x2 - x1) * (y2 - y1) > 0.005
+  );
+}
+
 // Skips Pass 1 filter entirely — ranks all frames and always returns the best one.
 // Evenly samples down to maxCandidates when there are too many frames.
 export async function detectBestFrame(
   frames: ExtractedFrame[],
-  concurrency: number = 10,
   visionConfig: VisionConfig,
   maxCandidates: number = 20,
 ): Promise<ProductFrame | null> {
   if (frames.length === 0) return null;
 
-  const model = createVisionModel(visionConfig);
+  // 1. Filter out obviously bad frames (black, white, blurry)
+  let candidates = await filterQualityFrames(frames);
 
-  let candidates = frames;
-  if (frames.length > maxCandidates) {
-    const step = frames.length / maxCandidates;
-    candidates = Array.from({ length: maxCandidates }, (_, i) => frames[Math.floor(i * step)]);
+  // 2. Sample if too many
+  if (candidates.length > maxCandidates) {
+    const step = candidates.length / maxCandidates;
+    candidates = Array.from({ length: maxCandidates }, (_, i) => candidates[Math.floor(i * step)]);
   }
 
+  const model = createVisionModel(visionConfig);
+
+  // 3. Try Vision ranking with error isolation
+  try {
     const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
 
+    if (isValidBoundingBox(boundingBox)) {
       const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg');
+      try {
         await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
+      } catch {
+        // cropping is optional — keep original frame
+      }
       return {
         frameIndex: bestFrame.frameIndex,
         timestampSeconds: bestFrame.timestampSeconds,
         imagePath: bestFrame.imagePath,
-        croppedImagePath: croppedPath,
+        ...(croppedPath ? { croppedImagePath: croppedPath } : {}),
         confidence: 0.95,
         description,
         boundingHint: reasoning,
       };
+    }
+  } catch {
+    // Vision ranking failed — fall through to fallback
+  }
+
+  // 4. Fallback: rank by frame quality (variance) and return the sharpest
+  const withQuality = await Promise.all(
+    candidates.map(async (f) => {
+      try {
+        const q = await assessFrameQuality(f.imagePath);
+        return { frame: f, score: q.variance };
+      } catch {
+        return { frame: f, score: 0 };
+      }
+    }),
+  );
+  withQuality.sort((a, b) => b.score - a.score);
+  const best = withQuality[0].frame;
+
+  return {
+    frameIndex: best.frameIndex,
+    timestampSeconds: best.timestampSeconds,
+    imagePath: best.imagePath,
+    confidence: 0.5,
+    description: 'product frame (auto-selected)',
+    boundingHint: 'picked by frame quality analysis (Vision ranking failed)',
+  };
 }
 
 export async function detectProductFrames(
 
@@ -203,18 +313,32 @@ export async function detectProductFrames(
   if (candidates.length === 0) return [];
 
   // Pass 2: single comparative call — model sees all candidates at once
+  let bestSnapshot: ProductFrame | undefined;
+  try {
     const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
 
+    if (isValidBoundingBox(boundingBox)) {
       const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg');
+      try {
         await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
+      } catch {}
-  return [{
+      bestSnapshot = {
         frameIndex: bestFrame.frameIndex,
         timestampSeconds: bestFrame.timestampSeconds,
         imagePath: bestFrame.imagePath,
-    croppedImagePath: croppedPath,
+        ...(croppedPath ? { croppedImagePath: croppedPath } : {}),
         confidence: 0.95,
         description,
         boundingHint: reasoning,
-  }];
+      };
+    }
+  } catch {
+    // ranking failed
+  }
+
+  if (!bestSnapshot) {
+    return [];
+  }
+
+  return [bestSnapshot];
 }
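The quality gate uses grayscale mean and variance as a cheap blank-or-blurry detector. The same arithmetic in isolation (pixel data invented):

```ts
// Same statistics as assessFrameQuality, on raw grayscale bytes.
function stats(pixels: Uint8Array) {
  let sum = 0, sumSq = 0;
  for (let i = 0; i < pixels.length; i++) {
    sum += pixels[i];
    sumSq += pixels[i] * pixels[i];
  }
  const mean = sum / pixels.length;
  return { mean, variance: sumSq / pixels.length - mean * mean };
}

// Near-black frame: mean 2, variance 0 → rejected (mean <= 15).
console.log(stats(new Uint8Array(100).fill(2)));
// High-contrast frame: mean 127.5, variance 16256.25 → passes
// (15 < mean < 240 and variance > 50).
console.log(stats(new Uint8Array([0, 255, 0, 255])));
```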
src/types.ts (17 lines changed)
@@ -1,4 +1,4 @@
-export type Command = 'detect' | 'search' | 'detect-and-search' | 'detect-best' | 'detect-best-and-search' | 'rerank' | 'session';
+export type Command = 'detect' | 'search' | 'detect-and-search' | 'detect-best' | 'detect-best-and-search' | 'detect-video' | 'detect-video-and-search' | 'rerank' | 'session';
 
 export interface SearchItem {
   num_iid: number;
 
@@ -51,4 +51,17 @@ export interface SearchResult {
   error?: string;
 }
 
-export type OutputResult = DetectResult | SearchResult;
+export interface DetectVideoResult {
+  status: 'success' | 'failed';
+  command: Command;
+  dryRun: boolean;
+  videoPath?: string;
+  videoUrl?: string;
+  description?: string;
+  keyword?: string;
+  searchResults?: SearchItem[];
+  rerank?: unknown;
+  error?: string;
+}
+
+export type OutputResult = DetectResult | SearchResult | DetectVideoResult;
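A sketch of how a consumer might handle the new result type; it assumes the CLI's JSON output has already been parsed into a `DetectVideoResult` (`summarize` is a hypothetical helper):

```ts
import type { DetectVideoResult } from './src/types.ts';

function summarize(r: DetectVideoResult): string {
  if (r.status === 'failed') return `[${r.command}] error: ${r.error}`;
  return [
    `keyword: ${r.keyword ?? '(none)'}`,
    `description: ${r.description ?? '(none)'}`,
    `results: ${r.searchResults?.length ?? 0}`,
  ].join('\n');
}
```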
src/video-analyzer.ts (new file)

@@ -0,0 +1,97 @@
+import * as fs from 'fs';
+import type { VisionConfig } from './index.ts';
+import { createSkillClient } from './auth-cli.ts';
+
+const UPLOAD_ENDPOINT =
+  process.env.ONEBOUND_UPLOAD_ENDPOINT ||
+  'http://localhost:3202/api/v1/tasks/upload-image';
+
+/**
+ * Upload a video file to get a public URL.
+ *
+ * Uses direct HTTP fetch (not auth-rt CLI) to avoid E2BIG errors
+ * when the base64-encoded video exceeds the command-line argument limit.
+ */
+export async function uploadVideo(videoPath: string): Promise<string> {
+  const client = createSkillClient();
+  const { accessToken } = await client.session();
+
+  const videoBuffer = fs.readFileSync(videoPath);
+  const ext = videoPath.match(/\.(\w+)$/)?.[1] || 'mp4';
+  const filename = `video-${Date.now()}.${ext}`;
+  const contentType = ext === 'mov' ? 'video/quicktime' : `video/${ext}`;
+
+  const response = await fetch(UPLOAD_ENDPOINT, {
+    method: 'POST',
+    headers: {
+      Authorization: `Bearer ${accessToken}`,
+      'Content-Type': 'application/json',
+    },
+    body: JSON.stringify({
+      data: videoBuffer.toString('base64'),
+      filename,
+      contentType,
+    }),
+  });
+
+  if (!response.ok) {
+    const errBody = await response.text().catch(() => 'unknown');
+    throw new Error(`Video upload failed (${response.status}): ${errBody.slice(0, 300)}`);
+  }
+
+  const json = (await response.json()) as { url?: string };
+  if (!json.url) throw new Error('Upload response missing url');
+  return json.url;
+}
+
+export interface VideoAnalysis {
+  description: string;
+  rawResponse?: string;
+}
+
+export async function analyzeVideo(
+  videoUrl: string,
+  config: VisionConfig,
+): Promise<VideoAnalysis> {
+  const response = await fetch(`${config.baseURL}/v1/chat/completions`, {
+    method: 'POST',
+    headers: {
+      Authorization: `Bearer ${config.apiKey}`,
+      'Content-Type': 'application/json',
+    },
+    body: JSON.stringify({
+      model: config.model,
+      messages: [
+        {
+          role: 'user',
+          content: [
+            {
+              type: 'video_url',
+              video_url: { url: videoUrl },
+            },
+            {
+              type: 'text',
+              // "Identify the main product in the video; briefly describe its name, material, color, and function in Chinese."
+              text: '找出视频中的商品主体,用中文简要描述商品名称、材质、颜色、功能。',
+            },
+          ],
+        },
+      ],
+      max_tokens: 500,
+    }),
+  });
+
+  if (!response.ok) {
+    const errBody = await response.text().catch(() => 'unknown');
+    throw new Error(
+      `Video analysis API error (${response.status}): ${errBody.slice(0, 500)}`,
+    );
+  }
+
+  const json = (await response.json()) as any;
+  const content = json?.choices?.[0]?.message?.content;
+  if (!content) {
+    throw new Error('Video analysis returned empty response');
+  }
+
+  return { description: content.trim(), rawResponse: JSON.stringify(json) };
+}
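On the direct-HTTP choice: the doc comment in `uploadVideo` cites E2BIG, and back-of-envelope arithmetic supports it, since base64 encodes 3 bytes into 4 characters, even a short clip overflows typical argv limits. A sketch with invented sizes:

```ts
// Why uploadVideo uses fetch instead of passing base64 through a CLI arg:
// base64 grows the payload by roughly one third.
const videoBytes = 10 * 1024 * 1024;               // hypothetical 10 MiB clip
const base64Chars = Math.ceil(videoBytes / 3) * 4; // 13_981_016 chars
// Linux ARG_MAX is commonly on the order of 2 MiB, so execve() would
// fail with E2BIG long before an argument this size reaches the CLI.
console.log(base64Chars > 2 * 1024 * 1024); // true
```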