feat: add detect-video command using direct video upload + API analysis
register-skill-release / register (push) Successful in 16s Details

- New detect-video / detect-video-and-search commands: upload video to get
  public URL, analyze via LiteLLM (video_url), generate keyword, search 1688
- New src/video-analyzer.ts: upload via direct HTTP (bypasses auth-rt CLI
  arg length limit), analyze via Chat Completions with video_url content
- Frame-based pipeline robustness: quality pre-filtering (skip black/blurry
  frames), bounding box normalization/validation, crop failure tolerance,
  Vision ranking fallback to sharpness-based selection
- Improve ranking prompt: force pick one frame, Chinese description
- Update docs to recommend detect-video-and-search as primary command

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
ywkj 2026-04-25 16:30:01 +08:00
parent 91a623751d
commit db4735e54e
6 changed files with 371 additions and 49 deletions

View File

@ -17,12 +17,30 @@ bun dist/run.js <command> [args] [--dry-run]
| 命令 | 使用场景 |
|------|---------|
| `detect-best-and-search <video>` | **视频输入的默认命令。** 始终找出最佳画面(不管置信度高低),然后搜图。 |
| `detect-best <video>` | 只提取最佳画面,不搜图。 |
| `detect-video-and-search <video>` | **推荐。** 直接上传视频到 API 识别商品主体,然后 1688 关键词搜索。跳过本地抽帧,无需 Vision API。 |
| `detect-best-and-search <video>` | 旧版。抽帧 + Vision 排名 + 搜图。需要 Vision API key。 |
| `detect-video <video>` | 只识别商品描述和生成关键词,不搜图。 |
| `detect-best <video>` | 旧版。只提取最佳画面,不搜图。 |
| `search <image-path>` | 已经有商品截图了,跳过检测直接搜图。 |
| `detect-and-search <video>` | 旧版。过滤可能太严格导致无结果。建议用 `detect-best-and-search`。 |
| `detect-and-search <video>` | 旧版。**不推荐。** |
| `session` | 获取当前认证会话 token。 |
## `detect-video` / `detect-video-and-search`
上传视频到 API 直接识别商品主体,不走本地抽帧。
流程:
1. 上传视频 → 获取公开 URL(复用现有上传接口)
2. 调用 LiteLLM(Chat Completions + `video_url`)分析视频内容
3. 识别商品名称、材质、颜色、功能
4. 生成中文搜索关键词
5. 1688 关键词搜索(`detect-video-and-search`)
依赖:
- `auth-rt` client key(自动,无需额外配置)
- LiteLLM 代理支持 `video_url` 内容类型
- 上传接口返回公开 URL
## `detect-best` / `detect-best-and-search` 选项
| 参数 | 默认值 | 说明 |
@ -75,7 +93,7 @@ CLI 执行完成后,将 `rerank.results` 格式化为 markdown 表格,**每
### 视频命令(慢 — 用 sub-agent 执行)
涉及命令:`detect-best-and-search`、`detect-best`、`detect-and-search`
涉及命令:`detect-video-and-search`、`detect-best-and-search`、`detect-best`、`detect-and-search`、`detect-video`
使用 `sessions_spawn` 创建 sub-agent 执行,**不要直接运行**。
@ -96,6 +114,6 @@ sessions_spawn(
### 通用规则
1. **视频输入 → 始终用 `detect-best-and-search`。** 不要用 `detect-and-search`
1. **视频输入 → 优先用 `detect-video-and-search`。** 比抽帧方案更可靠。如果没配视频模型,降级到 `detect-best-and-search`(不要用 `detect-and-search`)。
2. **不要重试。** 命令失败就直接报错。
3. **信任工具输出。** CLI 内部已处理 session 管理和错误格式化。

View File

@ -43,6 +43,12 @@ function printUsage(): void {
detect-and-search <video-path> [options]
detect-video <video-path>
API
detect-video-and-search <video-path>
1688
rerank --image-results=<json> [--description=<text>] [--keyword=<text>] [--top=<n>]

View File

@ -1,10 +1,11 @@
import * as fs from 'fs';
import * as path from 'path';
import type { Command, DetectOptions, DetectResult, SearchResult, OutputResult, SearchItem } from './types.ts';
import type { Command, DetectOptions, DetectResult, SearchResult, OutputResult, SearchItem, DetectVideoResult } from './types.ts';
import { createSkillClient } from './auth-cli.ts';
import { extractFrames } from './frame-extractor.ts';
import { detectProductFrames, detectBestFrame } from './product-detector.ts';
import { imageToBase64 } from './frame-extractor.ts';
import { uploadVideo, analyzeVideo } from './video-analyzer.ts';
import { generateText } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
@ -43,6 +44,10 @@ export async function run(
return runDetectBest(args, dryRun);
case 'detect-best-and-search':
return runDetectBestAndSearch(args, dryRun);
case 'detect-video':
return runDetectVideo(args, dryRun);
case 'detect-video-and-search':
return runDetectVideoAndSearch(args, dryRun);
case 'rerank':
return runRerank(args, dryRun);
default:
@ -153,7 +158,7 @@ async function runDetectBest(args: string[], dryRun: boolean): Promise<DetectRes
return { status: 'failed', command: 'detect-best', dryRun, videoPath, error: 'no frames extracted from video' };
}
const best = await detectBestFrame(frames, 10, visionConfig);
const best = await detectBestFrame(frames, visionConfig, 20);
return {
status: 'success',
@ -245,6 +250,65 @@ async function runDetectAndSearch(args: string[], dryRun: boolean): Promise<Outp
} as any;
}
/**
 * `detect-video`: upload a video, have the LLM describe the main product,
 * and derive a Chinese search keyword. No local frame extraction involved.
 */
async function runDetectVideo(args: string[], dryRun: boolean): Promise<DetectVideoResult> {
  const [videoPath] = args;
  if (!videoPath) {
    return { status: 'failed', command: 'detect-video', dryRun, error: 'detect-video requires <video-path>' };
  }
  if (!fs.existsSync(videoPath)) {
    return { status: 'failed', command: 'detect-video', dryRun, error: `video not found: ${videoPath}` };
  }
  if (dryRun) {
    return { status: 'success', command: 'detect-video', dryRun, videoPath };
  }

  const visionConfig = await loadVisionConfig(createSkillClient());

  // Step 1: obtain a public URL for the video (direct HTTP upload).
  const videoUrl = await uploadVideo(videoPath);
  // Step 2: let the model describe the main product shown in the video.
  const { description } = await analyzeVideo(videoUrl, visionConfig);
  // Step 3: condense the description into a Chinese search keyword.
  const keyword = await generateChineseKeyword(description, visionConfig);

  return {
    status: 'success',
    command: 'detect-video',
    dryRun,
    videoPath,
    videoUrl,
    description,
    keyword,
  };
}
/**
 * `detect-video-and-search`: run `detect-video`, then search 1688 with the
 * generated keyword. Hits are ordered by sales, descending. No rerank step —
 * image-based rerank does not apply to a text (keyword) search.
 */
async function runDetectVideoAndSearch(args: string[], dryRun: boolean): Promise<DetectVideoResult> {
  const result = await runDetectVideo(args, dryRun) as DetectVideoResult;
  if (result.status === 'failed') return result;
  if (dryRun) return { ...result, command: 'detect-video-and-search' };

  const client = createSkillClient();

  // Keyword may be absent (analysis produced nothing usable); in that case
  // report success with an empty result list rather than failing.
  let searchResults: SearchItem[] = [];
  if (result.keyword) {
    try {
      const items = await keywordSearch(client, result.keyword);
      // Sort a copy by sales descending — don't mutate the fetched array.
      searchResults = [...items].sort((a, b) => (b.sales ?? 0) - (a.sales ?? 0));
    } catch (e: unknown) {
      // Narrow the unknown before reading .message.
      const message = e instanceof Error ? e.message : String(e);
      return { ...result, command: 'detect-video-and-search', status: 'failed', error: `keyword search failed: ${message}` };
    }
  }

  return {
    ...result,
    command: 'detect-video-and-search',
    searchResults,
  };
}
function parseDetectOptions(videoPath: string, args: string[]): DetectOptions {
const outputDir = getFlag(args, '--output-dir') || path.join(
path.dirname(videoPath),

View File

@ -28,21 +28,23 @@ Discard (keep=false) if: only hands/texture/contents visible, motion blur, black
reason options: product_visible | content_only | hands_only | blur | transition | background_only`;
const RANKING_PROMPT = (count: number) => `You are selecting the single best product image from ${count} video frames for ecommerce image search.
const RANKING_PROMPT = (count: number) => `You are selecting the single best product frame from ${count} video frames for ecommerce search.
The frames are numbered 0 to ${count - 1} in the order shown.
Frames are numbered 0 to ${count - 1} in order shown.
Pick the ONE frame where the HERO PRODUCT is:
1. Cleanest fewest distractions, no hands blocking it, no clutter in foreground
2. Most complete full product silhouette visible, no edges cropped
3. Most isolated product stands out from background clearly
4. Empty/minimal load preferred a product without contents (e.g. an empty rack) beats one stuffed with items if both show the full structure equally
IMPORTANT: You MUST pick ONE frame even if product visibility is imperfect or no frame looks ideal. Always make your best guess.
Pick the frame where the MAIN SELLING PRODUCT is:
1. Most recognizable clearest view of the item being sold
2. Most complete full product silhouette visible, not cropped at edges
3. Cleanest minimal obstruction (hands, clutter, motion blur, labels)
4. Best lit and in focus
Return:
- bestFrameIndex: 0-based index of chosen frame
- description: concise search query under 12 words (product type + material + color + key feature)
- reasoning: one sentence explaining why this frame was chosen
- boundingBox: tight bounding box of the HERO PRODUCT ONLY in the chosen frame as [x1, y1, x2, y2] normalized 0.01.0 (top-left origin). Exclude hands, background, and unrelated objects. The product is assumed to be near the center.`;
- bestFrameIndex: 0-based index
- description: concise search query under 12 words (product type + material + color + key features), in Chinese
- reasoning: one sentence explaining choice
- boundingBox: tight box of the PRODUCT ONLY as [x1, y1, x2, y2] normalized 0.01.0, top-left origin. Exclude hands, background, and unrelated objects. The product is near the center of the frame.`;
function createVisionModel(config: VisionConfig) {
const provider = createOpenAI({ apiKey: config.apiKey, baseURL: config.baseURL });
@ -114,7 +116,17 @@ export async function cropProduct(
let [x1, y1, x2, y2] = boundingBox;
// add padding
// Normalize coords: ensure x1<x2 and y1<y2
if (x1 > x2) [x1, x2] = [x2, x1];
if (y1 > y2) [y1, y2] = [y2, y1];
// Clamp to [0, 1]
x1 = Math.max(0, Math.min(1, x1));
y1 = Math.max(0, Math.min(1, y1));
x2 = Math.max(0, Math.min(1, x2));
y2 = Math.max(0, Math.min(1, y2));
// Add padding
const pw = (x2 - x1) * paddingFactor;
const ph = (y2 - y1) * paddingFactor;
x1 = Math.max(0, x1 - pw);
@ -122,6 +134,11 @@ export async function cropProduct(
x2 = Math.min(1, x2 + pw);
y2 = Math.min(1, y2 + ph);
// Validate minimum area
if (x2 - x1 < 0.005 || y2 - y1 < 0.005) {
throw new Error('bounding box too small after normalization');
}
const left = Math.round(x1 * W);
const top = Math.round(y1 * H);
const width = Math.round((x2 - x1) * W);
@ -151,39 +168,132 @@ async function withConcurrency<T>(
return results;
}
// ── Frame quality pre-filtering ──────────────────────────────────────
interface FrameQuality {
valid: boolean;
meanBrightness: number;
variance: number;
}
/**
 * Compute cheap grayscale luminance statistics for a frame and decide whether
 * it is worth sending to the Vision model. Near-black, near-white, or flat
 * (low-variance) frames are typically blank, blurry, or scene transitions.
 */
async function assessFrameQuality(imagePath: string): Promise<FrameQuality> {
  const sharp = (await import('sharp')).default;
  const { data } = await sharp(imagePath)
    .grayscale()
    .raw()
    .toBuffer({ resolveWithObject: true });

  const px = new Uint8Array(data);
  let total = 0;
  let totalSq = 0;
  for (const v of px) {
    total += v;
    totalSq += v * v;
  }
  const mean = total / px.length;
  const variance = totalSq / px.length - mean * mean;

  // Heuristic thresholds: brightness must be inside (15, 240) and variance
  // above 50 for the frame to count as usable.
  const valid = mean > 15 && mean < 240 && variance > 50;
  return { valid, meanBrightness: mean, variance };
}
/**
 * Drop frames that fail the quality heuristics. Assessment errors count as
 * "keep" (best effort), and if every frame would be dropped we return the
 * original set rather than an empty list.
 */
async function filterQualityFrames(frames: ExtractedFrame[]): Promise<ExtractedFrame[]> {
  const keepFlags = await Promise.all(
    frames.map(frame =>
      assessFrameQuality(frame.imagePath).then(
        q => q.valid,
        () => true, // assessment failed — keep the frame
      ),
    ),
  );
  const kept = frames.filter((_, i) => keepFlags[i]);
  return kept.length > 0 ? kept : frames;
}
/**
 * A bounding box [x1, y1, x2, y2] (normalized, top-left origin) is usable
 * when every coordinate lies in [0, 1], the corners are properly ordered,
 * and the enclosed area exceeds 0.5% of the frame.
 */
function isValidBoundingBox(bbox: [number, number, number, number]): boolean {
  const [x1, y1, x2, y2] = bbox;
  const inUnitRange = bbox.every(v => v >= 0 && v <= 1);
  const ordered = x1 < x2 && y1 < y2;
  const bigEnough = (x2 - x1) * (y2 - y1) > 0.005;
  return inUnitRange && ordered && bigEnough;
}
// Skips Pass 1 filter entirely — ranks all frames and always returns the best one.
// Evenly samples down to maxCandidates when there are too many frames.
export async function detectBestFrame(
frames: ExtractedFrame[],
concurrency: number = 10,
visionConfig: VisionConfig,
maxCandidates: number = 20,
): Promise<ProductFrame | null> {
if (frames.length === 0) return null;
const model = createVisionModel(visionConfig);
// 1. Filter out obviously bad frames (black, white, blurry)
let candidates = await filterQualityFrames(frames);
let candidates = frames;
if (frames.length > maxCandidates) {
const step = frames.length / maxCandidates;
candidates = Array.from({ length: maxCandidates }, (_, i) => frames[Math.floor(i * step)]);
// 2. Sample if too many
if (candidates.length > maxCandidates) {
const step = candidates.length / maxCandidates;
candidates = Array.from({ length: maxCandidates }, (_, i) => candidates[Math.floor(i * step)]);
}
const model = createVisionModel(visionConfig);
// 3. Try Vision ranking with error isolation
try {
const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
if (isValidBoundingBox(boundingBox)) {
const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg');
try {
await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
} catch {
// cropping is optional — keep original frame
}
return {
frameIndex: bestFrame.frameIndex,
timestampSeconds: bestFrame.timestampSeconds,
imagePath: bestFrame.imagePath,
croppedImagePath: croppedPath,
...(croppedPath ? { croppedImagePath: croppedPath } : {}),
confidence: 0.95,
description,
boundingHint: reasoning,
};
}
} catch {
// Vision ranking failed — fall through to fallback
}
// 4. Fallback: rank by frame quality (variance) and return the sharpest
const withQuality = await Promise.all(
candidates.map(async (f) => {
try {
const q = await assessFrameQuality(f.imagePath);
return { frame: f, score: q.variance };
} catch {
return { frame: f, score: 0 };
}
}),
);
withQuality.sort((a, b) => b.score - a.score);
const best = withQuality[0].frame;
return {
frameIndex: best.frameIndex,
timestampSeconds: best.timestampSeconds,
imagePath: best.imagePath,
confidence: 0.5,
description: 'product frame (auto-selected)',
boundingHint: 'picked by frame quality analysis (Vision ranking failed)',
};
}
export async function detectProductFrames(
frames: ExtractedFrame[],
@ -203,18 +313,32 @@ export async function detectProductFrames(
if (candidates.length === 0) return [];
// Pass 2: single comparative call — model sees all candidates at once
let bestSnapshot: ProductFrame | undefined;
try {
const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
if (isValidBoundingBox(boundingBox)) {
const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg');
try {
await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
return [{
} catch {}
bestSnapshot = {
frameIndex: bestFrame.frameIndex,
timestampSeconds: bestFrame.timestampSeconds,
imagePath: bestFrame.imagePath,
croppedImagePath: croppedPath,
...(croppedPath ? { croppedImagePath: croppedPath } : {}),
confidence: 0.95,
description,
boundingHint: reasoning,
}];
};
}
} catch {
// ranking failed
}
if (!bestSnapshot) {
return [];
}
return [bestSnapshot];
}

View File

@ -1,4 +1,4 @@
export type Command = 'detect' | 'search' | 'detect-and-search' | 'detect-best' | 'detect-best-and-search' | 'rerank' | 'session';
export type Command = 'detect' | 'search' | 'detect-and-search' | 'detect-best' | 'detect-best-and-search' | 'detect-video' | 'detect-video-and-search' | 'rerank' | 'session';
export interface SearchItem {
num_iid: number;
@ -51,4 +51,17 @@ export interface SearchResult {
error?: string;
}
export type OutputResult = DetectResult | SearchResult;
// Result payload for `detect-video` and `detect-video-and-search`.
// Optional fields are populated only on the paths that produce them
// (e.g. `searchResults` only after the keyword-search step).
export interface DetectVideoResult {
  // 'failed' is accompanied by `error`; 'success' by the data fields below.
  status: 'success' | 'failed';
  command: Command;
  dryRun: boolean;
  // Local path of the input video.
  videoPath?: string;
  // Public URL returned by the upload endpoint.
  videoUrl?: string;
  // LLM-generated product description (prompted for Chinese).
  description?: string;
  // Chinese search keyword derived from the description.
  keyword?: string;
  // 1688 keyword-search hits, sorted by sales descending.
  searchResults?: SearchItem[];
  // NOTE(review): not populated by the video pipeline — presumably reserved
  // for parity with other result shapes; confirm before relying on it.
  rerank?: unknown;
  error?: string;
}
export type OutputResult = DetectResult | SearchResult | DetectVideoResult;

97
src/video-analyzer.ts Normal file
View File

@ -0,0 +1,97 @@
import * as fs from 'fs';
import type { VisionConfig } from './index.ts';
import { createSkillClient } from './auth-cli.ts';
const UPLOAD_ENDPOINT =
process.env.ONEBOUND_UPLOAD_ENDPOINT ||
'http://localhost:3202/api/v1/tasks/upload-image';
/**
* Upload a video file to get a public URL.
*
* Uses direct HTTP fetch (not auth-rt CLI) to avoid E2BIG errors
* when the base64-encoded video exceeds the command-line argument limit.
*/
/**
 * Upload a video file and return its public URL.
 *
 * Uses direct HTTP fetch (not the auth-rt CLI) to avoid E2BIG errors when
 * the base64-encoded video exceeds the OS command-line argument limit.
 *
 * @param videoPath local path of the video to upload
 * @returns the public URL reported by the upload endpoint
 * @throws when the endpoint responds non-2xx or omits `url`
 */
export async function uploadVideo(videoPath: string): Promise<string> {
  const client = createSkillClient();
  const { accessToken } = await client.session();

  const videoBuffer = fs.readFileSync(videoPath);
  // Derive the extension case-insensitively so `.MOV` / `.MP4` map to the
  // correct MIME type instead of e.g. `video/MOV`.
  const ext = (videoPath.match(/\.(\w+)$/)?.[1] || 'mp4').toLowerCase();
  const filename = `video-${Date.now()}.${ext}`;
  // The MIME subtype usually matches the extension; QuickTime is the exception.
  const contentType = ext === 'mov' ? 'video/quicktime' : `video/${ext}`;

  const response = await fetch(UPLOAD_ENDPOINT, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${accessToken}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      data: videoBuffer.toString('base64'),
      filename,
      contentType,
    }),
  });

  if (!response.ok) {
    // Truncate the error body so huge HTML error pages don't flood logs.
    const errBody = await response.text().catch(() => 'unknown');
    throw new Error(`Video upload failed (${response.status}): ${errBody.slice(0, 300)}`);
  }

  const json = (await response.json()) as { url?: string };
  if (!json.url) throw new Error('Upload response missing url');
  return json.url;
}
export interface VideoAnalysis {
description: string;
rawResponse?: string;
}
/**
 * Ask the LLM to describe the main product shown in an uploaded video.
 *
 * Calls the Chat Completions endpoint directly with fetch (not the ai SDK)
 * because the `video_url` content part is a LiteLLM extension.
 *
 * @param videoUrl public URL of the uploaded video
 * @param config   vision model endpoint + credentials
 * @throws on non-2xx responses or an empty/non-text completion
 */
export async function analyzeVideo(
  videoUrl: string,
  config: VisionConfig,
): Promise<VideoAnalysis> {
  // The same config feeds createOpenAI elsewhere, whose baseURL convention
  // already ends in `/v1` — appending `/v1/chat/completions` blindly would
  // produce `/v1/v1/...`. Only add `/v1` when it is not already present.
  const base = config.baseURL.replace(/\/+$/, '');
  const endpoint = base.endsWith('/v1')
    ? `${base}/chat/completions`
    : `${base}/v1/chat/completions`;

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${config.apiKey}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: config.model,
      messages: [
        {
          role: 'user',
          content: [
            {
              type: 'video_url',
              video_url: { url: videoUrl },
            },
            {
              type: 'text',
              text: '找出视频中的商品主体,用中文简要描述商品名称、材质、颜色、功能。',
            },
          ],
        },
      ],
      max_tokens: 500,
    }),
  });

  if (!response.ok) {
    const errBody = await response.text().catch(() => 'unknown');
    throw new Error(
      `Video analysis API error (${response.status}): ${errBody.slice(0, 500)}`,
    );
  }

  const json = (await response.json()) as any;
  const content = json?.choices?.[0]?.message?.content;
  // Some proxies return structured (array) content; accept plain text only
  // so `.trim()` below cannot throw on a non-string.
  if (typeof content !== 'string' || content.trim() === '') {
    throw new Error('Video analysis returned empty response');
  }
  return { description: content.trim(), rawResponse: JSON.stringify(json) };
}