From 6bc4e1d3b4082499db3b6f4c9019e79291cab156 Mon Sep 17 00:00:00 2001
From: ywkj <tabvim.dev@gmail.com>
Date: Sun, 26 Apr 2026 15:01:42 +0800
Subject: [PATCH] feat: image-only pipeline with LLM post-filter for category
 accuracy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

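- Drop video-understanding flow (video upload + video_url analysis in
  video-analyzer.ts) — image search is the only path now since text/video
  keywords return broad results. detect-video / detect-video-and-search stay
  as commands but now run the frame-based image pipeline.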
- Add container-aware frame selection: detect rack/holder products, restrict
  ranking to the earliest 40% of frames so empty/unboxing shots win over
  loaded ones (image search was matching shoes-on-rack instead of the rack).
- Switch container check from generateObject (silently fails on this model)
  to generateText with a YES/NO answer.
- Add post-filter step: send the snapshot + each result's pic_url to the
  vision model in batches, drop results whose category doesn't match the
  detected product description. Cuts 50 raw hits to ~10 same-type matches.
- When post-filter succeeds, sort by sales directly instead of running the
  keyword-intersection rerank, which was overriding good filtered results
  with broad keyword fallbacks.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 SKILL.md                |  88 ++++++++--------
 scripts/run.ts          |  10 +-
 src/index.ts            | 219 +++++++++++++++++++++++++++-------------
 src/post-filter.ts      | 106 +++++++++++++++++++
 src/product-detector.ts |  90 +++++++++++++++--
 src/types.ts            |  50 ++++++---
 src/video-analyzer.ts   |  97 ------------------
 7 files changed, 426 insertions(+), 234 deletions(-)
 create mode 100644 src/post-filter.ts
 delete mode 100644 src/video-analyzer.ts
diff --git a/SKILL.md b/SKILL.md
index 43595b1..30315a1 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -1,11 +1,11 @@
 ---
 name: video-product-snapshot
-description: "Upload video to API for product analysis and 1688 keyword search. / 上传视频直接识别商品并在1688搜索同款。当用户提供视频想找商品时使用。"
+description: "Extract product snapshot from video and search 1688 by image. / 从视频中提取最佳商品帧，以图搜图在1688找同款。当用户提供视频想找商品时使用。"
 ---
 
-# Video Product Snapshot — 视频商品截图
+# Video Product Snapshot — 视频商品以图搜图
 
-上传视频到 API，由多模态模型识别商品主体，生成中文关键词在 1688 上搜索找到同款商品。
+从视频中截取最清晰的商品帧（容器类产品自动选空载帧），上传图片在 1688 以图搜图找同款。
 
 ## 运行
 
@@ -17,49 +17,61 @@ bun dist/run.js <command> [args] [--dry-run]
 
 | 命令 | 使用场景 |
 |------|---------|
-| `detect-video-and-search <video>` | **推荐。** 上传视频到 API 识别商品，然后 1688 关键词搜索。 |
-| `detect-video <video>` | 只识别商品描述和生成关键词，不搜图。 |
-| `search <image-path>` | 已经有商品截图了，跳过检测直接搜图。 |
+| `detect-best-and-search <video>` | **推荐。** 提取最佳商品帧 → 图搜 → rerank 返回结果。 |
+| `detect-best <video>` | 只提取最佳商品帧，不搜图。 |
+| `detect-and-search <video>` | 两阶段过滤后图搜（比 detect-best 慢）。 |
+| `search <image-path>` | 已有商品图，直接图搜。 |
+| `rerank` | 用关键词对图搜结果交叉过滤。 |
 | `session` | 获取当前认证会话 token。 |
 
-## `detect-video` / `detect-video-and-search`
-
-上传视频到 API 直接识别商品主体。
+## 主命令：`detect-best-and-search`
 
 流程：
-1. 上传视频 → 获取公开 URL（复用现有上传接口）
-2. 调用 LiteLLM（Chat Completions + `video_url`）分析视频内容
-3. 识别商品名称、材质、颜色、功能
-4. 生成中文搜索关键词
-5. 1688 关键词搜索（`detect-video-and-search`）
+1. ffmpeg 按 0.5s 间隔提取帧（最多 60 帧）
+2. 视觉模型检测是否为容器/架子类产品
+3. 容器类：只从前 40% 帧（空载阶段）中选最佳帧
+4. 非容器类：全帧中选最清晰帧
+5. 裁剪商品区域
+6. 上传裁剪图 → 1688 图搜
+7. 后置过滤：视觉模型剔除与截图不同类的结果；过滤成功则按销量排序，否则回退到 rerank（图搜结果与关键词搜索结果交叉过滤）
 
-依赖：
-- `auth-rt` client key（自动，无需额外配置）
-- LiteLLM 代理支持 `video_url` 内容类型
-- 上传接口返回公开 URL
+## Options for `detect-best` / `detect-best-and-search`
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--interval=<sec>` | `0.5` | 帧采样间隔（秒） |
+| `--max-frames=<n>` | `60` | 最大分析帧数 |
+| `--output-dir=<dir>` | 视频同目录 | 截图保存目录 |
 
 ## 输出格式
 
-### `detect-video-and-search`
+### `detect-best-and-search`
 
 ```json
 {
-  "videoUrl": "https://...",
-  "description": "白色帆布收纳盒，带提手，可折叠",
-  "keyword": "帆布收纳盒",
-  "searchResults": [
-    { "num_iid": 123, "title": "...", "price": "15.00", "promotion_price": "12.00", "sales": 500, "detail_url": "..." }
-  ]
+  "bestSnapshot": {
+    "frameIndex": 7,
+    "timestampSeconds": 3,
+    "imagePath": "/path/to/frame_0007.jpg",
+    "croppedImagePath": "/path/to/frame_0007_cropped.jpg",
+    "description": "黑色金属床底鞋架 可折叠移动"
+  },
+  "rerank": {
+    "keyword": "床底鞋架",
+    "results": [
+      { "num_iid": 123, "title": "...", "price": "44.00", "sales": 87, "detail_url": "..." }
+    ]
+  }
 }
 ```
 
 ## 结果展示格式
 
-将 `searchResults` 格式化为 markdown 表格，**每页 5 行**（不足 5 行则全显示）：
+将 `rerank.results`（优先）或 `searchBody.data.items.item` 格式化为 markdown 表格，**每页 5 行**：
 
 | # | 商品名称 | 价格 | 销量 | 链接 |
 |---|----------|------|------|------|
-| 1 | {title}  | ¥{promotion_price \|\| price} | {sales ?? —}件 | [查看](https://detail.1688.com/offer/{num_iid}.html) |
+| 1 | {title}  | ¥{promotion_price \|\| price} | {sales ?? —}件 | [查看]({detail_url}) |
 
 - 有 `promotion_price` 用促销价，否则用原价
 - `sales` 缺失或为零时显示 `—`
@@ -67,28 +79,16 @@ bun dist/run.js <command> [args] [--dry-run]
 
 ## 执行规则
 
-### 视频命令（慢 — 用 sub-agent 执行）
+1. **不要重试。** 命令失败就直接报错。
+2. **信任工具输出。** CLI 内部已处理 session 管理和错误格式化。
+3. **慢命令用 sub-agent**（detect-best-and-search / detect-and-search），直接运行会超时。
 
-涉及命令：`detect-video-and-search`、`detect-video`
-
-使用 `sessions_spawn` 创建 sub-agent 执行，**不要直接运行**。
+### sub-agent 执行模板
 
 ```
 sessions_spawn(
-  task: "Run this command and return the raw JSON output:\n\nbun dist/run.js <完整命令>\n\nCopy the entire JSON output as your reply.",
+  task: "Run this command and return the raw JSON output:\n\ncd /path/to/skill && bun dist/run.js detect-best-and-search <video-path>\n\nCopy the entire JSON output as your reply.",
   label: "video-product-snapshot",
   runTimeoutSeconds: 300,
 )
 ```
-
-- 通知用户处理已开始，告知 `runId`
-- 等待 sub-agent 返回结果，然后解析并展示
-
-### `search` 和 `session`（快 — 直接运行）
-
-直接在本会话中运行，不需要 sub-agent。
-
-### 通用规则
-
-1. **不要重试。** 命令失败就直接报错。
-2. **信任工具输出。** CLI 内部已处理 session 管理和错误格式化。
diff --git a/scripts/run.ts b/scripts/run.ts
index 105a00d..6042442 100644
--- a/scripts/run.ts
+++ b/scripts/run.ts
@@ -43,11 +43,17 @@ function printUsage(): void {
   detect-and-search <video-path> [options]
       检测最佳商品画面 → 图片搜索 → 关键词重排序
 
+  detect-best <video-path> [options]
+      从视频抽帧并选择最佳商品画面（更快更稳定）
+
+  detect-best-and-search <video-path> [options]
+      最佳画面 → 图片搜索 → 关键词重排序
+
   detect-video <video-path>
-      直接上传视频到 API 识别商品主体，输出商品描述和搜索关键词
+      识别商品描述和搜索关键词（当前实现：从视频抽帧选最佳帧）
 
   detect-video-and-search <video-path>
-      上传视频识别商品 → 1688 关键词搜索 → 重排序
+      识别商品 → 图片搜索 → 1688 关键词重排序（当前实现：从视频抽帧选最佳帧）
 
   rerank --image-results=<json> [--description=<text>] [--keyword=<text>] [--top=<n>]
       通过关键词交并集过滤搜索结果
diff --git a/src/index.ts b/src/index.ts
index 4374991..c1de23e 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,11 +1,10 @@
 import * as fs from 'fs';
 import * as path from 'path';
-import type { Command, DetectOptions, DetectResult, SearchResult, OutputResult, SearchItem, DetectVideoResult } from './types.ts';
+import type { Command, DetectOptions, DetectResult, SearchResult, OutputResult, SearchItem, DetectVideoResult, DetectVideoAndSearchResult } from './types.ts';
 import { createSkillClient } from './auth-cli.ts';
 import { extractFrames } from './frame-extractor.ts';
 import { detectProductFrames, detectBestFrame } from './product-detector.ts';
-import { imageToBase64 } from './frame-extractor.ts';
-import { uploadVideo, analyzeVideo } from './video-analyzer.ts';
+import { postFilterByImage } from './post-filter.ts';
 import { generateText } from 'ai';
 import { createOpenAI } from '@ai-sdk/openai';
 
@@ -184,8 +183,40 @@ async function runDetectBestAndSearch(args: string[], dryRun: boolean): Promise<
   const imageForSearch = best.croppedImagePath || best.imagePath;
   const searchResult = await runSearch([imageForSearch], dryRun) as SearchResult;
 
-  let rerankResult: any = undefined;
+  // Post-filter: drop results whose pic_url isn't the same product type as our snapshot
+  let postFilter: any = undefined;
   if (!dryRun && searchResult.status === 'success' && searchResult.searchBody) {
+    const items: SearchItem[] = (searchResult.searchBody as any)?.data?.items?.item ?? [];
+    if (items.length > 0) {
+      try {
+        const client = createSkillClient();
+        const visionConfig = await loadVisionConfig(client);
+        const result = await postFilterByImage(imageForSearch, items, visionConfig, { description: best.description });
+        (searchResult.searchBody as any).data.items.item = result.kept;
+        postFilter = {
+          totalChecked: result.totalChecked,
+          keptCount: result.kept.length,
+          rejectedCount: result.rejected.length,
+          failed: result.failed,
+        };
+      } catch (e: any) {
+        postFilter = { error: e.message };
+      }
+    }
+  }
+
+  let rerankResult: any = undefined;
+  // If post-filter produced focused results, sort them directly by sales — they're already the best matches.
+  // Otherwise fall back to the keyword-intersection rerank.
+  if (!dryRun && postFilter && !postFilter.error && postFilter.keptCount > 0) {
+    const items: SearchItem[] = (searchResult.searchBody as any)?.data?.items?.item ?? [];
+    const sorted = [...items].sort((a, b) => (b.sales ?? 0) - (a.sales ?? 0)).slice(0, 10);
+    rerankResult = {
+      source: 'post-filter',
+      results: sorted,
+      count: sorted.length,
+    };
+  } else if (!dryRun && searchResult.status === 'success' && searchResult.searchBody) {
     const tmpFile = path.join(path.dirname(imageForSearch), `search_body_${Date.now()}.json`);
     try {
       fs.writeFileSync(tmpFile, JSON.stringify(searchResult.searchBody));
@@ -207,10 +238,87 @@ async function runDetectBestAndSearch(args: string[], dryRun: boolean): Promise<
     searchHttpStatus: searchResult.searchHttpStatus,
     searchBody: searchResult.searchBody,
     searchError: searchResult.error,
+    postFilter,
     rerank: rerankResult,
   } as any;
 }
 
+/** `detect-video`: picks the best frame via runDetectBest, then derives a Chinese search keyword from its description. */
+async function runDetectVideo(args: string[], dryRun: boolean): Promise<DetectVideoResult> {
+  const command = 'detect-video' as const;
+  const [videoPath] = args;
+  if (!videoPath) return { status: 'failed', command, dryRun, error: 'detect-video requires <video-path>' };
+  if (!fs.existsSync(videoPath)) return { status: 'failed', command, dryRun, error: `video not found: ${videoPath}` };
+  const failWith = (error: string): DetectVideoResult => ({ status: 'failed', command, dryRun, videoPath, error });
+
+  const detection = await runDetectBest(args, dryRun) as DetectResult;
+  if (detection.status === 'failed') return failWith(detection.error || 'failed to detect best frame');
+  const snapshot = detection.bestSnapshot;
+  const description = snapshot?.description?.trim();
+  if (!description) return failWith('no product description detected from video');
+  // Prefer the cropped product image; fall back to the full frame.
+  const snapshotImagePath = snapshot?.croppedImagePath || snapshot?.imagePath;
+
+  // NOTE(review): reached in dry-run only if runDetectBest yields a description then — confirm.
+  // videoUrl is always null: nothing is uploaded in the frame-based flow.
+  const found = { status: 'success' as const, command, dryRun, videoPath, videoUrl: null, description };
+  if (dryRun) return { ...found, keyword: '<dry-run-keyword>', snapshotImagePath };
+
+  const visionConfig = await loadVisionConfig(createSkillClient());
+  const keyword = await generateChineseKeyword(description, visionConfig);
+  return { ...found, keyword, snapshotImagePath };
+}
+
+/**
+ * `detect-video-and-search`: compatibility wrapper over the detect-best-and-search pipeline.
+ *
+ * Flattens that pipeline's output into { description, keyword, searchResults }. When its
+ * rerank holds no results, a direct keyword search (rerank keyword, or one generated from
+ * the snapshot description) supplies them instead. `videoUrl` is always null.
+ *
+ * @param args args[0] is the video path; the full list is forwarded to the pipeline
+ * @param dryRun when true, returns placeholder values without running detection or search
+ */
+async function runDetectVideoAndSearch(args: string[], dryRun: boolean): Promise<DetectVideoAndSearchResult> {
+  const command = 'detect-video-and-search' as const;
+  const [videoPath] = args;
+  if (!videoPath) return { status: 'failed', command, dryRun, error: 'detect-video-and-search requires <video-path>' };
+  if (!fs.existsSync(videoPath)) return { status: 'failed', command, dryRun, error: `video not found: ${videoPath}` };
+
+  if (dryRun) {
+    return { status: 'success', command, dryRun, videoPath, videoUrl: null, description: '<dry-run>', keyword: '<dry-run>', searchResults: [] };
+  }
+
+  // Reuse the existing pipeline: best snapshot → image search → post-filter / rerank
+  const pipeline = await runDetectBestAndSearch(args, dryRun) as any;
+  if (pipeline.status === 'failed') {
+    return { status: 'failed', command, dryRun, videoPath, error: pipeline.error || 'detect-best-and-search failed' };
+  }
+
+  const description = String(pipeline.bestSnapshot?.description || '').trim();
+  let keyword = String(pipeline.rerank?.keyword || '').trim();
+  let searchResults = (pipeline.rerank?.results || []) as SearchItem[];
+
+  // Fallback: the pipeline produced no reranked results, so search by keyword directly.
+  if (searchResults.length === 0) {
+    const client = createSkillClient();
+    const visionConfig = await loadVisionConfig(client);
+    if (!keyword && description) keyword = await generateChineseKeyword(description, visionConfig);
+    searchResults = keyword ? await keywordSearch(client, keyword, 1) : [];
+  }
+
+  return {
+    status: 'success',
+    command,
+    dryRun,
+    videoPath,
+    videoUrl: null,
+    description,
+    keyword,
+    searchResults,
+  };
+}
+
 async function runDetectAndSearch(args: string[], dryRun: boolean): Promise<OutputResult> {
   const detectResult = await runDetect(args, dryRun) as DetectResult;
   if (detectResult.status === 'failed') return detectResult;
@@ -223,8 +331,40 @@ async function runDetectAndSearch(args: string[], dryRun: boolean): Promise<Outp
   const imageForSearch = best.croppedImagePath || best.imagePath;
   const searchResult = await runSearch([imageForSearch], dryRun) as SearchResult;
 
-  let rerankResult: any = undefined;
+  // Post-filter: drop results whose pic_url isn't the same product type as our snapshot
+  let postFilter: any = undefined;
   if (!dryRun && searchResult.status === 'success' && searchResult.searchBody) {
+    const items: SearchItem[] = (searchResult.searchBody as any)?.data?.items?.item ?? [];
+    if (items.length > 0) {
+      try {
+        const client = createSkillClient();
+        const visionConfig = await loadVisionConfig(client);
+        const result = await postFilterByImage(imageForSearch, items, visionConfig, { description: best.description });
+        (searchResult.searchBody as any).data.items.item = result.kept;
+        postFilter = {
+          totalChecked: result.totalChecked,
+          keptCount: result.kept.length,
+          rejectedCount: result.rejected.length,
+          failed: result.failed,
+        };
+      } catch (e: any) {
+        postFilter = { error: e.message };
+      }
+    }
+  }
+
+  let rerankResult: any = undefined;
+  // If post-filter produced focused results, sort them directly by sales — they're already the best matches.
+  // Otherwise fall back to the keyword-intersection rerank.
+  if (!dryRun && postFilter && !postFilter.error && postFilter.keptCount > 0) {
+    const items: SearchItem[] = (searchResult.searchBody as any)?.data?.items?.item ?? [];
+    const sorted = [...items].sort((a, b) => (b.sales ?? 0) - (a.sales ?? 0)).slice(0, 10);
+    rerankResult = {
+      source: 'post-filter',
+      results: sorted,
+      count: sorted.length,
+    };
+  } else if (!dryRun && searchResult.status === 'success' && searchResult.searchBody) {
     const tmpFile = path.join(path.dirname(imageForSearch), `search_body_${Date.now()}.json`);
     try {
       fs.writeFileSync(tmpFile, JSON.stringify(searchResult.searchBody));
@@ -246,69 +386,11 @@ async function runDetectAndSearch(args: string[], dryRun: boolean): Promise<Outp
     searchHttpStatus: searchResult.searchHttpStatus,
     searchBody: searchResult.searchBody,
     searchError: searchResult.error,
+    postFilter,
     rerank: rerankResult,
   } as any;
 }
 
-async function runDetectVideo(args: string[], dryRun: boolean): Promise<DetectVideoResult> {
-  const videoPath = args[0];
-  if (!videoPath) return { status: 'failed', command: 'detect-video', dryRun, error: 'detect-video requires <video-path>' };
-  if (!fs.existsSync(videoPath)) return { status: 'failed', command: 'detect-video', dryRun, error: `video not found: ${videoPath}` };
-
-  if (dryRun) {
-    return { status: 'success', command: 'detect-video', dryRun, videoPath };
-  }
-
-  const client = createSkillClient();
-  const visionConfig = await loadVisionConfig(client);
-
-  // 1. Upload video to get public URL
-  const videoUrl = await uploadVideo(videoPath);
-
-  // 2. Analyze video via LLM
-  const { description } = await analyzeVideo(videoUrl, visionConfig);
-
-  // 3. Generate Chinese search keyword
-  const keyword = await generateChineseKeyword(description, visionConfig);
-
-  return {
-    status: 'success',
-    command: 'detect-video',
-    dryRun,
-    videoPath,
-    videoUrl,
-    description,
-    keyword,
-  };
-}
-
-async function runDetectVideoAndSearch(args: string[], dryRun: boolean): Promise<DetectVideoResult> {
-  const result = await runDetectVideo(args, dryRun) as DetectVideoResult;
-  if (result.status === 'failed') return result;
-
-  if (dryRun) return { ...result, command: 'detect-video-and-search' };
-
-  const client = createSkillClient();
-
-  // Search 1688 with keyword directly (no rerank — image-based rerank doesn't apply to text search)
-  let searchResults: SearchItem[] = [];
-  if (result.keyword) {
-    try {
-      const items = await keywordSearch(client, result.keyword);
-      // Sort by sales descending
-      searchResults = items.sort((a, b) => (b.sales ?? 0) - (a.sales ?? 0));
-    } catch (e: any) {
-      return { ...result, command: 'detect-video-and-search', status: 'failed', error: `keyword search failed: ${e.message}` };
-    }
-  }
-
-  return {
-    ...result,
-    command: 'detect-video-and-search',
-    searchResults,
-  };
-}
-
 function parseDetectOptions(videoPath: string, args: string[]): DetectOptions {
   const outputDir = getFlag(args, '--output-dir') || path.join(
     path.dirname(videoPath),
@@ -388,8 +470,9 @@ function extractKeywordsFromTitles(items: SearchItem[], topN = 5): string {
 
 async function runRerank(args: string[], dryRun: boolean): Promise<OutputResult> {
   // --image-results=<path> --keyword=<text> --top=<n>
-  const imageResultsArg = getFlag(args, '--image-results') || args[0];
-  const keywordArg = getFlag(args, '--keyword') || args[1];
+  const positionals = args.filter((a) => !a.startsWith('--'));
+  const imageResultsArg = getFlag(args, '--image-results') || positionals[0];
+  const keywordArg = getFlag(args, '--keyword') || positionals[1];
   const topN = parseInt(getFlag(args, '--top') || '10', 10);
 
   const description = getFlag(args, '--description') || '';
@@ -465,7 +548,3 @@ async function runRerank(args: string[], dryRun: boolean): Promise<OutputResult>
     results: sorted,
   } as any;
 }
-
-function parseJsonSafe(text: string): unknown {
-  try { return JSON.parse(text); } catch { return text; }
-}
diff --git a/src/post-filter.ts b/src/post-filter.ts
new file mode 100644
index 0000000..719a324
--- /dev/null
+++ b/src/post-filter.ts
@@ -0,0 +1,160 @@
+import { generateText } from 'ai';
+import { createOpenAI } from '@ai-sdk/openai';
+import type { SearchItem } from './types.ts';
+import type { VisionConfig } from './index.ts';
+import { imageToBase64 } from './frame-extractor.ts';
+
+/** Outcome of {@link postFilterByImage}. */
+export interface PostFilterResult {
+  /** Items the model confirmed, plus every item of any batch that could not be classified. */
+  kept: SearchItem[];
+  /** Items the model did not confirm as the same product type. */
+  rejected: SearchItem[];
+  /** Number of input items. */
+  totalChecked: number;
+  /** True when at least one batch failed and was kept unfiltered. */
+  failed: boolean;
+}
+
+/** Items per model request when `batchSize` is missing or not a usable number. */
+const DEFAULT_BATCH_SIZE = 10;
+
+/** One verdict line of the model reply, e.g. `3: YES` or `3：否`. */
+const VERDICT_LINE = /^\s*(\d+)\s*[:：]\s*(YES|NO|是|否)/i;
+
+/**
+ * Builds the classification prompt for a batch of `count` candidate images.
+ * `description`, when given, replaces the generic "image 1 is the query product" opening line.
+ */
+const FILTER_PROMPT = (count: number, description?: string) => {
+  const productLine = description
+    ? `查询商品是：${description}`
+    : '第1张图是查询商品。';
+  return `${productLine}
+后面的 ${count} 张图是搜索结果。
+
+任务：判断每张候选图是否与查询商品是**完全相同的具体产品类型**。
+- 必须是同一个具体产品（例如：查询是"鞋架"，候选必须也是鞋架；不是其他类型的架子如纸巾架、首饰架、收纳盒）
+- 颜色、材质、款式、尺寸不同但同一具体类型 → 算同类
+- 用途不同就不算同类（例如：查询是鞋架 vs 候选是纸巾架 → 不算；查询是鞋架 vs 候选是床下收纳箱 → 不算，除非明确是鞋类收纳）
+- 关键判断：候选商品的主要用途是否与查询商品一致
+
+按候选图顺序输出每一张的判断，每行一个，格式严格遵守：
+1: YES
+2: NO
+3: YES
+...
+
+只输出 ${count} 行结果，不要解释，不要前后空行。`;
+};
+
+/** Creates the model handle for the endpoint, key and model name in `config`. */
+function createModel(config: VisionConfig) {
+  const provider = createOpenAI({ apiKey: config.apiKey, baseURL: config.baseURL });
+  return provider(config.model);
+}
+
+/**
+ * Parses the model's numbered YES/NO reply into one flag per batch item.
+ * Items without a verdict line stay `false`; a later line for the same number wins.
+ *
+ * @throws Error when the reply has no usable verdict line at all, so the caller treats the
+ *   batch as failed instead of silently rejecting every item in it.
+ */
+function parseVerdicts(text: string, batchLength: number): boolean[] {
+  const flags = Array.from({ length: batchLength }, () => false);
+  let parsed = 0;
+  for (const line of text.split('\n')) {
+    const m = VERDICT_LINE.exec(line);
+    if (!m) continue;
+    const idx = parseInt(m[1], 10) - 1;
+    if (idx < 0 || idx >= flags.length) continue;
+    flags[idx] = /YES|是/i.test(m[2]);
+    parsed += 1;
+  }
+  if (parsed === 0) throw new Error('post-filter: model reply contained no YES/NO verdicts');
+  return flags;
+}
+
+/**
+ * Asks the vision model which items of `batch` are the same product type as the query image.
+ *
+ * @param model vision model handle
+ * @param queryImageDataUrl query snapshot as a data URL; sent as the first image
+ * @param batch candidates; each item's `pic_url` is sent as an image, in order
+ * @param description optional product description inserted into the prompt
+ * @returns one boolean per batch item, `true` meaning same product type
+ * @throws when the model call fails or its reply has no usable verdict line
+ */
+async function classifyBatch(
+  model: ReturnType<ReturnType<typeof createOpenAI>>,
+  queryImageDataUrl: string,
+  batch: SearchItem[],
+  description?: string,
+): Promise<boolean[]> {
+  const content: any[] = [{ type: 'image', image: queryImageDataUrl }];
+  for (const item of batch) {
+    content.push({ type: 'image', image: item.pic_url });
+  }
+  content.push({ type: 'text', text: FILTER_PROMPT(batch.length, description) });
+
+  const { text } = await generateText({
+    model,
+    messages: [{ role: 'user', content }],
+    maxTokens: 200,
+  });
+
+  return parseVerdicts(text, batch.length);
+}
+
+/**
+ * Drops search results whose picture is not the same product type as the query snapshot.
+ *
+ * Items are checked in batches, one model request per batch. A batch whose request fails
+ * or whose reply cannot be parsed is kept unfiltered, and `failed` is set on the result.
+ *
+ * @param queryImagePath path of the query snapshot; embedded as a `data:image/jpeg` URL
+ * @param items search results to check
+ * @param visionConfig endpoint, key and model name of the vision model
+ * @param options `description`: product description for the prompt; `batchSize`: items per
+ *   request (default 10; values below 1 or NaN fall back to the default)
+ */
+export async function postFilterByImage(
+  queryImagePath: string,
+  items: SearchItem[],
+  visionConfig: VisionConfig,
+  options: { description?: string; batchSize?: number } = {},
+): Promise<PostFilterResult> {
+  if (items.length === 0) {
+    return { kept: [], rejected: [], totalChecked: 0, failed: false };
+  }
+
+  const requested = options.batchSize ?? DEFAULT_BATCH_SIZE;
+  // A zero, negative or NaN step would stall the loop below or silently skip items.
+  const batchSize = requested >= 1 ? Math.floor(requested) : DEFAULT_BATCH_SIZE;
+  const description = options.description;
+
+  const model = createModel(visionConfig);
+  const queryDataUrl = `data:image/jpeg;base64,${imageToBase64(queryImagePath)}`;
+
+  const kept: SearchItem[] = [];
+  const rejected: SearchItem[] = [];
+  let anyFailed = false;
+
+  for (let i = 0; i < items.length; i += batchSize) {
+    const batch = items.slice(i, i + batchSize);
+    try {
+      const flags = await classifyBatch(model, queryDataUrl, batch, description);
+      batch.forEach((item, idx) => {
+        if (flags[idx]) kept.push(item);
+        else rejected.push(item);
+      });
+    } catch {
+      // On batch failure, keep items (don't lose them) but flag the run as partial
+      anyFailed = true;
+      kept.push(...batch);
+    }
+  }
+
+  return { kept, rejected, totalChecked: items.length, failed: anyFailed };
+}
diff --git a/src/product-detector.ts b/src/product-detector.ts
index 0f12d82..cfebd44 100644
--- a/src/product-detector.ts
+++ b/src/product-detector.ts
@@ -1,4 +1,4 @@
-import { generateObject } from 'ai';
+import { generateObject, generateText } from 'ai';
 import { createOpenAI } from '@ai-sdk/openai';
 import { z } from 'zod';
 import type { ExtractedFrame } from './frame-extractor.ts';
@@ -28,7 +28,38 @@ Discard (keep=false) if: only hands/texture/contents visible, motion blur, black
 
 reason options: product_visible | content_only | hands_only | blur | transition | background_only`;
 
-const RANKING_PROMPT = (count: number) => `You are selecting the single best product frame from ${count} video frames for ecommerce search.
+const CONTAINER_CHECK_PROMPT = `Is the main product in this image a CONTAINER, RACK, or HOLDER (something designed to store/hold other items)?
+Examples YES: shoe rack, shelf, storage box, organizer, basket, drawer, wardrobe, trolley, bin, tray, cabinet.
+Examples NO: shoes, clothing, electronics, food, toys, cosmetics, tools.
+Reply with only one word: YES or NO.`;
+
+const RANKING_PROMPT_CONTAINER = (count: number) => `You are selecting ONE frame from ${count} video frames to use as the query image for an ecommerce reverse-image search.
+
+The hero product is a CONTAINER / RACK / HOLDER / ORGANIZER.
+
+CRITICAL CONSTRAINT — read this first:
+Image search engines identify objects by visual appearance. If the container holds items (shoes, clothes, etc.), the search engine will match those ITEMS, not the container — returning completely wrong products.
+
+YOUR ONLY JOB: find the frame where the container structure itself is most visible with the FEWEST or NO items inside.
+
+ABSOLUTE PRIORITY ORDER (do not deviate):
+1. Frame with container completely EMPTY — highest priority regardless of angle or assembly state
+2. Frame with container partially assembled or partially visible but EMPTY — still better than any loaded frame
+3. Frame with fewest items inside (1-2 items, mostly empty)
+4. Frame with moderate load — only if no emptier option exists
+5. Frame fully loaded — last resort only if no other frames exist
+
+A frame showing the rack mid-assembly with zero items is ALWAYS better than a perfectly-lit fully-assembled rack filled with shoes.
+
+Frames are numbered 0 to ${count - 1} in order shown. You MUST pick ONE.
+
+Return:
+- bestFrameIndex: 0-based index of the emptiest container frame
+- description: concise Chinese search query ≤12 words (container type + material + color + key feature)
+- reasoning: describe how many items are visible inside the chosen frame and why it's the emptiest option
+- boundingBox: tight box of the PRODUCT STRUCTURE ONLY as [x1, y1, x2, y2] normalized 0.0–1.0. Exclude any items stored inside.`;
+
+const RANKING_PROMPT_GENERAL = (count: number) => `You are selecting the single best product frame from ${count} video frames for ecommerce search.
 
 Frames are numbered 0 to ${count - 1} in order shown.
 
@@ -72,15 +103,52 @@ async function filterFrame(
   return object.keep;
 }
 
+
+/** Asks the model a YES/NO question about one frame; any error counts as "not a container". */
+async function isContainerProduct(
+  firstFrame: ExtractedFrame,
+  model: ReturnType<ReturnType<typeof createOpenAI>>,
+): Promise<boolean> {
+  try {
+    const frameDataUrl = `data:image/jpeg;base64,${imageToBase64(firstFrame.imagePath)}`;
+    const reply = await generateText({
+      model,
+      maxTokens: 5, // the prompt asks for a single word
+      messages: [{
+        role: 'user',
+        content: [{ type: 'image', image: frameDataUrl }, { type: 'text', text: CONTAINER_CHECK_PROMPT }],
+      }],
+    });
+    const answer = reply.text.trim().toUpperCase();
+    return answer.startsWith('Y'); // "YES" (or any reply starting with Y) means container
+  } catch {
+    return false;
+  }
+}
+
+
+function takeEarliestFrames(candidates: ExtractedFrame[], fraction: number = 0.4): ExtractedFrame[] {
+  // Container videos tend to open on the empty/unboxing state and fill up later,
+  // so only the leading `fraction` of frames (by frameIndex) is kept, never fewer than one.
+  const keepCount = Math.max(1, Math.ceil(candidates.length * fraction));
+  const chronological = candidates.slice().sort((x, y) => x.frameIndex - y.frameIndex);
+  return chronological.slice(0, keepCount);
+}
+
 async function rankCandidates(
   candidates: ExtractedFrame[],
   model: ReturnType<ReturnType<typeof createOpenAI>>,
+  isContainer: boolean,
 ): Promise<{ bestFrame: ExtractedFrame; description: string; reasoning: string; boundingBox: [number, number, number, number] }> {
   const imageContent = candidates.map((f) => ({
     type: 'image' as const,
     image: `data:image/jpeg;base64,${imageToBase64(f.imagePath)}`,
   }));
 
+  const prompt = isContainer
+    ? RANKING_PROMPT_CONTAINER(candidates.length)
+    : RANKING_PROMPT_GENERAL(candidates.length);
+
   const { object } = await generateObject({
     model,
     schema: RankingSchema,
@@ -89,7 +157,7 @@ async function rankCandidates(
       role: 'user',
       content: [
         ...imageContent,
-        { type: 'text', text: RANKING_PROMPT(candidates.length) },
+        { type: 'text', text: prompt },
       ],
     }],
   });
@@ -246,9 +314,18 @@ export async function detectBestFrame(
 
   const model = createVisionModel(visionConfig);
 
-  // 3. Try Vision ranking with error isolation
+  // 3. Check if product is a container/rack type (use first candidate frame)
+  const container = await isContainerProduct(candidates[0], model);
+
+  // 4. For containers: restrict ranking to earliest frames (empty/unboxing phase)
+  if (container) {
+    const early = takeEarliestFrames(candidates);
+    if (early.length > 0) candidates = early;
+  }
+
+  // 5. Try Vision ranking with error isolation
   try {
-    const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
+    const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model, container);
 
     if (isValidBoundingBox(boundingBox)) {
       const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg');
@@ -313,9 +390,10 @@ export async function detectProductFrames(
   if (candidates.length === 0) return [];
 
   // Pass 2: single comparative call — model sees all candidates at once
+  const container = await isContainerProduct(candidates[0], model);
   let bestSnapshot: ProductFrame | undefined;
   try {
-    const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
+    const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model, container);
 
     if (isValidBoundingBox(boundingBox)) {
       const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg');
diff --git a/src/types.ts b/src/types.ts
index 1bac01d..6c1f695 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1,4 +1,13 @@
-export type Command = 'detect' | 'search' | 'detect-and-search' | 'detect-best' | 'detect-best-and-search' | 'detect-video' | 'detect-video-and-search' | 'rerank' | 'session';
+export type Command =
+  | 'detect'
+  | 'search'
+  | 'detect-and-search'
+  | 'detect-best'
+  | 'detect-best-and-search'
+  | 'detect-video'
+  | 'detect-video-and-search'
+  | 'rerank'
+  | 'session';
 
 export interface SearchItem {
   num_iid: number;
@@ -11,6 +20,30 @@ export interface SearchItem {
   detail_url: string;
 }
 
+export interface DetectVideoResult { // result of the `detect-video` command
+  status: 'success' | 'failed';
+  command: 'detect-video';
+  dryRun: boolean;
+  videoPath?: string; // set once the path argument has passed validation
+  videoUrl?: string | null; // the current implementation only ever sets null
+  description?: string; // trimmed description of the best snapshot
+  keyword?: string; // generated search keyword ('<dry-run-keyword>' in dry-run)
+  snapshotImagePath?: string; // cropped image when available, otherwise the full frame
+  error?: string; // present on failed results
+}
+
+export interface DetectVideoAndSearchResult { // result of the `detect-video-and-search` command
+  status: 'success' | 'failed';
+  command: 'detect-video-and-search';
+  dryRun: boolean;
+  videoPath?: string; // set once the path argument has passed validation
+  videoUrl?: string | null; // the current implementation only ever sets null
+  description?: string; // best snapshot's description ('<dry-run>' in dry-run)
+  keyword?: string; // rerank keyword or fallback search keyword; may be empty
+  searchResults?: SearchItem[]; // rerank results, or keyword-search items when rerank had none
+  error?: string; // present on failed results
+}
+
 export interface DetectOptions {
   videoPath: string;
   intervalSeconds: number;
@@ -51,17 +84,4 @@ export interface SearchResult {
   error?: string;
 }
 
-export interface DetectVideoResult {
-  status: 'success' | 'failed';
-  command: Command;
-  dryRun: boolean;
-  videoPath?: string;
-  videoUrl?: string;
-  description?: string;
-  keyword?: string;
-  searchResults?: SearchItem[];
-  rerank?: unknown;
-  error?: string;
-}
-
-export type OutputResult = DetectResult | SearchResult | DetectVideoResult;
+export type OutputResult = DetectResult | SearchResult | DetectVideoResult | DetectVideoAndSearchResult;
diff --git a/src/video-analyzer.ts b/src/video-analyzer.ts
deleted file mode 100644
index 6ba55af..0000000
--- a/src/video-analyzer.ts
+++ /dev/null
@@ -1,97 +0,0 @@
-import * as fs from 'fs';
-import type { VisionConfig } from './index.ts';
-import { createSkillClient } from './auth-cli.ts';
-
-const UPLOAD_ENDPOINT =
-  process.env.ONEBOUND_UPLOAD_ENDPOINT ||
-  'http://localhost:3202/api/v1/tasks/upload-image';
-
-/**
- * Upload a video file to get a public URL.
- *
- * Uses direct HTTP fetch (not auth-rt CLI) to avoid E2BIG errors
- * when the base64-encoded video exceeds the command-line argument limit.
- */
-export async function uploadVideo(videoPath: string): Promise<string> {
-  const client = createSkillClient();
-  const { accessToken } = await client.session();
-
-  const videoBuffer = fs.readFileSync(videoPath);
-  const ext = videoPath.match(/\.(\w+)$/)?.[1] || 'mp4';
-  const filename = `video-${Date.now()}.${ext}`;
-  const contentType = ext === 'mov' ? 'video/quicktime' : `video/${ext}`;
-
-  const response = await fetch(UPLOAD_ENDPOINT, {
-    method: 'POST',
-    headers: {
-      Authorization: `Bearer ${accessToken}`,
-      'Content-Type': 'application/json',
-    },
-    body: JSON.stringify({
-      data: videoBuffer.toString('base64'),
-      filename,
-      contentType,
-    }),
-  });
-
-  if (!response.ok) {
-    const errBody = await response.text().catch(() => 'unknown');
-    throw new Error(`Video upload failed (${response.status}): ${errBody.slice(0, 300)}`);
-  }
-
-  const json = (await response.json()) as { url?: string };
-  if (!json.url) throw new Error('Upload response missing url');
-  return json.url;
-}
-
-export interface VideoAnalysis {
-  description: string;
-  rawResponse?: string;
-}
-
-export async function analyzeVideo(
-  videoUrl: string,
-  config: VisionConfig,
-): Promise<VideoAnalysis> {
-  const response = await fetch(`${config.baseURL}/v1/chat/completions`, {
-    method: 'POST',
-    headers: {
-      Authorization: `Bearer ${config.apiKey}`,
-      'Content-Type': 'application/json',
-    },
-    body: JSON.stringify({
-      model: config.model,
-      messages: [
-        {
-          role: 'user',
-          content: [
-            {
-              type: 'video_url',
-              video_url: { url: videoUrl },
-            },
-            {
-              type: 'text',
-              text: '找出视频中的商品主体，用中文简要描述商品名称、材质、颜色、功能。',
-            },
-          ],
-        },
-      ],
-      max_tokens: 500,
-    }),
-  });
-
-  if (!response.ok) {
-    const errBody = await response.text().catch(() => 'unknown');
-    throw new Error(
-      `Video analysis API error (${response.status}): ${errBody.slice(0, 500)}`,
-    );
-  }
-
-  const json = (await response.json()) as any;
-  const content = json?.choices?.[0]?.message?.content;
-  if (!content) {
-    throw new Error('Video analysis returned empty response');
-  }
-
-  return { description: content.trim(), rawResponse: JSON.stringify(json) };
-}