feat: add detect-video command using direct video upload + API analysis
register-skill-release / register (push) Successful in 16s Details

- New detect-video / detect-video-and-search commands: upload video to get
  public URL, analyze via LiteLLM (video_url), generate keyword, search 1688
- New src/video-analyzer.ts: upload via direct HTTP (bypasses auth-rt CLI
  arg length limit), analyze via Chat Completions with video_url content
- Frame-based pipeline robustness: quality pre-filtering (skip black/blurry
  frames), bounding box normalization/validation, crop failure tolerance,
  Vision ranking fallback to sharpness-based selection
- Improve ranking prompt: force pick one frame, Chinese description
- Update docs to recommend detect-video-and-search as primary command

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
ywkj 2026-04-25 16:30:01 +08:00
parent 91a623751d
commit db4735e54e
6 changed files with 371 additions and 49 deletions

View File

@ -17,12 +17,30 @@ bun dist/run.js <command> [args] [--dry-run]
| 命令 | 使用场景 |
|------|---------|
| `detect-best-and-search <video>` | **视频输入的默认命令。** 始终找出最佳画面(不管置信度高低),然后搜图。 |
| `detect-best <video>` | 只提取最佳画面,不搜图。 |
| `detect-video-and-search <video>` | **推荐。** 直接上传视频到 API 识别商品主体,然后 1688 关键词搜索。跳过本地抽帧,无需 Vision API。 |
| `detect-best-and-search <video>` | 旧版。抽帧 + Vision 排名 + 搜图。需要 Vision API key。 |
| `detect-video <video>` | 只识别商品描述和生成关键词,不搜图。 |
| `detect-best <video>` | 旧版。只提取最佳画面,不搜图。 |
| `search <image-path>` | 已经有商品截图了,跳过检测直接搜图。 |
| `detect-and-search <video>` | 旧版。过滤可能太严格导致无结果。建议用 `detect-best-and-search`。 |
| `detect-and-search <video>` | 旧版。**不推荐。** |
| `session` | 获取当前认证会话 token。 |
## `detect-video` / `detect-video-and-search`
上传视频到 API 直接识别商品主体,不走本地抽帧。
流程:
1. 上传视频 → 获取公开 URL(复用现有上传接口)
2. 调用 LiteLLM(Chat Completions + `video_url`)分析视频内容
3. 识别商品名称、材质、颜色、功能
4. 生成中文搜索关键词
5. 1688 关键词搜索(`detect-video-and-search`)
依赖:
- `auth-rt` client key(自动,无需额外配置)
- LiteLLM 代理支持 `video_url` 内容类型
- 上传接口返回公开 URL
## `detect-best` / `detect-best-and-search` 选项
| 参数 | 默认值 | 说明 |
@ -75,7 +93,7 @@ CLI 执行完成后,将 `rerank.results` 格式化为 markdown 表格,**每
### 视频命令(慢 — 用 sub-agent 执行)
涉及命令:`detect-best-and-search`、`detect-best`、`detect-and-search`
涉及命令:`detect-video-and-search`、`detect-best-and-search`、`detect-best`、`detect-and-search`、`detect-video`
使用 `sessions_spawn` 创建 sub-agent 执行,**不要直接运行**。
@ -96,6 +114,6 @@ sessions_spawn(
### 通用规则
1. **视频输入 → 始终用 `detect-best-and-search`。** 不要用 `detect-and-search`
1. **视频输入 → 优先用 `detect-video-and-search`。** 比抽帧方案更可靠。如果没配视频模型,降级到 `detect-best-and-search`(不要用 `detect-and-search`)。
2. **不要重试。** 命令失败就直接报错。
3. **信任工具输出。** CLI 内部已处理 session 管理和错误格式化。

View File

@ -43,6 +43,12 @@ function printUsage(): void {
detect-and-search <video-path> [options]
detect-video <video-path>
API
detect-video-and-search <video-path>
1688
rerank --image-results=<json> [--description=<text>] [--keyword=<text>] [--top=<n>]

View File

@ -1,10 +1,11 @@
import * as fs from 'fs';
import * as path from 'path';
import type { Command, DetectOptions, DetectResult, SearchResult, OutputResult, SearchItem } from './types.ts';
import type { Command, DetectOptions, DetectResult, SearchResult, OutputResult, SearchItem, DetectVideoResult } from './types.ts';
import { createSkillClient } from './auth-cli.ts';
import { extractFrames } from './frame-extractor.ts';
import { detectProductFrames, detectBestFrame } from './product-detector.ts';
import { imageToBase64 } from './frame-extractor.ts';
import { uploadVideo, analyzeVideo } from './video-analyzer.ts';
import { generateText } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
@ -43,6 +44,10 @@ export async function run(
return runDetectBest(args, dryRun);
case 'detect-best-and-search':
return runDetectBestAndSearch(args, dryRun);
case 'detect-video':
return runDetectVideo(args, dryRun);
case 'detect-video-and-search':
return runDetectVideoAndSearch(args, dryRun);
case 'rerank':
return runRerank(args, dryRun);
default:
@ -153,7 +158,7 @@ async function runDetectBest(args: string[], dryRun: boolean): Promise<DetectRes
return { status: 'failed', command: 'detect-best', dryRun, videoPath, error: 'no frames extracted from video' };
}
const best = await detectBestFrame(frames, 10, visionConfig);
const best = await detectBestFrame(frames, visionConfig, 20);
return {
status: 'success',
@ -245,6 +250,65 @@ async function runDetectAndSearch(args: string[], dryRun: boolean): Promise<Outp
} as any;
}
/**
 * `detect-video`: upload a video, have the LLM describe the main product,
 * and derive a Chinese search keyword. No local frame extraction involved.
 */
async function runDetectVideo(args: string[], dryRun: boolean): Promise<DetectVideoResult> {
  const [videoPath] = args;
  if (!videoPath) {
    return { status: 'failed', command: 'detect-video', dryRun, error: 'detect-video requires <video-path>' };
  }
  if (!fs.existsSync(videoPath)) {
    return { status: 'failed', command: 'detect-video', dryRun, error: `video not found: ${videoPath}` };
  }
  if (dryRun) {
    return { status: 'success', command: 'detect-video', dryRun, videoPath };
  }

  const visionConfig = await loadVisionConfig(createSkillClient());

  // Step 1: obtain a public URL for the video (direct HTTP upload).
  const videoUrl = await uploadVideo(videoPath);
  // Step 2: let the model describe the main product shown in the video.
  const { description } = await analyzeVideo(videoUrl, visionConfig);
  // Step 3: condense the description into a Chinese search keyword.
  const keyword = await generateChineseKeyword(description, visionConfig);

  return {
    status: 'success',
    command: 'detect-video',
    dryRun,
    videoPath,
    videoUrl,
    description,
    keyword,
  };
}
/**
 * `detect-video-and-search`: run `detect-video`, then search 1688 with the
 * generated keyword. Hits are ordered by sales, descending. No rerank step —
 * image-based rerank does not apply to a text (keyword) search.
 */
async function runDetectVideoAndSearch(args: string[], dryRun: boolean): Promise<DetectVideoResult> {
  const result = await runDetectVideo(args, dryRun) as DetectVideoResult;
  if (result.status === 'failed') return result;
  if (dryRun) return { ...result, command: 'detect-video-and-search' };

  const client = createSkillClient();

  // Keyword may be absent (analysis produced nothing usable); in that case
  // report success with an empty result list rather than failing.
  let searchResults: SearchItem[] = [];
  if (result.keyword) {
    try {
      const items = await keywordSearch(client, result.keyword);
      // Sort a copy by sales descending — don't mutate the fetched array.
      searchResults = [...items].sort((a, b) => (b.sales ?? 0) - (a.sales ?? 0));
    } catch (e: unknown) {
      // Narrow the unknown before reading .message.
      const message = e instanceof Error ? e.message : String(e);
      return { ...result, command: 'detect-video-and-search', status: 'failed', error: `keyword search failed: ${message}` };
    }
  }

  return {
    ...result,
    command: 'detect-video-and-search',
    searchResults,
  };
}
function parseDetectOptions(videoPath: string, args: string[]): DetectOptions {
const outputDir = getFlag(args, '--output-dir') || path.join(
path.dirname(videoPath),

View File

@ -28,21 +28,23 @@ Discard (keep=false) if: only hands/texture/contents visible, motion blur, black
reason options: product_visible | content_only | hands_only | blur | transition | background_only`;
const RANKING_PROMPT = (count: number) => `You are selecting the single best product image from ${count} video frames for ecommerce image search.
const RANKING_PROMPT = (count: number) => `You are selecting the single best product frame from ${count} video frames for ecommerce search.
The frames are numbered 0 to ${count - 1} in the order shown.
Frames are numbered 0 to ${count - 1} in order shown.
Pick the ONE frame where the HERO PRODUCT is:
1. Cleanest fewest distractions, no hands blocking it, no clutter in foreground
2. Most complete full product silhouette visible, no edges cropped
3. Most isolated product stands out from background clearly
4. Empty/minimal load preferred a product without contents (e.g. an empty rack) beats one stuffed with items if both show the full structure equally
IMPORTANT: You MUST pick ONE frame even if product visibility is imperfect or no frame looks ideal. Always make your best guess.
Pick the frame where the MAIN SELLING PRODUCT is:
1. Most recognizable clearest view of the item being sold
2. Most complete full product silhouette visible, not cropped at edges
3. Cleanest minimal obstruction (hands, clutter, motion blur, labels)
4. Best lit and in focus
Return:
- bestFrameIndex: 0-based index of chosen frame
- description: concise search query under 12 words (product type + material + color + key feature)
- reasoning: one sentence explaining why this frame was chosen
- boundingBox: tight bounding box of the HERO PRODUCT ONLY in the chosen frame as [x1, y1, x2, y2] normalized 0.01.0 (top-left origin). Exclude hands, background, and unrelated objects. The product is assumed to be near the center.`;
- bestFrameIndex: 0-based index
- description: concise search query under 12 words (product type + material + color + key features), in Chinese
- reasoning: one sentence explaining choice
- boundingBox: tight box of the PRODUCT ONLY as [x1, y1, x2, y2] normalized 0.01.0, top-left origin. Exclude hands, background, and unrelated objects. The product is near the center of the frame.`;
function createVisionModel(config: VisionConfig) {
const provider = createOpenAI({ apiKey: config.apiKey, baseURL: config.baseURL });
@ -114,7 +116,17 @@ export async function cropProduct(
let [x1, y1, x2, y2] = boundingBox;
// add padding
// Normalize coords: ensure x1<x2 and y1<y2
if (x1 > x2) [x1, x2] = [x2, x1];
if (y1 > y2) [y1, y2] = [y2, y1];
// Clamp to [0, 1]
x1 = Math.max(0, Math.min(1, x1));
y1 = Math.max(0, Math.min(1, y1));
x2 = Math.max(0, Math.min(1, x2));
y2 = Math.max(0, Math.min(1, y2));
// Add padding
const pw = (x2 - x1) * paddingFactor;
const ph = (y2 - y1) * paddingFactor;
x1 = Math.max(0, x1 - pw);
@ -122,6 +134,11 @@ export async function cropProduct(
x2 = Math.min(1, x2 + pw);
y2 = Math.min(1, y2 + ph);
// Validate minimum area
if (x2 - x1 < 0.005 || y2 - y1 < 0.005) {
throw new Error('bounding box too small after normalization');
}
const left = Math.round(x1 * W);
const top = Math.round(y1 * H);
const width = Math.round((x2 - x1) * W);
@ -151,39 +168,132 @@ async function withConcurrency<T>(
return results;
}
// ── Frame quality pre-filtering ──────────────────────────────────────
interface FrameQuality {
valid: boolean;
meanBrightness: number;
variance: number;
}
/**
 * Compute cheap grayscale luminance statistics for a frame and decide whether
 * it is worth sending to the Vision model. Near-black, near-white, or flat
 * (low-variance) frames are typically blank, blurry, or scene transitions.
 */
async function assessFrameQuality(imagePath: string): Promise<FrameQuality> {
  const sharp = (await import('sharp')).default;
  const { data } = await sharp(imagePath)
    .grayscale()
    .raw()
    .toBuffer({ resolveWithObject: true });

  const px = new Uint8Array(data);
  let total = 0;
  let totalSq = 0;
  for (const v of px) {
    total += v;
    totalSq += v * v;
  }
  const mean = total / px.length;
  const variance = totalSq / px.length - mean * mean;

  // Heuristic thresholds: brightness must be inside (15, 240) and variance
  // above 50 for the frame to count as usable.
  const valid = mean > 15 && mean < 240 && variance > 50;
  return { valid, meanBrightness: mean, variance };
}
/**
 * Drop frames that fail the quality heuristics. Assessment errors count as
 * "keep" (best effort), and if every frame would be dropped we return the
 * original set rather than an empty list.
 */
async function filterQualityFrames(frames: ExtractedFrame[]): Promise<ExtractedFrame[]> {
  const keepFlags = await Promise.all(
    frames.map(frame =>
      assessFrameQuality(frame.imagePath).then(
        q => q.valid,
        () => true, // assessment failed — keep the frame
      ),
    ),
  );
  const kept = frames.filter((_, i) => keepFlags[i]);
  return kept.length > 0 ? kept : frames;
}
/**
 * A bounding box [x1, y1, x2, y2] (normalized, top-left origin) is usable
 * when every coordinate lies in [0, 1], the corners are properly ordered,
 * and the enclosed area exceeds 0.5% of the frame.
 */
function isValidBoundingBox(bbox: [number, number, number, number]): boolean {
  const [x1, y1, x2, y2] = bbox;
  const inUnitRange = bbox.every(v => v >= 0 && v <= 1);
  const ordered = x1 < x2 && y1 < y2;
  const bigEnough = (x2 - x1) * (y2 - y1) > 0.005;
  return inUnitRange && ordered && bigEnough;
}
// Skips Pass 1 filter entirely — ranks all frames and always returns the best one.
// Evenly samples down to maxCandidates when there are too many frames.
export async function detectBestFrame(
frames: ExtractedFrame[],
concurrency: number = 10,
visionConfig: VisionConfig,
maxCandidates: number = 20,
): Promise<ProductFrame | null> {
if (frames.length === 0) return null;
const model = createVisionModel(visionConfig);
// 1. Filter out obviously bad frames (black, white, blurry)
let candidates = await filterQualityFrames(frames);
let candidates = frames;
if (frames.length > maxCandidates) {
const step = frames.length / maxCandidates;
candidates = Array.from({ length: maxCandidates }, (_, i) => frames[Math.floor(i * step)]);
// 2. Sample if too many
if (candidates.length > maxCandidates) {
const step = candidates.length / maxCandidates;
candidates = Array.from({ length: maxCandidates }, (_, i) => candidates[Math.floor(i * step)]);
}
const model = createVisionModel(visionConfig);
// 3. Try Vision ranking with error isolation
try {
const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
if (isValidBoundingBox(boundingBox)) {
const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg');
try {
await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
} catch {
// cropping is optional — keep original frame
}
return {
frameIndex: bestFrame.frameIndex,
timestampSeconds: bestFrame.timestampSeconds,
imagePath: bestFrame.imagePath,
croppedImagePath: croppedPath,
...(croppedPath ? { croppedImagePath: croppedPath } : {}),
confidence: 0.95,
description,
boundingHint: reasoning,
};
}
} catch {
// Vision ranking failed — fall through to fallback
}
// 4. Fallback: rank by frame quality (variance) and return the sharpest
const withQuality = await Promise.all(
candidates.map(async (f) => {
try {
const q = await assessFrameQuality(f.imagePath);
return { frame: f, score: q.variance };
} catch {
return { frame: f, score: 0 };
}
}),
);
withQuality.sort((a, b) => b.score - a.score);
const best = withQuality[0].frame;
return {
frameIndex: best.frameIndex,
timestampSeconds: best.timestampSeconds,
imagePath: best.imagePath,
confidence: 0.5,
description: 'product frame (auto-selected)',
boundingHint: 'picked by frame quality analysis (Vision ranking failed)',
};
}
export async function detectProductFrames(
frames: ExtractedFrame[],
@ -203,18 +313,32 @@ export async function detectProductFrames(
if (candidates.length === 0) return [];
// Pass 2: single comparative call — model sees all candidates at once
let bestSnapshot: ProductFrame | undefined;
try {
const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model);
if (isValidBoundingBox(boundingBox)) {
const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg');
try {
await cropProduct(bestFrame.imagePath, boundingBox, croppedPath);
return [{
} catch {}
bestSnapshot = {
frameIndex: bestFrame.frameIndex,
timestampSeconds: bestFrame.timestampSeconds,
imagePath: bestFrame.imagePath,
croppedImagePath: croppedPath,
...(croppedPath ? { croppedImagePath: croppedPath } : {}),
confidence: 0.95,
description,
boundingHint: reasoning,
}];
};
}
} catch {
// ranking failed
}
if (!bestSnapshot) {
return [];
}
return [bestSnapshot];
}

View File

@ -1,4 +1,4 @@
export type Command = 'detect' | 'search' | 'detect-and-search' | 'detect-best' | 'detect-best-and-search' | 'rerank' | 'session';
export type Command = 'detect' | 'search' | 'detect-and-search' | 'detect-best' | 'detect-best-and-search' | 'detect-video' | 'detect-video-and-search' | 'rerank' | 'session';
export interface SearchItem {
num_iid: number;
@ -51,4 +51,17 @@ export interface SearchResult {
error?: string;
}
export type OutputResult = DetectResult | SearchResult;
// Result payload for `detect-video` and `detect-video-and-search`.
// Optional fields are populated only on the paths that produce them
// (e.g. `searchResults` only after the keyword-search step).
export interface DetectVideoResult {
  // 'failed' is accompanied by `error`; 'success' by the data fields below.
  status: 'success' | 'failed';
  command: Command;
  dryRun: boolean;
  // Local path of the input video.
  videoPath?: string;
  // Public URL returned by the upload endpoint.
  videoUrl?: string;
  // LLM-generated product description (prompted for Chinese).
  description?: string;
  // Chinese search keyword derived from the description.
  keyword?: string;
  // 1688 keyword-search hits, sorted by sales descending.
  searchResults?: SearchItem[];
  // NOTE(review): not populated by the video pipeline — presumably reserved
  // for parity with other result shapes; confirm before relying on it.
  rerank?: unknown;
  error?: string;
}
export type OutputResult = DetectResult | SearchResult | DetectVideoResult;

97
src/video-analyzer.ts Normal file
View File

@ -0,0 +1,97 @@
import * as fs from 'fs';
import type { VisionConfig } from './index.ts';
import { createSkillClient } from './auth-cli.ts';
const UPLOAD_ENDPOINT =
process.env.ONEBOUND_UPLOAD_ENDPOINT ||
'http://localhost:3202/api/v1/tasks/upload-image';
/**
* Upload a video file to get a public URL.
*
* Uses direct HTTP fetch (not auth-rt CLI) to avoid E2BIG errors
* when the base64-encoded video exceeds the command-line argument limit.
*/
/**
 * Upload a video file and return its public URL.
 *
 * Uses direct HTTP fetch (not the auth-rt CLI) to avoid E2BIG errors when
 * the base64-encoded video exceeds the OS command-line argument limit.
 *
 * @param videoPath local path of the video to upload
 * @returns the public URL reported by the upload endpoint
 * @throws when the endpoint responds non-2xx or omits `url`
 */
export async function uploadVideo(videoPath: string): Promise<string> {
  const client = createSkillClient();
  const { accessToken } = await client.session();

  const videoBuffer = fs.readFileSync(videoPath);
  // Derive the extension case-insensitively so `.MOV` / `.MP4` map to the
  // correct MIME type instead of e.g. `video/MOV`.
  const ext = (videoPath.match(/\.(\w+)$/)?.[1] || 'mp4').toLowerCase();
  const filename = `video-${Date.now()}.${ext}`;
  // The MIME subtype usually matches the extension; QuickTime is the exception.
  const contentType = ext === 'mov' ? 'video/quicktime' : `video/${ext}`;

  const response = await fetch(UPLOAD_ENDPOINT, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${accessToken}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      data: videoBuffer.toString('base64'),
      filename,
      contentType,
    }),
  });

  if (!response.ok) {
    // Truncate the error body so huge HTML error pages don't flood logs.
    const errBody = await response.text().catch(() => 'unknown');
    throw new Error(`Video upload failed (${response.status}): ${errBody.slice(0, 300)}`);
  }

  const json = (await response.json()) as { url?: string };
  if (!json.url) throw new Error('Upload response missing url');
  return json.url;
}
export interface VideoAnalysis {
description: string;
rawResponse?: string;
}
/**
 * Ask the LLM to describe the main product shown in an uploaded video.
 *
 * Calls the Chat Completions endpoint directly with fetch (not the ai SDK)
 * because the `video_url` content part is a LiteLLM extension.
 *
 * @param videoUrl public URL of the uploaded video
 * @param config   vision model endpoint + credentials
 * @throws on non-2xx responses or an empty/non-text completion
 */
export async function analyzeVideo(
  videoUrl: string,
  config: VisionConfig,
): Promise<VideoAnalysis> {
  // The same config feeds createOpenAI elsewhere, whose baseURL convention
  // already ends in `/v1` — appending `/v1/chat/completions` blindly would
  // produce `/v1/v1/...`. Only add `/v1` when it is not already present.
  const base = config.baseURL.replace(/\/+$/, '');
  const endpoint = base.endsWith('/v1')
    ? `${base}/chat/completions`
    : `${base}/v1/chat/completions`;

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${config.apiKey}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: config.model,
      messages: [
        {
          role: 'user',
          content: [
            {
              type: 'video_url',
              video_url: { url: videoUrl },
            },
            {
              type: 'text',
              text: '找出视频中的商品主体,用中文简要描述商品名称、材质、颜色、功能。',
            },
          ],
        },
      ],
      max_tokens: 500,
    }),
  });

  if (!response.ok) {
    const errBody = await response.text().catch(() => 'unknown');
    throw new Error(
      `Video analysis API error (${response.status}): ${errBody.slice(0, 500)}`,
    );
  }

  const json = (await response.json()) as any;
  const content = json?.choices?.[0]?.message?.content;
  // Some proxies return structured (array) content; accept plain text only
  // so `.trim()` below cannot throw on a non-string.
  if (typeof content !== 'string' || content.trim() === '') {
    throw new Error('Video analysis returned empty response');
  }
  return { description: content.trim(), rawResponse: JSON.stringify(json) };
}