// video-product-finder/src/index.ts (TypeScript, 559 lines, 22 KiB)

import * as fs from 'fs';
import * as path from 'path';
import type { Command, DetectOptions, DetectResult, SearchResult, OutputResult, SearchItem, DetectVideoResult, DetectVideoAndSearchResult } from './types.ts';
import { createSkillClient } from './auth-cli.ts';
import { extractFrames } from './frame-extractor.ts';
import { detectProductFrames, detectBestFrame } from './product-detector.ts';
import { postFilterByImage } from './post-filter.ts';
import { generateText } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
/**
 * Connection settings for the vision/LLM provider.
 * Produced by `loadVisionConfig` and passed to `createVisionModel` and to the
 * frame-detection and post-filter helpers.
 */
export interface VisionConfig {
/** Provider API key; forwarded to `createOpenAI` as `apiKey`. */
apiKey: string;
/** Optional provider endpoint; forwarded to `createOpenAI` as `baseURL`. */
baseURL?: string;
/** Model identifier handed to the provider factory returned by `createOpenAI`. */
model: string;
/** Optional session id; when set it is sent as the `x-langfuse-session-id` request header. */
sessionId?: string;
}
/**
 * Builds the vision configuration from the skill client's config.
 *
 * Model precedence: `VISION_MODEL` env var, then `metadata.provider.model`,
 * then the built-in default `'aliyun-cp-multimodal'`.
 * Session id: `SKILL_SESSION_ID` env var, otherwise a generated
 * `skill_<timestamp>_<random>` value.
 *
 * @param client Skill client whose `clientConfig()` supplies the provider metadata.
 * @returns The resolved vision configuration.
 * @throws Error when `metadata.provider.api_key` is missing or empty.
 */
async function loadVisionConfig(client: ReturnType<typeof createSkillClient>): Promise<VisionConfig> {
  const cfg = await client.clientConfig();
  const provider = cfg.metadata?.provider;
  const apiKey = provider?.api_key;
  if (!apiKey) throw new Error('Vision API key not found in client config (metadata.provider.api_key)');
  return {
    apiKey,
    baseURL: provider?.base_url,
    // `||` (not `??`) on purpose: an empty `VISION_MODEL=` or an empty configured
    // model must fall through to the next candidate instead of becoming the model name.
    model: process.env.VISION_MODEL || provider?.model || 'aliyun-cp-multimodal',
    sessionId: process.env.SKILL_SESSION_ID || `skill_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
  };
}
/**
 * CLI entry point: routes a command name to its handler.
 *
 * @param command Command to execute.
 * @param args Remaining CLI arguments, forwarded unchanged to the handler.
 * @param dryRun Forwarded to the handler.
 * @returns The handler's result, or a `failed` result for an unrecognised command.
 */
export async function run(
  command: Command,
  args: string[],
  dryRun: boolean,
): Promise<OutputResult> {
  // `session` is the only handler that takes no positional arguments.
  if (command === 'session') return runSession(dryRun);
  if (command === 'detect') return runDetect(args, dryRun);
  if (command === 'search') return runSearch(args, dryRun);
  if (command === 'detect-and-search') return runDetectAndSearch(args, dryRun);
  if (command === 'detect-best') return runDetectBest(args, dryRun);
  if (command === 'detect-best-and-search') return runDetectBestAndSearch(args, dryRun);
  if (command === 'detect-video') return runDetectVideo(args, dryRun);
  if (command === 'detect-video-and-search') return runDetectVideoAndSearch(args, dryRun);
  if (command === 'rerank') return runRerank(args, dryRun);
  return { status: 'failed', command, dryRun, error: `unknown command: ${command}` };
}
/**
 * `session` command: fetches the client's session and returns it merged into a
 * success envelope. Session fields are spread last, so they win on key clashes.
 */
async function runSession(dryRun: boolean): Promise<OutputResult> {
  const session = await createSkillClient({ dryRun }).session();
  const envelope = { status: 'success', command: 'session', dryRun };
  return { ...envelope, ...session } as any;
}
/**
 * `detect` command: extracts frames from a video and runs product detection on them.
 *
 * @param args `args[0]` is the video path; the rest are `--flag=value` options
 *   read by `parseDetectOptions`.
 * @param dryRun When true, only the path is validated and an empty success result is returned.
 * @returns Detection result; `bestSnapshot` is the first entry of `productFrames`.
 */
async function runDetect(args: string[], dryRun: boolean): Promise<DetectResult> {
  const [videoPath] = args;
  const fail = (error: string): DetectResult => ({ status: 'failed', command: 'detect', dryRun, error });

  if (!videoPath) return fail('detect requires <video-path>');
  if (!fs.existsSync(videoPath)) return fail(`video not found: ${videoPath}`);

  const { outputDir, intervalSeconds, maxFrames, minConfidence, concurrency } = parseDetectOptions(videoPath, args);

  if (dryRun) {
    return {
      status: 'success',
      command: 'detect',
      dryRun,
      videoPath,
      totalFramesExtracted: 0,
      productFrames: [],
      bestSnapshot: undefined,
    };
  }

  const visionConfig = await loadVisionConfig(createSkillClient());
  const frames = extractFrames(videoPath, outputDir, intervalSeconds, maxFrames);
  const productFrames = await detectProductFrames(frames, minConfidence, concurrency, visionConfig);
  const [bestSnapshot] = productFrames;

  return {
    status: 'success',
    command: 'detect',
    dryRun,
    videoPath,
    totalFramesExtracted: frames.length,
    productFrames,
    bestSnapshot,
  };
}
/**
 * Uploads a local image as base64 to `/ecom/tasks/upload-image`.
 *
 * @param client Skill client used for the POST.
 * @param imagePath Local file to read and upload.
 * @returns The `url` field of the upload response.
 * @throws Error on HTTP status >= 400 or when the response carries no `url`.
 */
async function uploadImage(client: ReturnType<typeof createSkillClient>, imagePath: string): Promise<string> {
  const payload = {
    data: fs.readFileSync(imagePath).toString('base64'),
    // The upload is always labelled as a JPEG snapshot, whatever the source file is.
    filename: `video-snapshot-${Date.now()}.jpg`,
    contentType: 'image/jpeg',
  };
  const response = await client.post('/ecom/tasks/upload-image', payload);
  if (response.status >= 400) {
    throw new Error(`Upload failed: HTTP ${response.status}`);
  }
  const parsed = JSON.parse(response.body) as { url?: string };
  const uploadedUrl = parsed.url;
  if (!uploadedUrl) {
    throw new Error('Upload response missing url');
  }
  return uploadedUrl;
}
/**
 * `search` command: runs 1688 search-by-image for a local image or an image URL.
 *
 * Local files are uploaded first and the returned URL is used as `imgid`;
 * `http(s)://` inputs are passed through as `imgid` without an upload.
 *
 * @param args `args[0]` is a local image path or an `http(s)://` URL.
 * @param dryRun When true, only the input is validated; no upload or search happens.
 * @returns A `failed` result for a missing input, a missing local file, an HTTP
 *   status >= 400 or a non-JSON response body; otherwise the parsed body.
 */
async function runSearch(args: string[], dryRun: boolean): Promise<SearchResult> {
  const imagePath = args[0];
  if (!imagePath) return { status: 'failed', command: 'search', dryRun, error: 'search requires <image-path>' };

  // Decide "remote" by URL scheme. A bare `startsWith('http')` would also match a
  // local file such as `http_banner.jpg`, and checking file existence first would
  // reject every URL before it could be used.
  const isRemote = /^https?:\/\//i.test(imagePath);
  if (!isRemote && !fs.existsSync(imagePath)) {
    return { status: 'failed', command: 'search', dryRun, error: `image not found: ${imagePath}` };
  }
  if (dryRun) {
    return { status: 'success', command: 'search', dryRun, imagePath, searchHttpStatus: 0, searchBody: null };
  }

  const client = createSkillClient();
  const imgid = isRemote ? imagePath : await uploadImage(client, imagePath);
  const res = await client.post('/ecom/tasks/search-by-image', { imgid, page: 1 });
  const searchHttpStatus = res.status;

  // Untyped JSON, exactly as `JSON.parse` returns it.
  let body: ReturnType<typeof JSON.parse>;
  try {
    body = JSON.parse(res.body);
  } catch {
    // Gateways can answer with HTML or plain text; report it instead of throwing.
    return {
      status: 'failed',
      command: 'search',
      dryRun,
      imagePath,
      searchHttpStatus,
      error: `search returned a non-JSON response (HTTP ${searchHttpStatus})`,
    };
  }

  if (res.status >= 400) {
    return { status: 'failed', command: 'search', dryRun, imagePath, searchHttpStatus, error: JSON.stringify(body) };
  }
  return { status: 'success', command: 'search', dryRun, imagePath, searchHttpStatus, searchBody: body };
}
/**
 * `detect-best` command: extracts frames and asks `detectBestFrame` for the single best one.
 *
 * Flags: `--output-dir` (default: `snapshots_<video-name>_<timestamp>` next to the
 * video), `--interval` (seconds, default 0.5), `--max-frames` (default 60).
 *
 * @param args `args[0]` is the video path; the rest are `--flag=value` options.
 * @param dryRun When true, only the path is validated and an empty success result is returned.
 * @returns Detection result with at most one product frame.
 */
async function runDetectBest(args: string[], dryRun: boolean): Promise<DetectResult> {
  const [videoPath] = args;
  if (!videoPath) {
    return { status: 'failed', command: 'detect-best', dryRun, error: 'detect-best requires <video-path>' };
  }
  if (!fs.existsSync(videoPath)) {
    return { status: 'failed', command: 'detect-best', dryRun, error: `video not found: ${videoPath}` };
  }

  const requestedDir = getFlag(args, '--output-dir');
  const outputDir = requestedDir
    ? requestedDir
    : path.join(
        path.dirname(videoPath),
        `snapshots_${path.basename(videoPath, path.extname(videoPath))}_${Date.now()}`,
      );
  const intervalSeconds = parseFloat(getFlag(args, '--interval') || '0.5');
  const maxFrames = parseInt(getFlag(args, '--max-frames') || '60', 10);

  if (dryRun) {
    return {
      status: 'success',
      command: 'detect-best',
      dryRun,
      videoPath,
      totalFramesExtracted: 0,
      productFrames: [],
      bestSnapshot: undefined,
    };
  }

  const visionConfig = await loadVisionConfig(createSkillClient());
  const frames = extractFrames(videoPath, outputDir, intervalSeconds, maxFrames);
  if (frames.length === 0) {
    return { status: 'failed', command: 'detect-best', dryRun, videoPath, error: 'no frames extracted from video' };
  }

  // The third argument (20) is passed through as-is; its meaning is defined by detectBestFrame.
  const best = await detectBestFrame(frames, visionConfig, 20);
  const productFrames = best ? [best] : [];
  return {
    status: 'success',
    command: 'detect-best',
    dryRun,
    videoPath,
    totalFramesExtracted: frames.length,
    productFrames,
    bestSnapshot: best ?? undefined,
  };
}
/**
 * `detect-best-and-search` command: best frame → image search → post-filter → top-10 ranking.
 *
 * 1. `runDetectBest` picks the snapshot.
 * 2. `runSearch` runs on its cropped image (or the full frame when no crop exists).
 * 3. `postFilterByImage` narrows the hits; the kept items replace
 *    `searchBody.data.items.item` in place.
 * 4. Ranking: when the post-filter kept items they are sorted by `sales`
 *    descending; otherwise `runRerank` is invoked on the search body.
 *
 * A failed search does not fail the command: its message is surfaced as `searchError`.
 *
 * @param args Forwarded to `runDetectBest`.
 * @param dryRun In dry run the detection result is returned with this command's name.
 * @returns The detection result extended with `searchHttpStatus`, `searchBody`,
 *   `searchError`, `postFilter` and `rerank`.
 */
async function runDetectBestAndSearch(args: string[], dryRun: boolean): Promise<OutputResult> {
  // Thrown values are not guaranteed to be Error instances.
  const describeError = (e: unknown): string => (e instanceof Error ? e.message : String(e));

  const detectResult = await runDetectBest(args, dryRun) as DetectResult;
  if (detectResult.status === 'failed') return detectResult;
  if (!detectResult.bestSnapshot) {
    if (dryRun) return { ...detectResult, command: 'detect-best-and-search' };
    return { ...detectResult, status: 'failed', error: 'no frame could be extracted from video' };
  }
  const best = detectResult.bestSnapshot;
  const imageForSearch = best.croppedImagePath || best.imagePath;
  const searchResult = await runSearch([imageForSearch], dryRun) as SearchResult;

  // Post-filter: drop results whose pic_url isn't the same product type as our snapshot
  let postFilter: any = undefined;
  if (!dryRun && searchResult.status === 'success' && searchResult.searchBody) {
    const items: SearchItem[] = (searchResult.searchBody as any)?.data?.items?.item ?? [];
    if (items.length > 0) {
      try {
        const client = createSkillClient();
        const visionConfig = await loadVisionConfig(client);
        const result = await postFilterByImage(imageForSearch, items, visionConfig, { description: best.description });
        (searchResult.searchBody as any).data.items.item = result.kept;
        postFilter = {
          totalChecked: result.totalChecked,
          keptCount: result.kept.length,
          rejectedCount: result.rejected.length,
          failed: result.failed,
        };
      } catch (e: unknown) {
        postFilter = { error: describeError(e) };
      }
    }
  }

  let rerankResult: any = undefined;
  // If post-filter produced focused results, sort them directly by sales — they're already the best matches.
  // Otherwise fall back to the keyword-intersection rerank.
  if (!dryRun && postFilter && !postFilter.error && postFilter.keptCount > 0) {
    const items: SearchItem[] = (searchResult.searchBody as any)?.data?.items?.item ?? [];
    const sorted = [...items].sort((a, b) => (b.sales ?? 0) - (a.sales ?? 0)).slice(0, 10);
    rerankResult = {
      source: 'post-filter',
      results: sorted,
      count: sorted.length,
    };
  } else if (!dryRun && searchResult.status === 'success' && searchResult.searchBody) {
    // NOTE(review): when the post-filter rejected every item, searchBody already holds an
    // empty item list here, so runRerank reports "no items found" — confirm that is intended.
    const tmpFile = path.join(path.dirname(imageForSearch), `search_body_${Date.now()}.json`);
    try {
      fs.writeFileSync(tmpFile, JSON.stringify(searchResult.searchBody));
      rerankResult = await runRerank([
        `--image-results=${tmpFile}`,
        // Guard against interpolating the literal string "undefined" as the description.
        `--description=${best.description ?? ''}`,
        '--top=10',
      ], dryRun);
    } catch (e: unknown) {
      rerankResult = { error: describeError(e) };
    } finally {
      // Best-effort cleanup: the temp file may never have been written.
      try { fs.unlinkSync(tmpFile); } catch {}
    }
  }

  return {
    ...detectResult,
    command: 'detect-best-and-search',
    searchHttpStatus: searchResult.searchHttpStatus,
    searchBody: searchResult.searchBody,
    searchError: searchResult.error,
    postFilter,
    rerank: rerankResult,
  } as any;
}
/**
 * `detect-video` command: finds the best product frame in a video and derives a
 * Chinese search keyword from its description.
 *
 * @param args `args[0]` is the video path; all args are forwarded to `runDetectBest`.
 * @param dryRun When true, only the path is validated and placeholder values are
 *   returned; no frames are extracted and no model is called.
 * @returns `failed` when the path is missing or invalid, detection fails, or no
 *   description was produced; otherwise the description, keyword and snapshot path.
 */
async function runDetectVideo(args: string[], dryRun: boolean): Promise<DetectVideoResult> {
  const videoPath = args[0];
  if (!videoPath) return { status: 'failed', command: 'detect-video', dryRun, error: 'detect-video requires <video-path>' };
  if (!fs.existsSync(videoPath)) return { status: 'failed', command: 'detect-video', dryRun, error: `video not found: ${videoPath}` };

  // Dry run must short-circuit before detection: runDetectBest returns no snapshot in
  // dry run, so the "no product description" check below would always fail first.
  if (dryRun) {
    return {
      status: 'success',
      command: 'detect-video',
      dryRun,
      videoPath,
      videoUrl: null,
      description: '<dry-run>',
      keyword: '<dry-run-keyword>',
      snapshotImagePath: undefined,
    };
  }

  const detectResult = await runDetectBest(args, dryRun) as DetectResult;
  if (detectResult.status === 'failed') {
    return { status: 'failed', command: 'detect-video', dryRun, videoPath, error: detectResult.error || 'failed to detect best frame' };
  }
  const description = detectResult.bestSnapshot?.description?.trim();
  const snapshotImagePath = detectResult.bestSnapshot?.croppedImagePath || detectResult.bestSnapshot?.imagePath;
  if (!description) {
    return { status: 'failed', command: 'detect-video', dryRun, videoPath, error: 'no product description detected from video' };
  }

  const client = createSkillClient();
  const visionConfig = await loadVisionConfig(client);
  const keyword = await generateChineseKeyword(description, visionConfig);
  return { status: 'success', command: 'detect-video', dryRun, videoPath, videoUrl: null, description, keyword, snapshotImagePath };
}
/**
 * `detect-video-and-search` command: runs the detect-best-and-search pipeline and
 * flattens its outcome into description / keyword / result list.
 *
 * When the pipeline's rerank step yields no results, a direct keyword search is
 * performed, generating a keyword from the description if none is available.
 *
 * @param args `args[0]` is the video path; all args are forwarded to the pipeline.
 * @param dryRun When true, only the path is validated and placeholders are returned.
 */
async function runDetectVideoAndSearch(args: string[], dryRun: boolean): Promise<DetectVideoAndSearchResult> {
  const [videoPath] = args;
  if (!videoPath) {
    return { status: 'failed', command: 'detect-video-and-search', dryRun, error: 'detect-video-and-search requires <video-path>' };
  }
  if (!fs.existsSync(videoPath)) {
    return { status: 'failed', command: 'detect-video-and-search', dryRun, error: `video not found: ${videoPath}` };
  }
  if (dryRun) {
    return {
      status: 'success',
      command: 'detect-video-and-search',
      dryRun,
      videoPath,
      videoUrl: null,
      description: '<dry-run>',
      keyword: '<dry-run>',
      searchResults: [],
    };
  }

  // Reuse existing pipeline: best snapshot → image search → keyword rerank
  const pipeline = await runDetectBestAndSearch(args, dryRun) as any;
  if (pipeline.status === 'failed') {
    return {
      status: 'failed',
      command: 'detect-video-and-search',
      dryRun,
      videoPath,
      error: pipeline.error || 'detect-best-and-search failed',
    };
  }

  const description = String(pipeline.bestSnapshot?.description || '').trim();
  const rerank = pipeline.rerank;
  const keyword = String(rerank?.keyword || '').trim();
  const searchResults = (rerank?.results || []) as SearchItem[];

  // Shared success envelope; only the keyword and the result list vary.
  const succeed = (usedKeyword: string, found: SearchItem[]): DetectVideoAndSearchResult => ({
    status: 'success',
    command: 'detect-video-and-search',
    dryRun,
    videoPath,
    videoUrl: null,
    description,
    keyword: usedKeyword,
    searchResults: found,
  });

  if (searchResults.length) {
    return succeed(keyword, searchResults);
  }

  // Fallback: if rerank didn't produce anything, do keyword search directly.
  const client = createSkillClient();
  const visionConfig = await loadVisionConfig(client);
  let fallbackKeyword = keyword;
  if (!fallbackKeyword && description) {
    fallbackKeyword = await generateChineseKeyword(description, visionConfig);
  }
  let items: SearchItem[] = [];
  if (fallbackKeyword) {
    items = await keywordSearch(client, fallbackKeyword, 1);
  }
  return succeed(fallbackKeyword, items);
}
/**
 * `detect-and-search` command: product-frame detection → image search → post-filter → top-10 ranking.
 *
 * 1. `runDetect` finds product frames; its `bestSnapshot` is searched.
 * 2. `runSearch` runs on the cropped image (or the full frame when no crop exists).
 * 3. `postFilterByImage` narrows the hits; the kept items replace
 *    `searchBody.data.items.item` in place.
 * 4. Ranking: when the post-filter kept items they are sorted by `sales`
 *    descending; otherwise `runRerank` is invoked on the search body.
 *
 * A failed search does not fail the command: its message is surfaced as `searchError`.
 *
 * @param args Forwarded to `runDetect`.
 * @param dryRun In dry run the detection result is returned with this command's name.
 * @returns The detection result extended with `searchHttpStatus`, `searchBody`,
 *   `searchError`, `postFilter` and `rerank`.
 */
async function runDetectAndSearch(args: string[], dryRun: boolean): Promise<OutputResult> {
  // Thrown values are not guaranteed to be Error instances.
  const describeError = (e: unknown): string => (e instanceof Error ? e.message : String(e));

  const detectResult = await runDetect(args, dryRun) as DetectResult;
  if (detectResult.status === 'failed') return detectResult;
  if (!detectResult.bestSnapshot) {
    // Dry runs never produce a snapshot; report success like detect-best-and-search
    // does instead of failing every dry run with "no product detected".
    if (dryRun) return { ...detectResult, command: 'detect-and-search' };
    return { ...detectResult, status: 'failed', error: 'no product detected in video' };
  }
  const best = detectResult.bestSnapshot;
  const imageForSearch = best.croppedImagePath || best.imagePath;
  const searchResult = await runSearch([imageForSearch], dryRun) as SearchResult;

  // Post-filter: drop results whose pic_url isn't the same product type as our snapshot
  let postFilter: any = undefined;
  if (!dryRun && searchResult.status === 'success' && searchResult.searchBody) {
    const items: SearchItem[] = (searchResult.searchBody as any)?.data?.items?.item ?? [];
    if (items.length > 0) {
      try {
        const client = createSkillClient();
        const visionConfig = await loadVisionConfig(client);
        const result = await postFilterByImage(imageForSearch, items, visionConfig, { description: best.description });
        (searchResult.searchBody as any).data.items.item = result.kept;
        postFilter = {
          totalChecked: result.totalChecked,
          keptCount: result.kept.length,
          rejectedCount: result.rejected.length,
          failed: result.failed,
        };
      } catch (e: unknown) {
        postFilter = { error: describeError(e) };
      }
    }
  }

  let rerankResult: any = undefined;
  // If post-filter produced focused results, sort them directly by sales — they're already the best matches.
  // Otherwise fall back to the keyword-intersection rerank.
  if (!dryRun && postFilter && !postFilter.error && postFilter.keptCount > 0) {
    const items: SearchItem[] = (searchResult.searchBody as any)?.data?.items?.item ?? [];
    const sorted = [...items].sort((a, b) => (b.sales ?? 0) - (a.sales ?? 0)).slice(0, 10);
    rerankResult = {
      source: 'post-filter',
      results: sorted,
      count: sorted.length,
    };
  } else if (!dryRun && searchResult.status === 'success' && searchResult.searchBody) {
    // NOTE(review): when the post-filter rejected every item, searchBody already holds an
    // empty item list here, so runRerank reports "no items found" — confirm that is intended.
    const tmpFile = path.join(path.dirname(imageForSearch), `search_body_${Date.now()}.json`);
    try {
      fs.writeFileSync(tmpFile, JSON.stringify(searchResult.searchBody));
      rerankResult = await runRerank([
        `--image-results=${tmpFile}`,
        // Guard against interpolating the literal string "undefined" as the description.
        `--description=${best.description ?? ''}`,
        '--top=10',
      ], dryRun);
    } catch (e: unknown) {
      rerankResult = { error: describeError(e) };
    } finally {
      // Best-effort cleanup: the temp file may never have been written.
      try { fs.unlinkSync(tmpFile); } catch {}
    }
  }

  return {
    ...detectResult,
    command: 'detect-and-search',
    searchHttpStatus: searchResult.searchHttpStatus,
    searchBody: searchResult.searchBody,
    searchError: searchResult.error,
    postFilter,
    rerank: rerankResult,
  } as any;
}
/**
 * Reads the `detect` command's `--flag=value` options.
 *
 * Defaults: `--output-dir` = `snapshots_<video-name>_<timestamp>` next to the
 * video, `--interval` = 1 (seconds), `--max-frames` = 60,
 * `--min-confidence` = 0.7, `--concurrency` = 5.
 *
 * @param videoPath Video path; used to derive the default output directory.
 * @param args Raw CLI arguments.
 * @returns The resolved detection options.
 */
function parseDetectOptions(videoPath: string, args: string[]): DetectOptions {
  const outputDir = getFlag(args, '--output-dir') || path.join(
    path.dirname(videoPath),
    `snapshots_${path.basename(videoPath, path.extname(videoPath))}_${Date.now()}`,
  );
  return {
    videoPath,
    // parseFloat, not parseInt: sub-second intervals such as `--interval=0.5` must not
    // truncate to 0 (detect-best already accepts fractional intervals).
    intervalSeconds: parseFloat(getFlag(args, '--interval') || '1'),
    maxFrames: parseInt(getFlag(args, '--max-frames') || '60', 10),
    outputDir,
    minConfidence: parseFloat(getFlag(args, '--min-confidence') || '0.7'),
    concurrency: parseInt(getFlag(args, '--concurrency') || '5', 10),
  };
}
/**
 * Returns the value of the first `flag=value` argument, or `undefined` when the
 * flag is absent. Only the `=` form is recognised; `--flag value` is not.
 */
function getFlag(args: string[], flag: string): string | undefined {
  const prefix = `${flag}=`;
  const hit = args.find((candidate) => candidate.startsWith(prefix));
  return hit === undefined ? undefined : hit.slice(prefix.length);
}
/**
 * Creates the model handle used for text generation.
 * Every request carries the `x-langfuse-tags` header; `x-langfuse-session-id`
 * is added only when the config has a session id.
 */
function createVisionModel(config: VisionConfig) {
  const { apiKey, baseURL, model, sessionId } = config;
  const headers: Record<string, string> = {
    'x-langfuse-tags': 'skill:video-product-snapshot',
    ...(sessionId ? { 'x-langfuse-session-id': sessionId } : {}),
  };
  const provider = createOpenAI({ apiKey, baseURL, headers });
  return provider(model);
}
/**
 * Asks the model for a short Chinese 1688.com search keyword describing the product.
 *
 * @param description Product description to turn into a keyword.
 * @param visionConfig Provider settings used to build the model.
 * @returns The model output with everything except CJK ideographs
 *   (U+4E00–U+9FFF, U+3400–U+4DBF) removed; may be an empty string.
 */
async function generateChineseKeyword(description: string, visionConfig: VisionConfig): Promise<string> {
  const prompt = `You are generating a 1688.com (Chinese B2B wholesale) product search keyword.
Rules:
- Output ONLY 2-4 Chinese words — the product OBJECT TYPE + 1-2 key material/feature words
- CRITICAL: If the product is a container, organizer, rack, shelf, bag, box, or holder, the keyword MUST name THAT object — NOT the items it holds.
Examples: shoe rack → "金属鞋架", cable organizer → "理线器", storage shelf → "收纳架", toolbox → "工具箱"
- Use common Chinese commerce terms, NOT a literal translation
- No English, no punctuation, no explanation
- Short broad terms work better than long specific phrases
Product description: ${description}
Output only the search query:`;

  const reply = await generateText({ model: createVisionModel(visionConfig), prompt });

  // Strip anything that is not a CJK ideograph (quotes, spaces, Latin text, punctuation).
  const nonIdeographs = /[^\u4e00-\u9fff\u3400-\u4dbf]/g;
  const cleaned = reply.text.trim().replace(nonIdeographs, '');
  return cleaned.trim();
}
/**
 * Runs a keyword search via `/ecom/tasks/keyword-search`.
 *
 * @param client Skill client used for the POST.
 * @param keyword Search keyword.
 * @param page Result page to request (default 1).
 * @returns `data.items.item` from the response, or an empty array when absent.
 *   The HTTP status is not inspected.
 */
async function keywordSearch(client: ReturnType<typeof createSkillClient>, keyword: string, page = 1): Promise<SearchItem[]> {
  const response = await client.post('/ecom/tasks/keyword-search', { keyword, page });
  const payload = JSON.parse(response.body) as any;
  const found = payload?.data?.items?.item;
  return (found ?? []) as SearchItem[];
}
/** True when the string contains at least one character in U+4E00–U+9FFF. */
function hasChinese(str: string): boolean {
  const firstIdeograph = str.search(/[\u4e00-\u9fff]/);
  return firstIdeograph !== -1;
}
/**
 * Builds a fallback keyword from item titles: counts every pair of adjacent
 * Chinese characters in the first `topN` titles and concatenates the three most
 * frequent pairs. Ties keep first-seen order.
 *
 * @param items Search items whose `title` fields are scanned.
 * @param topN Number of leading items to consider (default 5).
 * @returns Up to three bigrams joined together; empty when no pair was found.
 */
function extractKeywordsFromTitles(items: SearchItem[], topN = 5): string {
  const chinesePair = /[\u4e00-\u9fff]{2}/;
  const counts = new Map<string, number>();

  for (const { title } of items.slice(0, topN)) {
    const text = title || '';
    for (let start = 0; start + 1 < text.length; start++) {
      const pair = text.slice(start, start + 2);
      if (!chinesePair.test(pair)) continue;
      counts.set(pair, (counts.get(pair) ?? 0) + 1);
    }
  }

  const ranked = [...counts.entries()];
  ranked.sort(([, left], [, right]) => right - left);
  return ranked
    .slice(0, 3)
    .map(([pair]) => pair)
    .join('');
}
/**
 * `rerank` command: cross-checks image-search hits against a keyword search.
 *
 * Items present in both result sets (matched by `num_iid`) are preferred; when
 * the two sets do not overlap, the keyword results are used instead. The chosen
 * list is sorted by `turn_head` descending and cut to the top N.
 *
 * Flags: `--image-results=<json-file|json>` (or first positional),
 * `--keyword=<text>` (or second positional), `--description=<text>`,
 * `--top=<n>` (default 10).
 *
 * When the supplied keyword contains no Chinese characters, one is generated
 * from the description (preferred), the keyword, or the item titles.
 *
 * @param args Raw CLI arguments.
 * @param dryRun When true, arguments are validated and nothing else runs.
 * @returns A result with the keyword used, counts, and the ranked items; `failed`
 *   when the input cannot be parsed, holds no items, no keyword can be derived,
 *   or the keyword search throws.
 */
async function runRerank(args: string[], dryRun: boolean): Promise<OutputResult> {
  // --image-results=<path|json> --keyword=<text> --description=<text> --top=<n>
  const positionals = args.filter((a) => !a.startsWith('--'));
  const imageResultsArg = getFlag(args, '--image-results') || positionals[0];
  const keywordArg = getFlag(args, '--keyword') || positionals[1];
  // A non-numeric or non-positive --top would otherwise produce an empty or
  // tail-trimmed slice; fall back to the default of 10 instead.
  const requestedTop = parseInt(getFlag(args, '--top') || '10', 10);
  const topN = Number.isFinite(requestedTop) && requestedTop > 0 ? requestedTop : 10;
  const description = getFlag(args, '--description') || '';

  if (!imageResultsArg) return { status: 'failed', command: 'rerank', dryRun, error: 'rerank requires --image-results=<json-file>' };
  if (dryRun) return { status: 'success', command: 'rerank', dryRun } as any;

  const client = createSkillClient();
  const visionConfig = await loadVisionConfig(client);

  let imageItems: SearchItem[];
  try {
    // The argument is either a path to a JSON file or the JSON text itself.
    const raw = fs.existsSync(imageResultsArg)
      ? fs.readFileSync(imageResultsArg, 'utf-8')
      : imageResultsArg;
    const parsed = JSON.parse(raw);
    imageItems = parsed?.data?.items?.item ?? parsed?.items?.item ?? (Array.isArray(parsed) ? parsed : []);
  } catch {
    return { status: 'failed', command: 'rerank', dryRun, error: 'failed to parse image-results JSON' };
  }
  if (!imageItems.length) {
    return { status: 'failed', command: 'rerank', dryRun, error: 'no items found in image-results JSON' };
  }

  // Determine Chinese keyword to use
  let keyword = keywordArg || '';
  let autoGeneratedKeyword = '';
  if (!hasChinese(keyword)) {
    // Prefer product description for accurate translation; fall back to image titles
    const sourceText = description || keyword || extractKeywordsFromTitles(imageItems);
    try {
      autoGeneratedKeyword = await generateChineseKeyword(sourceText, visionConfig);
    } catch {
      autoGeneratedKeyword = '';
    }
    // The generator strips every non-Chinese character, so it can legitimately
    // return ''. Treat that like a failure and derive the keyword from titles.
    if (!autoGeneratedKeyword) {
      autoGeneratedKeyword = extractKeywordsFromTitles(imageItems);
    }
    keyword = autoGeneratedKeyword;
  }
  if (!keyword) {
    // Searching with an empty keyword cannot return results related to the product.
    return { status: 'failed', command: 'rerank', dryRun, error: 'could not derive a Chinese search keyword' };
  }

  // Keyword search on 1688
  let keywordItems: SearchItem[] = [];
  try {
    keywordItems = await keywordSearch(client, keyword);
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    return { status: 'failed', command: 'rerank', dryRun, error: `keyword search failed: ${reason}` };
  }

  // Intersect by num_iid
  const keywordIds = new Set(keywordItems.map((i) => String(i.num_iid)));
  const intersected = imageItems.filter((i) => keywordIds.has(String(i.num_iid)));

  // If still no intersection, fall back to keyword results (at least they match the category)
  const usedFallback = intersected.length === 0;
  const results = usedFallback ? keywordItems : intersected;

  // Sort by turn_head descending. Missing or non-numeric values count as 0 so the
  // comparator never returns NaN; the input array is copied rather than mutated.
  const turnHead = (item: SearchItem): number => {
    const value = parseFloat(String(item.turn_head ?? '0'));
    return Number.isFinite(value) ? value : 0;
  };
  const sorted = [...results]
    .sort((a, b) => turnHead(b) - turnHead(a))
    .slice(0, topN);

  return {
    status: 'success',
    command: 'rerank',
    dryRun,
    keyword,
    autoGeneratedKeyword: autoGeneratedKeyword || undefined,
    imageResultsCount: imageItems.length,
    keywordResultsCount: keywordItems.length,
    intersectedCount: intersected.length,
    usedFallback,
    results: sorted,
  } as any;
}