import * as fs from 'fs';
import * as path from 'path';
import type {
  Command,
  DetectOptions,
  DetectResult,
  SearchResult,
  OutputResult,
  SearchItem,
  DetectVideoResult,
  DetectVideoAndSearchResult,
} from './types.ts';
import { createSkillClient } from './auth-cli.ts';
import { extractFrames } from './frame-extractor.ts';
import { detectProductFrames, detectBestFrame } from './product-detector.ts';
import { postFilterByImage } from './post-filter.ts';
import { generateText } from 'ai';
import { createOpenAI } from '@ai-sdk/openai';

// NOTE(review): the generic type arguments in this module (`Promise<...>`,
// `ReturnType<...>`) were reconstructed from usage — confirm the per-command
// result types against ./types.ts.

/** Client object produced by `createSkillClient`. */
type SkillClient = ReturnType<typeof createSkillClient>;

/** Connection settings for the OpenAI-compatible vision/text model. */
export interface VisionConfig {
  apiKey: string;
  baseURL?: string;
  model: string;
}

/** Matches a single CJK Unified Ideograph (basic block). */
const CHINESE_CHAR = /[\u4e00-\u9fff]/;
/** Matches two consecutive CJK Unified Ideographs. */
const CHINESE_BIGRAM = /[\u4e00-\u9fff]{2}/;
/** Matches inputs that are http(s) URLs rather than local file paths. */
const REMOTE_URL = /^https?:\/\//i;

/** Extracts a printable message from a caught value of unknown type. */
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Builds the vision model configuration from the client config.
 *
 * The model name is taken from `VISION_MODEL`, then from
 * `metadata.provider.model`, then defaults to `'aliyun-cp-multimodal'`.
 *
 * @throws Error when `metadata.provider.api_key` is missing.
 */
async function loadVisionConfig(client: SkillClient): Promise<VisionConfig> {
  const cfg = await client.clientConfig();
  const provider = cfg.metadata?.provider;
  const apiKey = provider?.api_key;
  if (!apiKey) throw new Error('Vision API key not found in client config (metadata.provider.api_key)');
  return {
    apiKey,
    baseURL: provider?.base_url,
    model: process.env.VISION_MODEL ?? provider?.model ?? 'aliyun-cp-multimodal',
  };
}

/**
 * Dispatches a CLI command to its handler.
 *
 * @param command Command name.
 * @param args    Remaining arguments (positionals and `--flag=value` pairs).
 * @param dryRun  When true, handlers validate input but skip network/model work.
 * @returns The handler's result; unknown commands yield a `failed` result.
 */
export async function run(
  command: Command,
  args: string[],
  dryRun: boolean,
): Promise<OutputResult> {
  switch (command) {
    case 'session':
      return runSession(dryRun);
    case 'detect':
      return runDetect(args, dryRun);
    case 'search':
      return runSearch(args, dryRun);
    case 'detect-and-search':
      return runDetectAndSearch(args, dryRun);
    case 'detect-best':
      return runDetectBest(args, dryRun);
    case 'detect-best-and-search':
      return runDetectBestAndSearch(args, dryRun);
    case 'detect-video':
      return runDetectVideo(args, dryRun);
    case 'detect-video-and-search':
      return runDetectVideoAndSearch(args, dryRun);
    case 'rerank':
      return runRerank(args, dryRun);
    default:
      return { status: 'failed', command, dryRun, error: `unknown command: ${command}` };
  }
}

/** Fetches the client session and merges its fields into a success result. */
async function runSession(dryRun: boolean): Promise<OutputResult> {
  const client = createSkillClient({ dryRun });
  const session = await client.session();
  return { status: 'success', command: 'session', dryRun, ...session } as any;
}

/**
 * `detect <video-path>`: extracts frames and runs product detection on them.
 *
 * `bestSnapshot` is the first entry of the detector's output
 * (presumably the highest-ranked frame — confirm against product-detector).
 */
async function runDetect(args: string[], dryRun: boolean): Promise<OutputResult> {
  const command = 'detect' as const;
  const videoPath = args[0];
  if (!videoPath) return { status: 'failed', command, dryRun, error: 'detect requires <video-path>' };
  if (!fs.existsSync(videoPath)) {
    return { status: 'failed', command, dryRun, error: `video not found: ${videoPath}` };
  }

  const opts = parseDetectOptions(videoPath, args);
  if (dryRun) {
    return {
      status: 'success',
      command,
      dryRun,
      videoPath,
      totalFramesExtracted: 0,
      productFrames: [],
      bestSnapshot: undefined,
    };
  }

  const client = createSkillClient();
  const visionConfig = await loadVisionConfig(client);
  const frames = extractFrames(videoPath, opts.outputDir, opts.intervalSeconds, opts.maxFrames);
  const productFrames = await detectProductFrames(frames, opts.minConfidence, opts.concurrency, visionConfig);
  return {
    status: 'success',
    command,
    dryRun,
    videoPath,
    totalFramesExtracted: frames.length,
    productFrames,
    bestSnapshot: productFrames[0],
  };
}

/**
 * Uploads a local image (sent as base64, declared as JPEG) and returns its URL.
 *
 * @throws Error on an HTTP status >= 400 or when the response has no `url`.
 */
async function uploadImage(client: SkillClient, imagePath: string): Promise<string> {
  const imageBuffer = fs.readFileSync(imagePath);
  const filename = `video-snapshot-${Date.now()}.jpg`;
  const res = await client.post('/ecom/tasks/upload-image', {
    data: imageBuffer.toString('base64'),
    filename,
    contentType: 'image/jpeg',
  });
  if (res.status >= 400) throw new Error(`Upload failed: HTTP ${res.status}`);
  const json = JSON.parse(res.body) as { url?: string };
  if (!json.url) throw new Error('Upload response missing url');
  return json.url;
}

/**
 * `search <image-path>`: runs an image search for a local file or http(s) URL.
 *
 * Local files are uploaded first and the returned URL is used as `imgid`;
 * URLs are passed through unchanged.
 */
async function runSearch(args: string[], dryRun: boolean): Promise<OutputResult> {
  const command = 'search' as const;
  const imagePath = args[0];
  if (!imagePath) return { status: 'failed', command, dryRun, error: 'search requires <image-path>' };

  // URLs must bypass the filesystem check, otherwise they can never be searched.
  const isRemote = REMOTE_URL.test(imagePath);
  if (!isRemote && !fs.existsSync(imagePath)) {
    return { status: 'failed', command, dryRun, error: `image not found: ${imagePath}` };
  }
  if (dryRun) {
    return { status: 'success', command, dryRun, imagePath, searchHttpStatus: 0, searchBody: null };
  }

  const client = createSkillClient();
  const imgid = isRemote ? imagePath : await uploadImage(client, imagePath);
  const res = await client.post('/ecom/tasks/search-by-image', { imgid, page: 1 });
  const searchHttpStatus = res.status;

  // Error responses are not guaranteed to be JSON; never let parsing throw here.
  let body: any;
  let bodyIsJson = true;
  try {
    body = JSON.parse(res.body);
  } catch {
    bodyIsJson = false;
  }

  if (res.status >= 400) {
    const error = bodyIsJson ? JSON.stringify(body) : String(res.body);
    return { status: 'failed', command, dryRun, imagePath, searchHttpStatus, error };
  }
  if (!bodyIsJson) {
    return {
      status: 'failed',
      command,
      dryRun,
      imagePath,
      searchHttpStatus,
      error: 'search response was not valid JSON',
    };
  }
  return { status: 'success', command, dryRun, imagePath, searchHttpStatus, searchBody: body };
}

/**
 * `detect-best <video-path>`: extracts frames and asks the detector for the
 * single best frame.
 *
 * Flags: `--output-dir`, `--interval` (seconds, default 0.5),
 * `--max-frames` (default 60).
 */
async function runDetectBest(args: string[], dryRun: boolean): Promise<OutputResult> {
  const command = 'detect-best' as const;
  const videoPath = args[0];
  if (!videoPath) return { status: 'failed', command, dryRun, error: 'detect-best requires <video-path>' };
  if (!fs.existsSync(videoPath)) {
    return { status: 'failed', command, dryRun, error: `video not found: ${videoPath}` };
  }

  const outputDir = getFlag(args, '--output-dir') || defaultSnapshotDir(videoPath);
  const intervalSeconds = getNumberFlag(args, '--interval', 0.5);
  const maxFrames = getNumberFlag(args, '--max-frames', 60, true);

  if (dryRun) {
    return {
      status: 'success',
      command,
      dryRun,
      videoPath,
      totalFramesExtracted: 0,
      productFrames: [],
      bestSnapshot: undefined,
    };
  }

  const client = createSkillClient();
  const visionConfig = await loadVisionConfig(client);
  const frames = extractFrames(videoPath, outputDir, intervalSeconds, maxFrames);
  if (frames.length === 0) {
    return { status: 'failed', command, dryRun, videoPath, error: 'no frames extracted from video' };
  }

  // NOTE(review): the meaning of the third argument (20) is defined by detectBestFrame.
  const best = await detectBestFrame(frames, visionConfig, 20);
  return {
    status: 'success',
    command,
    dryRun,
    videoPath,
    totalFramesExtracted: frames.length,
    productFrames: best ? [best] : [],
    bestSnapshot: best ?? undefined,
  };
}

/**
 * Shared tail of the `*-and-search` commands: image search on the best
 * snapshot, vision post-filter of the hits, then ranking.
 *
 * Ranking uses the post-filtered items sorted by `sales` (top 10) when the
 * post-filter kept anything; otherwise it falls back to `runRerank`.
 *
 * @param detectResult    Result of the preceding detect step.
 * @param command         Command name to report in the combined result.
 * @param noSnapshotError Error text used when detection produced no snapshot.
 * @param dryRun          Dry-run flag; a dry run without a snapshot succeeds.
 */
async function searchAndRankSnapshot(
  detectResult: DetectResult,
  command: 'detect-and-search' | 'detect-best-and-search',
  noSnapshotError: string,
  dryRun: boolean,
): Promise<OutputResult> {
  if (detectResult.status === 'failed') return detectResult;

  const best = detectResult.bestSnapshot;
  if (!best) {
    // Dry-run detection never yields a snapshot; that is not a failure.
    if (dryRun) return { ...detectResult, command };
    return { ...detectResult, status: 'failed', error: noSnapshotError };
  }

  const imageForSearch = best.croppedImagePath || best.imagePath;
  const searchResult = (await runSearch([imageForSearch], dryRun)) as SearchResult;
  const searchSucceeded = !dryRun && searchResult.status === 'success' && Boolean(searchResult.searchBody);

  // Post-filter: drop results whose pic_url isn't the same product type as our snapshot
  let postFilter: any = undefined;
  if (searchSucceeded) {
    const items: SearchItem[] = (searchResult.searchBody as any)?.data?.items?.item ?? [];
    if (items.length > 0) {
      try {
        const client = createSkillClient();
        const visionConfig = await loadVisionConfig(client);
        const result = await postFilterByImage(imageForSearch, items, visionConfig, {
          description: best.description,
        });
        (searchResult.searchBody as any).data.items.item = result.kept;
        postFilter = {
          totalChecked: result.totalChecked,
          keptCount: result.kept.length,
          rejectedCount: result.rejected.length,
          failed: result.failed,
        };
      } catch (e: unknown) {
        postFilter = { error: errorMessage(e) };
      }
    }
  }

  let rerankResult: any = undefined;
  // If post-filter produced focused results, sort them directly by sales — they're already the best matches.
  // Otherwise fall back to the keyword-intersection rerank.
  if (searchSucceeded && postFilter && !postFilter.error && postFilter.keptCount > 0) {
    const kept: SearchItem[] = (searchResult.searchBody as any)?.data?.items?.item ?? [];
    const sorted = [...kept].sort((a, b) => (b.sales ?? 0) - (a.sales ?? 0)).slice(0, 10);
    rerankResult = {
      source: 'post-filter',
      results: sorted,
      count: sorted.length,
    };
  } else if (searchSucceeded) {
    const tmpFile = path.join(path.dirname(imageForSearch), `search_body_${Date.now()}.json`);
    try {
      fs.writeFileSync(tmpFile, JSON.stringify(searchResult.searchBody));
      const rerankArgs = [`--image-results=${tmpFile}`, '--top=10'];
      // Avoid passing the literal text "undefined" as a description.
      if (best.description) rerankArgs.push(`--description=${best.description}`);
      rerankResult = await runRerank(rerankArgs, dryRun);
    } catch (e: unknown) {
      rerankResult = { error: errorMessage(e) };
    } finally {
      // Best-effort cleanup: the temp file may not exist if the write failed.
      try {
        fs.unlinkSync(tmpFile);
      } catch {}
    }
  }

  return {
    ...detectResult,
    command,
    searchHttpStatus: searchResult.searchHttpStatus,
    searchBody: searchResult.searchBody,
    searchError: searchResult.error,
    postFilter,
    rerank: rerankResult,
  } as any;
}

/** `detect-best-and-search <video-path>`: best-frame detection followed by search and ranking. */
async function runDetectBestAndSearch(args: string[], dryRun: boolean): Promise<OutputResult> {
  const detectResult = (await runDetectBest(args, dryRun)) as DetectResult;
  return searchAndRankSnapshot(
    detectResult,
    'detect-best-and-search',
    'no frame could be extracted from video',
    dryRun,
  );
}

/**
 * `detect-video <video-path>`: finds the best frame and derives a Chinese
 * search keyword from its description.
 */
async function runDetectVideo(args: string[], dryRun: boolean): Promise<OutputResult> {
  const command = 'detect-video' as const;
  const videoPath = args[0];
  if (!videoPath) return { status: 'failed', command, dryRun, error: 'detect-video requires <video-path>' };
  if (!fs.existsSync(videoPath)) {
    return { status: 'failed', command, dryRun, error: `video not found: ${videoPath}` };
  }

  // A dry run produces no snapshot, so it must return before the description check.
  if (dryRun) {
    return {
      status: 'success',
      command,
      dryRun,
      videoPath,
      videoUrl: null,
      description: '',
      keyword: '',
      snapshotImagePath: undefined,
    };
  }

  const detectResult = (await runDetectBest(args, dryRun)) as DetectResult;
  if (detectResult.status === 'failed') {
    return {
      status: 'failed',
      command,
      dryRun,
      videoPath,
      error: detectResult.error || 'failed to detect best frame',
    };
  }

  const description = detectResult.bestSnapshot?.description?.trim();
  const snapshotImagePath = detectResult.bestSnapshot?.croppedImagePath || detectResult.bestSnapshot?.imagePath;
  if (!description) {
    return {
      status: 'failed',
      command,
      dryRun,
      videoPath,
      error: 'no product description detected from video',
    };
  }

  const client = createSkillClient();
  const visionConfig = await loadVisionConfig(client);
  const keyword = await generateChineseKeyword(description, visionConfig);
  return {
    status: 'success',
    command,
    dryRun,
    videoPath,
    videoUrl: null,
    description,
    keyword,
    snapshotImagePath,
  };
}

/**
 * `detect-video-and-search <video-path>`: runs the detect-best-and-search
 * pipeline and returns its ranked items, falling back to a direct keyword
 * search when the pipeline yields none.
 */
async function runDetectVideoAndSearch(args: string[], dryRun: boolean): Promise<OutputResult> {
  const command = 'detect-video-and-search' as const;
  const videoPath = args[0];
  if (!videoPath) {
    return { status: 'failed', command, dryRun, error: 'detect-video-and-search requires <video-path>' };
  }
  if (!fs.existsSync(videoPath)) {
    return { status: 'failed', command, dryRun, error: `video not found: ${videoPath}` };
  }
  if (dryRun) {
    return {
      status: 'success',
      command,
      dryRun,
      videoPath,
      videoUrl: null,
      description: '',
      keyword: '',
      searchResults: [],
    };
  }

  // Reuse existing pipeline: best snapshot → image search → keyword rerank
  const detectAndSearch = (await runDetectBestAndSearch(args, dryRun)) as any;
  if (detectAndSearch.status === 'failed') {
    return {
      status: 'failed',
      command,
      dryRun,
      videoPath,
      error: detectAndSearch.error || 'detect-best-and-search failed',
    };
  }

  const description = String(detectAndSearch.bestSnapshot?.description || '').trim();
  const rerank = detectAndSearch.rerank;
  const keyword = String(rerank?.keyword || '').trim();
  const searchResults = (rerank?.results || []) as SearchItem[];

  // Fallback: if rerank didn't produce anything, do keyword search directly.
  if (!searchResults.length) {
    const client = createSkillClient();
    const visionConfig = await loadVisionConfig(client);
    const fallbackKeyword =
      keyword || (description ? await generateChineseKeyword(description, visionConfig) : '');
    const items = fallbackKeyword ? await keywordSearch(client, fallbackKeyword, 1) : [];
    return {
      status: 'success',
      command,
      dryRun,
      videoPath,
      videoUrl: null,
      description,
      keyword: fallbackKeyword,
      searchResults: items,
    };
  }

  return {
    status: 'success',
    command,
    dryRun,
    videoPath,
    videoUrl: null,
    description,
    keyword,
    searchResults,
  };
}

/** `detect-and-search <video-path>`: multi-frame detection followed by search and ranking. */
async function runDetectAndSearch(args: string[], dryRun: boolean): Promise<OutputResult> {
  const detectResult = (await runDetect(args, dryRun)) as DetectResult;
  return searchAndRankSnapshot(detectResult, 'detect-and-search', 'no product detected in video', dryRun);
}

/** Default frame output directory: `snapshots_<video-name>_<timestamp>` next to the video. */
function defaultSnapshotDir(videoPath: string): string {
  const stem = path.basename(videoPath, path.extname(videoPath));
  return path.join(path.dirname(videoPath), `snapshots_${stem}_${Date.now()}`);
}

/**
 * Builds the options for `detect` from `--flag=value` arguments.
 *
 * Defaults: interval 1s, max-frames 60, min-confidence 0.7, concurrency 5.
 * The interval is parsed as a float so sub-second values such as `0.5`
 * are not truncated to 0.
 */
function parseDetectOptions(videoPath: string, args: string[]): DetectOptions {
  return {
    videoPath,
    intervalSeconds: getNumberFlag(args, '--interval', 1),
    maxFrames: getNumberFlag(args, '--max-frames', 60, true),
    outputDir: getFlag(args, '--output-dir') || defaultSnapshotDir(videoPath),
    minConfidence: getNumberFlag(args, '--min-confidence', 0.7),
    concurrency: getNumberFlag(args, '--concurrency', 5, true),
  };
}

/**
 * Returns the value of the first `--flag=value` argument matching `flag`,
 * or `undefined` when the flag is absent.
 */
function getFlag(args: string[], flag: string): string | undefined {
  const prefix = `${flag}=`;
  const match = args.find((arg) => arg.startsWith(prefix));
  return match === undefined ? undefined : match.slice(prefix.length);
}

/**
 * Reads a numeric flag, returning `fallback` when the flag is missing,
 * empty, or not a number.
 *
 * @param integer When true the value is parsed with `parseInt` (base 10).
 */
function getNumberFlag(args: string[], flag: string, fallback: number, integer = false): number {
  const raw = getFlag(args, flag);
  if (!raw) return fallback;
  const value = integer ? parseInt(raw, 10) : parseFloat(raw);
  return Number.isNaN(value) ? fallback : value;
}

/** Creates the model handle for the configured OpenAI-compatible endpoint. */
function createVisionModel(config: VisionConfig) {
  const openai = createOpenAI({ apiKey: config.apiKey, baseURL: config.baseURL });
  return openai(config.model);
}

/**
 * Asks the model for a short Chinese search keyword describing the product.
 *
 * Every character outside the CJK ranges U+4E00–U+9FFF and U+3400–U+4DBF is
 * stripped from the reply, so the result may be an empty string.
 */
async function generateChineseKeyword(description: string, visionConfig: VisionConfig): Promise<string> {
  const model = createVisionModel(visionConfig);
  // Segments are joined with single spaces, which yields the prompt text unchanged.
  const prompt = [
    'You are generating a 1688.com (Chinese B2B wholesale) product search keyword.',
    'Rules:',
    '- Output ONLY 2-4 Chinese words — the product OBJECT TYPE + 1-2 key material/feature words',
    '- CRITICAL: If the product is a container, organizer, rack, shelf, bag, box, or holder, the keyword MUST name THAT object — NOT the items it holds.',
    'Examples: shoe rack → "金属鞋架", cable organizer → "理线器", storage shelf → "收纳架", toolbox → "工具箱"',
    '- Use common Chinese commerce terms, NOT a literal translation',
    '- No English, no punctuation, no explanation',
    '- Short broad terms work better than long specific phrases',
    `Product description: ${description}`,
    'Output only the search query:',
  ].join(' ');
  const { text } = await generateText({ model, prompt });
  return text.trim().replace(/[^\u4e00-\u9fff\u3400-\u4dbf]/g, '').trim();
}

/** Runs a keyword search and returns `data.items.item` from the response (empty when absent). */
async function keywordSearch(client: SkillClient, keyword: string, page = 1): Promise<SearchItem[]> {
  const res = await client.post('/ecom/tasks/keyword-search', { keyword, page });
  const json = JSON.parse(res.body) as any;
  return (json?.data?.items?.item ?? []) as SearchItem[];
}

/** True when the string contains at least one character in U+4E00–U+9FFF. */
function hasChinese(str: string): boolean {
  return CHINESE_CHAR.test(str);
}

/**
 * Fallback keyword: concatenates the three most frequent two-character
 * Chinese bigrams found in the titles of the first `topN` items.
 */
function extractKeywordsFromTitles(items: SearchItem[], topN = 5): string {
  const freq = new Map<string, number>();
  for (const item of items.slice(0, topN)) {
    const title = item.title || '';
    for (let i = 0; i < title.length - 1; i++) {
      const bigram = title.slice(i, i + 2);
      if (CHINESE_BIGRAM.test(bigram)) {
        freq.set(bigram, (freq.get(bigram) ?? 0) + 1);
      }
    }
  }
  return [...freq.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 3)
    .map(([bigram]) => bigram)
    .join('');
}

/**
 * `rerank`: intersects image-search results with a keyword search and ranks them.
 *
 * Flags: `--image-results=<file-or-json>` (or first positional),
 * `--keyword=<text>` (or second positional), `--description=<text>`,
 * `--top=<n>` (default 10).
 *
 * When the keyword contains no Chinese characters one is generated from the
 * description, the keyword, or the item titles, in that order. Items are
 * matched by `num_iid`; if nothing intersects, the keyword results are used.
 */
async function runRerank(args: string[], dryRun: boolean): Promise<OutputResult> {
  const command = 'rerank' as const;
  const positionals = args.filter((a) => !a.startsWith('--'));
  const imageResultsArg = getFlag(args, '--image-results') || positionals[0];
  const keywordArg = getFlag(args, '--keyword') || positionals[1];
  const topN = getNumberFlag(args, '--top', 10, true);
  const description = getFlag(args, '--description') || '';

  if (!imageResultsArg) {
    return { status: 'failed', command, dryRun, error: 'rerank requires --image-results=<file-or-json>' };
  }
  if (dryRun) return { status: 'success', command, dryRun } as any;

  const client = createSkillClient();
  const visionConfig = await loadVisionConfig(client);

  // The argument is either a path to a JSON file or the JSON text itself.
  let imageItems: SearchItem[];
  try {
    const raw = fs.existsSync(imageResultsArg) ? fs.readFileSync(imageResultsArg, 'utf-8') : imageResultsArg;
    const parsed = JSON.parse(raw);
    imageItems = parsed?.data?.items?.item ?? parsed?.items?.item ?? (Array.isArray(parsed) ? parsed : []);
  } catch {
    return { status: 'failed', command, dryRun, error: 'failed to parse image-results JSON' };
  }
  if (!imageItems.length) {
    return { status: 'failed', command, dryRun, error: 'no items found in image-results JSON' };
  }

  // Determine Chinese keyword to use
  let keyword = keywordArg || '';
  let autoGeneratedKeyword = '';
  if (!hasChinese(keyword)) {
    // Prefer product description for accurate translation; fall back to image titles
    const sourceText = description || keyword || extractKeywordsFromTitles(imageItems);
    try {
      autoGeneratedKeyword = await generateChineseKeyword(sourceText, visionConfig);
    } catch {
      autoGeneratedKeyword = '';
    }
    // The model can fail or reply without any Chinese characters; use title bigrams then.
    if (!autoGeneratedKeyword) autoGeneratedKeyword = extractKeywordsFromTitles(imageItems);
    keyword = autoGeneratedKeyword;
  }

  // Keyword search on 1688
  let keywordItems: SearchItem[] = [];
  try {
    keywordItems = await keywordSearch(client, keyword);
  } catch (e: unknown) {
    return { status: 'failed', command, dryRun, error: `keyword search failed: ${errorMessage(e)}` };
  }

  // Intersect by num_iid
  const keywordIds = new Set(keywordItems.map((i) => String(i.num_iid)));
  const intersected = imageItems.filter((i) => keywordIds.has(String(i.num_iid)));

  // If still no intersection, fall back to keyword results (at least they match the category)
  const usedFallback = intersected.length === 0;
  const results = usedFallback ? keywordItems : intersected;

  // Sort by turn_head descending (click-through rate signal).
  // Sort a copy so the source arrays keep their order; unparsable scores count as 0.
  const score = (item: SearchItem): number => parseFloat(String(item.turn_head ?? '0')) || 0;
  const sorted = [...results].sort((a, b) => score(b) - score(a)).slice(0, topN);

  return {
    status: 'success',
    command,
    dryRun,
    keyword,
    autoGeneratedKeyword: autoGeneratedKeyword || undefined,
    imageResultsCount: imageItems.length,
    keywordResultsCount: keywordItems.length,
    intersectedCount: intersected.length,
    usedFallback,
    results: sorted,
  } as any;
}