commit cabd1b332a5a3d60270168850aee6b3e1d59b610 Author: ywkj Date: Mon Apr 20 07:24:28 2026 +0800 feat: 初始化 video-product-snapshot skill 视频商品检测 + 1688 以图搜图 + 关键词二次过滤完整流程。 Co-Authored-By: Claude Sonnet 4.6 diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..a10211e Binary files /dev/null and b/.DS_Store differ diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..e4eb4ee --- /dev/null +++ b/.env.example @@ -0,0 +1,40 @@ +# ============================================================================= +# video-product-snapshot 环境变量配置 +# 复制为 .env 并填入真实值:cp .env.example .env +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Vision API 配置(用于商品帧检测) +# 兼容任何 OpenAI 格式接口:OpenAI / Groq / Together / 本地 Ollama 等 +# ----------------------------------------------------------------------------- + +# API Key(必填) +VISION_API_KEY=your-api-key-here + +# API Base URL(可选,留空则使用 OpenAI 官方地址) +# VISION_API_BASE=https://api.groq.com/openai/v1 +# VISION_API_BASE=http://localhost:11434/v1 + +# 模型名称(可选,默认 gpt-4o-mini) +# VISION_MODEL=gpt-4o-mini +# VISION_MODEL=meta-llama/llama-4-scout-17b-16e-instruct +# VISION_MODEL=llava:13b + +# ----------------------------------------------------------------------------- +# 1688 图搜配置(via woo-data-scrawler 本地服务,端口 3202) +# 所有 Onebound 调用均通过本地服务代理,无需持有 API 密钥 +# ----------------------------------------------------------------------------- + +# 上传图片接口(将本地图片上传到公共存储,获取可访问 URL) +ONEBOUND_UPLOAD_ENDPOINT=http://localhost:3202/api/v1/tasks/upload-image + +# 以图搜图接口 +ONEBOUND_SEARCH_ENDPOINT=http://localhost:3202/api/v1/tasks/search-by-image + +# 关键词搜索接口(用于 rerank 二次过滤) +ONEBOUND_KEYWORD_SEARCH_ENDPOINT=http://localhost:3202/api/v1/tasks/keyword-search + +# ----------------------------------------------------------------------------- +# Auth(由 auth-rt 自动处理,配置见 ~/.openclaw/.env) +# 只需在 ~/.openclaw/.env 中设置 
CLIENT_KEY=sk_xxx +# ----------------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2ebe857 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +node_modules/ +dist/ +.env +snapshots_*/ +*.jpg +*.mp4 +*.png diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..24b36c2 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,94 @@ +--- +name: video-product-snapshot +description: "Detect ecommerce products in video frames using Claude Vision, extract the best product snapshot, and optionally search via image-search API. Use when the user provides a video and wants to find/identify products shown in it." +--- + +# Video Product Snapshot + +Extract ecommerce product snapshots from video using Claude Vision, then optionally search for matching products via image-search API. + +## Run + +```bash +bun dist/run.js [args] [--dry-run] +``` + +## Commands + +| Command | Description | +|---------|-------------| +| `detect [options]` | Extract frames, detect product snapshots | +| `search ` | Search products by image via API | +| `detect-and-search [options]` | Detect best snapshot then run image search | +| `session` | Get auth session token | + +## Options for `detect` / `detect-and-search` + +| Flag | Default | Description | +|------|---------|-------------| +| `--interval=` | `1` | Seconds between sampled frames | +| `--max-frames=` | `60` | Max frames to analyze | +| `--output-dir=` | next to video | Directory to save snapshot images | +| `--min-confidence=<0-1>` | `0.7` | Minimum detection confidence threshold | + +## Examples + +```bash +# Detect product frames in a video +bun dist/run.js detect ./product-demo.mp4 + +# Sample every 5 seconds, higher confidence threshold +bun dist/run.js detect ./product-demo.mp4 --interval=5 --min-confidence=0.85 + +# Search for products using an existing image +bun dist/run.js search ./snapshot.jpg + +# Full pipeline: detect best product frame then search 
+bun dist/run.js detect-and-search ./product-demo.mp4 --interval=3 --max-frames=20 +``` + +## Output + +Returns JSON with: +- `productFrames[]`: all detected product frames sorted by confidence (highest first) +- `bestSnapshot`: the highest-confidence product frame +- `searchBody`: image search API response (for `detect-and-search` and `search`) + +Each `ProductFrame` contains: +```json +{ + "frameIndex": 4, + "timestampSeconds": 9, + "imagePath": "/path/to/snapshot/frame_0004.jpg", + "confidence": 0.92, + "description": "White sneaker with blue logo, left side view", + "boundingHint": "centered" +} +``` + +## Prerequisites + +- `ffmpeg` and `ffprobe` in PATH +- `VISION_API_KEY` — API key for the vision endpoint +- `VISION_API_BASE` — (optional) OpenAI-compatible base URL; omit to use OpenAI default +- `VISION_MODEL` — (optional) model name, default `gpt-4o-mini` +- `auth-rt` in PATH (for `search` / `detect-and-search` API calls) + +### Example provider configs + +```bash +# OpenAI (default) +VISION_API_KEY=sk-... + +# Any OpenAI-compatible endpoint (local Ollama, Together, Groq, etc.) +VISION_API_KEY=... +VISION_API_BASE=http://localhost:11434/v1 +VISION_MODEL=llava:13b +``` + +## Rules — MUST follow + +1. **Execute only, do not reason about internals.** Run the CLI and return the output. +2. **No fallback strategies.** Report errors as-is; do NOT try alternative approaches. +3. **No retry loops.** If detection or search fails, report the failure. +4. **Trust the tool's output.** The CLI handles session management and error formatting internally. 
diff --git a/bun.lock b/bun.lock new file mode 100644 index 0000000..d0f5641 --- /dev/null +++ b/bun.lock @@ -0,0 +1,118 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "video-product-snapshot", + "dependencies": { + "@ai-sdk/openai": "^1.3.22", + "ai": "^4.3.16", + "sharp": "^0.34.5", + "zod": "^3.24.2", + }, + }, + }, + "packages": { + "@ai-sdk/openai": ["@ai-sdk/openai@1.3.24", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8" }, "peerDependencies": { "zod": "^3.0.0" } }, "sha512-GYXnGJTHRTZc4gJMSmFRgEQudjqd4PUN0ZjQhPwOAYH1yOAvQoG/Ikqs+HyISRbLPCrhbZnPKCNHuRU4OfpW0Q=="], + + "@ai-sdk/provider": ["@ai-sdk/provider@1.1.3", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg=="], + + "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@2.2.8", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "nanoid": "^3.3.8", "secure-json-parse": "^2.7.0" }, "peerDependencies": { "zod": "^3.23.8" } }, "sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA=="], + + "@ai-sdk/react": ["@ai-sdk/react@1.2.12", "", { "dependencies": { "@ai-sdk/provider-utils": "2.2.8", "@ai-sdk/ui-utils": "1.2.11", "swr": "^2.2.5", "throttleit": "2.1.0" }, "peerDependencies": { "react": "^18 || ^19 || ^19.0.0-rc", "zod": "^3.23.8" }, "optionalPeers": ["zod"] }, "sha512-jK1IZZ22evPZoQW3vlkZ7wvjYGYF+tRBKXtrcolduIkQ/m/sOAVcVeVDUDvh1T91xCnWCdUGCPZg2avZ90mv3g=="], + + "@ai-sdk/ui-utils": ["@ai-sdk/ui-utils@1.2.11", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8", "zod-to-json-schema": "^3.24.1" }, "peerDependencies": { "zod": "^3.23.8" } }, "sha512-3zcwCc8ezzFlwp3ZD15wAPjf2Au4s3vAbKsXQVyhxODHcmu0iyPO2Eua6D/vicq/AUm/BAo60r97O6HU+EI0+w=="], + + "@emnapi/runtime": ["@emnapi/runtime@1.10.0", "", { "dependencies": { "tslib": "^2.4.0" } }, 
"sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA=="], + + "@img/colour": ["@img/colour@1.1.0", "", {}, "sha512-Td76q7j57o/tLVdgS746cYARfSyxk8iEfRxewL9h4OMzYhbW4TAcppl0mT4eyqXddh6L/jwoM75mo7ixa/pCeQ=="], + + "@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.2.4" }, "os": "darwin", "cpu": "arm64" }, "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w=="], + + "@img/sharp-darwin-x64": ["@img/sharp-darwin-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-x64": "1.2.4" }, "os": "darwin", "cpu": "x64" }, "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw=="], + + "@img/sharp-libvips-darwin-arm64": ["@img/sharp-libvips-darwin-arm64@1.2.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g=="], + + "@img/sharp-libvips-darwin-x64": ["@img/sharp-libvips-darwin-x64@1.2.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg=="], + + "@img/sharp-libvips-linux-arm": ["@img/sharp-libvips-linux-arm@1.2.4", "", { "os": "linux", "cpu": "arm" }, "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A=="], + + "@img/sharp-libvips-linux-arm64": ["@img/sharp-libvips-linux-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw=="], + + "@img/sharp-libvips-linux-ppc64": ["@img/sharp-libvips-linux-ppc64@1.2.4", "", { "os": "linux", "cpu": "ppc64" }, "sha512-FMuvGijLDYG6lW+b/UvyilUWu5Ayu+3r2d1S8notiGCIyYU/76eig1UfMmkZ7vwgOrzKzlQbFSuQfgm7GYUPpA=="], + + "@img/sharp-libvips-linux-riscv64": ["@img/sharp-libvips-linux-riscv64@1.2.4", "", { "os": "linux", "cpu": "none" 
}, "sha512-oVDbcR4zUC0ce82teubSm+x6ETixtKZBh/qbREIOcI3cULzDyb18Sr/Wcyx7NRQeQzOiHTNbZFF1UwPS2scyGA=="], + + "@img/sharp-libvips-linux-s390x": ["@img/sharp-libvips-linux-s390x@1.2.4", "", { "os": "linux", "cpu": "s390x" }, "sha512-qmp9VrzgPgMoGZyPvrQHqk02uyjA0/QrTO26Tqk6l4ZV0MPWIW6LTkqOIov+J1yEu7MbFQaDpwdwJKhbJvuRxQ=="], + + "@img/sharp-libvips-linux-x64": ["@img/sharp-libvips-linux-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw=="], + + "@img/sharp-libvips-linuxmusl-arm64": ["@img/sharp-libvips-linuxmusl-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw=="], + + "@img/sharp-libvips-linuxmusl-x64": ["@img/sharp-libvips-linuxmusl-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg=="], + + "@img/sharp-linux-arm": ["@img/sharp-linux-arm@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm": "1.2.4" }, "os": "linux", "cpu": "arm" }, "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw=="], + + "@img/sharp-linux-arm64": ["@img/sharp-linux-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg=="], + + "@img/sharp-linux-ppc64": ["@img/sharp-linux-ppc64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-ppc64": "1.2.4" }, "os": "linux", "cpu": "ppc64" }, "sha512-7zznwNaqW6YtsfrGGDA6BRkISKAAE1Jo0QdpNYXNMHu2+0dTrPflTLNkpc8l7MUP5M16ZJcUvysVWWrMefZquA=="], + + "@img/sharp-linux-riscv64": ["@img/sharp-linux-riscv64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-riscv64": "1.2.4" }, "os": "linux", "cpu": "none" }, 
"sha512-51gJuLPTKa7piYPaVs8GmByo7/U7/7TZOq+cnXJIHZKavIRHAP77e3N2HEl3dgiqdD/w0yUfiJnII77PuDDFdw=="], + + "@img/sharp-linux-s390x": ["@img/sharp-linux-s390x@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-s390x": "1.2.4" }, "os": "linux", "cpu": "s390x" }, "sha512-nQtCk0PdKfho3eC5MrbQoigJ2gd1CgddUMkabUj+rBevs8tZ2cULOx46E7oyX+04WGfABgIwmMC0VqieTiR4jg=="], + + "@img/sharp-linux-x64": ["@img/sharp-linux-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ=="], + + "@img/sharp-linuxmusl-arm64": ["@img/sharp-linuxmusl-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg=="], + + "@img/sharp-linuxmusl-x64": ["@img/sharp-linuxmusl-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q=="], + + "@img/sharp-wasm32": ["@img/sharp-wasm32@0.34.5", "", { "dependencies": { "@emnapi/runtime": "^1.7.0" }, "cpu": "none" }, "sha512-OdWTEiVkY2PHwqkbBI8frFxQQFekHaSSkUIJkwzclWZe64O1X4UlUjqqqLaPbUpMOQk6FBu/HtlGXNblIs0huw=="], + + "@img/sharp-win32-arm64": ["@img/sharp-win32-arm64@0.34.5", "", { "os": "win32", "cpu": "arm64" }, "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g=="], + + "@img/sharp-win32-ia32": ["@img/sharp-win32-ia32@0.34.5", "", { "os": "win32", "cpu": "ia32" }, "sha512-FV9m/7NmeCmSHDD5j4+4pNI8Cp3aW+JvLoXcTUo0IqyjSfAZJ8dIUmijx1qaJsIiU+Hosw6xM5KijAWRJCSgNg=="], + + "@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.34.5", "", { "os": "win32", "cpu": "x64" }, 
"sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw=="], + + "@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="], + + "@types/diff-match-patch": ["@types/diff-match-patch@1.0.36", "", {}, "sha512-xFdR6tkm0MWvBfO8xXCSsinYxHcqkQUlcHeSpMC2ukzOb6lwQAfDmW+Qt0AvlGd8HpsS28qKsB+oPeJn9I39jg=="], + + "ai": ["ai@4.3.19", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8", "@ai-sdk/react": "1.2.12", "@ai-sdk/ui-utils": "1.2.11", "@opentelemetry/api": "1.9.0", "jsondiffpatch": "0.6.0" }, "peerDependencies": { "react": "^18 || ^19 || ^19.0.0-rc", "zod": "^3.23.8" }, "optionalPeers": ["react"] }, "sha512-dIE2bfNpqHN3r6IINp9znguYdhIOheKW2LDigAMrgt/upT3B8eBGPSCblENvaZGoq+hxaN9fSMzjWpbqloP+7Q=="], + + "chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="], + + "dequal": ["dequal@2.0.3", "", {}, "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA=="], + + "detect-libc": ["detect-libc@2.1.2", "", {}, "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ=="], + + "diff-match-patch": ["diff-match-patch@1.0.5", "", {}, "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw=="], + + "json-schema": ["json-schema@0.4.0", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="], + + "jsondiffpatch": ["jsondiffpatch@0.6.0", "", { "dependencies": { "@types/diff-match-patch": "^1.0.36", "chalk": "^5.3.0", "diff-match-patch": "^1.0.5" }, "bin": { "jsondiffpatch": "bin/jsondiffpatch.js" } }, "sha512-3QItJOXp2AP1uv7waBkao5nCvhEv+QmJAd38Ybq7wNI74Q+BBmnLn4EDKz6yI9xGAIQoUF87qHt+kc1IVxB4zQ=="], + + "nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, 
"sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="], + + "react": ["react@19.2.5", "", {}, "sha512-llUJLzz1zTUBrskt2pwZgLq59AemifIftw4aB7JxOqf1HY2FDaGDxgwpAPVzHU1kdWabH7FauP4i1oEeer2WCA=="], + + "secure-json-parse": ["secure-json-parse@2.7.0", "", {}, "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw=="], + + "semver": ["semver@7.7.4", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA=="], + + "sharp": ["sharp@0.34.5", "", { "dependencies": { "@img/colour": "^1.0.0", "detect-libc": "^2.1.2", "semver": "^7.7.3" }, "optionalDependencies": { "@img/sharp-darwin-arm64": "0.34.5", "@img/sharp-darwin-x64": "0.34.5", "@img/sharp-libvips-darwin-arm64": "1.2.4", "@img/sharp-libvips-darwin-x64": "1.2.4", "@img/sharp-libvips-linux-arm": "1.2.4", "@img/sharp-libvips-linux-arm64": "1.2.4", "@img/sharp-libvips-linux-ppc64": "1.2.4", "@img/sharp-libvips-linux-riscv64": "1.2.4", "@img/sharp-libvips-linux-s390x": "1.2.4", "@img/sharp-libvips-linux-x64": "1.2.4", "@img/sharp-libvips-linuxmusl-arm64": "1.2.4", "@img/sharp-libvips-linuxmusl-x64": "1.2.4", "@img/sharp-linux-arm": "0.34.5", "@img/sharp-linux-arm64": "0.34.5", "@img/sharp-linux-ppc64": "0.34.5", "@img/sharp-linux-riscv64": "0.34.5", "@img/sharp-linux-s390x": "0.34.5", "@img/sharp-linux-x64": "0.34.5", "@img/sharp-linuxmusl-arm64": "0.34.5", "@img/sharp-linuxmusl-x64": "0.34.5", "@img/sharp-wasm32": "0.34.5", "@img/sharp-win32-arm64": "0.34.5", "@img/sharp-win32-ia32": "0.34.5", "@img/sharp-win32-x64": "0.34.5" } }, "sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg=="], + + "swr": ["swr@2.4.1", "", { "dependencies": { "dequal": "^2.0.3", "use-sync-external-store": "^1.6.0" }, "peerDependencies": { "react": "^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" } }, 
"sha512-2CC6CiKQtEwaEeNiqWTAw9PGykW8SR5zZX8MZk6TeAvEAnVS7Visz8WzphqgtQ8v2xz/4Q5K+j+SeMaKXeeQIA=="], + + "throttleit": ["throttleit@2.1.0", "", {}, "sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw=="], + + "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], + + "use-sync-external-store": ["use-sync-external-store@1.6.0", "", { "peerDependencies": { "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" } }, "sha512-Pp6GSwGP/NrPIrxVFAIkOQeyw8lFenOHijQWkUTrDvrF4ALqylP2C/KCkeS9dpUM3KvYRQhna5vt7IL95+ZQ9w=="], + + "zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="], + + "zod-to-json-schema": ["zod-to-json-schema@3.25.2", "", { "peerDependencies": { "zod": "^3.25.28 || ^4" } }, "sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA=="], + } +} diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..c8f63cf --- /dev/null +++ b/install.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +SKILL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "Installing video-product-snapshot skill..." + +# Check dependencies +for cmd in ffmpeg ffprobe bun; do + if ! command -v "$cmd" &>/dev/null; then + echo "ERROR: '$cmd' not found in PATH. Please install it first." >&2 + exit 1 + fi +done + +# Install npm dependencies +cd "$SKILL_DIR" +bun install + +# Build dist +bun run build + +echo "Done. Run with: bun dist/run.js [args]" +echo "Requires: VISION_API_KEY env var for product detection." 
diff --git a/package.json b/package.json new file mode 100644 index 0000000..b25dda3 --- /dev/null +++ b/package.json @@ -0,0 +1,15 @@ +{ + "name": "video-product-snapshot", + "version": "0.1.0", + "type": "module", + "scripts": { + "run": "bun run scripts/run.ts", + "build": "bun build scripts/run.ts --outfile dist/run.js --target bun" + }, + "dependencies": { + "@ai-sdk/openai": "^1.3.22", + "ai": "^4.3.16", + "sharp": "^0.34.5", + "zod": "^3.24.2" + } +} diff --git a/scripts/run.ts b/scripts/run.ts new file mode 100644 index 0000000..a5d98b8 --- /dev/null +++ b/scripts/run.ts @@ -0,0 +1,85 @@ +#!/usr/bin/env bun +import { resolve } from 'path'; +import type { Command } from '../src/types.ts'; +import { run } from '../src/index.ts'; + +// Load .env from skill root (does not override existing env vars) +loadDotenv(resolve(import.meta.dir, '../.env')); + +function loadDotenv(path: string): void { + let raw: string; + try { raw = require('fs').readFileSync(path, 'utf-8'); } catch { return; } + for (const line of raw.split('\n')) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith('#')) continue; + const eq = trimmed.indexOf('='); + if (eq < 0) continue; + const key = trimmed.slice(0, eq).trim(); + const val = trimmed.slice(eq + 1).trim().replace(/^["']|["']$/g, ''); + if (key && !(key in process.env)) process.env[key] = val; + } +} + +function printUsage(): void { + console.error(`Usage: + bun scripts/run.ts [--api-base=] [args...] 
[--dry-run] + +Commands: + session + Get auth session token + + detect [options] + Extract frames and detect ecommerce product snapshots + Options: + --interval= Frame sampling interval (default: 1) + --max-frames= Max frames to analyze (default: 60) + --output-dir= Where to save snapshots (default: next to video) + --min-confidence=<0-1> Minimum detection confidence (default: 0.7) + --concurrency= Parallel Vision API calls per chunk (default: 5) + + search + Search for products using an image via the ecom image-search API + + detect-and-search [options] + Detect best product snapshot from video then run image search + +Examples: + bun scripts/run.ts detect ./demo.mp4 + bun scripts/run.ts detect ./demo.mp4 --interval=5 --max-frames=20 + bun scripts/run.ts search ./snapshot.jpg + bun scripts/run.ts detect-and-search ./demo.mp4 --min-confidence=0.8 + +Config: VISION_API_KEY env var required for detection. + auth-rt in PATH required for search commands. +`); +} + +async function main(): Promise { + const positionals: string[] = []; + let dryRun = false; + + for (const arg of process.argv.slice(2)) { + if (arg === '--dry-run') { + dryRun = true; + } else if (arg.startsWith('--api-base=')) { + process.env.API_BASE = arg.slice('--api-base='.length).trim(); + } else if (arg === '-h' || arg === '--help') { + printUsage(); process.exit(0); + } else { + positionals.push(arg); + } + } + + if (positionals.length < 1) { printUsage(); process.exit(1); } + + const result = await run(positionals[0] as Command, positionals.slice(1), dryRun); + console.log(JSON.stringify(result, null, 2)); +} + +main().catch((err) => { + console.error(JSON.stringify({ + status: 'failed', + error: err instanceof Error ? err.message : String(err), + }, null, 2)); + process.exit(1); +}); diff --git a/src/auth-cli.ts b/src/auth-cli.ts new file mode 100644 index 0000000..d072a88 --- /dev/null +++ b/src/auth-cli.ts @@ -0,0 +1,119 @@ +/** + * Thin CLI wrapper for auth-runtime. 
+ * + * Copy this file into your skill's src/ directory. It calls the + * `auth-rt` binary (a standalone Go executable), so the skill has + * zero npm/runtime dependency on auth-runtime. + * + * Prerequisites: + * `auth-rt` must be in PATH or at ~/.local/bin/auth-rt + * (install.sh handles this automatically) + * + * Usage: + * import { createSkillClient } from './auth-cli.ts'; + * const client = createSkillClient(); + * const res = await client.post('/ecom/tasks/scrape', { url: '...' }); + */ + +import { spawnSync } from 'child_process'; +import * as path from 'path'; +import * as os from 'os'; + +const home = process.env.HOME || os.homedir(); +const AUTH_RT_BIN = process.env.AUTH_RT_BIN + || (() => { + // Check if auth-rt is in PATH + const which = spawnSync('which', ['auth-rt'], { encoding: 'utf-8' }); + if (which.status === 0 && which.stdout.trim()) { + return which.stdout.trim(); + } + return path.join(home, '.local', 'bin', 'auth-rt'); + })(); + +export interface ApiResponse { + status: number; + body: string; +} + +export interface SessionResponse { + accessToken: string; + expiresIn: number; + ownerSessionToken?: string; + hookUrl?: string; + hookToken?: string; +} + +export interface SkillClientOptions { + apiBase?: string; + dryRun?: boolean; +} + +function runCli(...args: string[]): string { + const result = spawnSync(AUTH_RT_BIN, args, { + encoding: 'utf-8', + timeout: 60_000, + }); + + if (result.error) { + throw new Error(`auth-rt spawn failed: ${result.error.message}`); + } + if (result.status !== 0) { + throw new Error(`auth-rt failed (exit ${result.status}): ${(result.stderr || '').trim()}`); + } + return (result.stdout || '').trim(); +} + +export class SkillClient { + private readonly apiBase?: string; + private readonly dryRun: boolean; + + constructor(options: SkillClientOptions = {}) { + this.apiBase = options.apiBase; + this.dryRun = options.dryRun ?? 
false; + } + + async session(): Promise { + if (this.dryRun) { + return { accessToken: '', expiresIn: 900 }; + } + return JSON.parse(runCli('session')); + } + + async get(urlPath: string): Promise { + return this.request('GET', urlPath); + } + + async post(urlPath: string, body?: unknown): Promise { + return this.request('POST', urlPath, body); + } + + async put(urlPath: string, body?: unknown): Promise { + return this.request('PUT', urlPath, body); + } + + async patch(urlPath: string, body?: unknown): Promise { + return this.request('PATCH', urlPath, body); + } + + async delete(urlPath: string, body?: unknown): Promise { + return this.request('DELETE', urlPath, body); + } + + private async request(method: string, urlPath: string, body?: unknown): Promise { + if (this.dryRun) { + return { status: 200, body: JSON.stringify({ dryRun: true, method, path: urlPath }) }; + } + const args = ['request', method, urlPath]; + if (body != null) { + args.push('--body', JSON.stringify(body)); + } + if (this.apiBase) { + args.push('--api-base', this.apiBase); + } + return JSON.parse(runCli(...args)); + } +} + +export function createSkillClient(options?: SkillClientOptions): SkillClient { + return new SkillClient(options); +} diff --git a/src/frame-extractor.ts b/src/frame-extractor.ts new file mode 100644 index 0000000..d870218 --- /dev/null +++ b/src/frame-extractor.ts @@ -0,0 +1,80 @@ +import { spawnSync, execSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; + +export interface ExtractedFrame { + frameIndex: number; + timestampSeconds: number; + imagePath: string; +} + +export function ensureOutputDir(dir: string): void { + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }); + } +} + +export function getVideoDuration(videoPath: string): number { + const result = spawnSync('ffprobe', [ + '-v', 'quiet', + '-print_format', 'json', + '-show_format', + videoPath, + ], { encoding: 'utf-8', timeout: 30_000 }); + + if (result.status 
!== 0) { + throw new Error(`ffprobe failed: ${result.stderr}`); + } + + const info = JSON.parse(result.stdout); + return parseFloat(info.format?.duration ?? '0'); +} + +export function extractFrames( + videoPath: string, + outputDir: string, + intervalSeconds: number, + maxFrames: number, +): ExtractedFrame[] { + ensureOutputDir(outputDir); + + const duration = getVideoDuration(videoPath); + const framePattern = path.join(outputDir, 'frame_%04d.jpg'); + + // Extract one frame every intervalSeconds, up to maxFrames + const fps = 1 / intervalSeconds; + const result = spawnSync('ffmpeg', [ + '-i', videoPath, + '-vf', `fps=${fps},scale=1280:-1`, + '-frames:v', String(maxFrames), + '-q:v', '2', + '-y', + framePattern, + ], { encoding: 'utf-8', timeout: 300_000 }); + + if (result.status !== 0 && !fs.existsSync(path.join(outputDir, 'frame_0001.jpg'))) { + throw new Error(`ffmpeg extraction failed: ${result.stderr}`); + } + + const frames: ExtractedFrame[] = []; + let frameIndex = 1; + while (true) { + const paddedIndex = String(frameIndex).padStart(4, '0'); + const imagePath = path.join(outputDir, `frame_${paddedIndex}.jpg`); + if (!fs.existsSync(imagePath)) break; + + frames.push({ + frameIndex, + timestampSeconds: Math.round((frameIndex - 1) * intervalSeconds), + imagePath, + }); + frameIndex++; + } + + return frames; +} + +export function imageToBase64(imagePath: string): string { + const buffer = fs.readFileSync(imagePath); + return buffer.toString('base64'); +} diff --git a/src/index.ts b/src/index.ts new file mode 100644 index 0000000..ee2722b --- /dev/null +++ b/src/index.ts @@ -0,0 +1,336 @@ +import * as fs from 'fs'; +import * as path from 'path'; +import type { Command, DetectOptions, DetectResult, SearchResult, OutputResult, SearchItem } from './types.ts'; +import { createSkillClient } from './auth-cli.ts'; +import { extractFrames } from './frame-extractor.ts'; +import { detectProductFrames } from './product-detector.ts'; +import { imageToBase64 } from 
'./frame-extractor.ts'; +import { generateText } from 'ai'; +import { createOpenAI } from '@ai-sdk/openai'; + +export async function run( + command: Command, + args: string[], + dryRun: boolean, +): Promise { + switch (command) { + case 'session': + return runSession(dryRun); + case 'detect': + return runDetect(args, dryRun); + case 'search': + return runSearch(args, dryRun); + case 'detect-and-search': + return runDetectAndSearch(args, dryRun); + case 'rerank': + return runRerank(args, dryRun); + default: + return { status: 'failed', command, dryRun, error: `unknown command: ${command}` }; + } +} + +async function runSession(dryRun: boolean): Promise { + const client = createSkillClient({ dryRun }); + const session = await client.session(); + return { status: 'success', command: 'session', dryRun, ...session } as any; +} + +async function runDetect(args: string[], dryRun: boolean): Promise { + const videoPath = args[0]; + if (!videoPath) return { status: 'failed', command: 'detect', dryRun, error: 'detect requires ' }; + if (!fs.existsSync(videoPath)) return { status: 'failed', command: 'detect', dryRun, error: `video not found: ${videoPath}` }; + + const opts = parseDetectOptions(videoPath, args); + + if (dryRun) { + return { + status: 'success', command: 'detect', dryRun, + videoPath, totalFramesExtracted: 0, productFrames: [], + bestSnapshot: undefined, + }; + } + + const frames = extractFrames(videoPath, opts.outputDir, opts.intervalSeconds, opts.maxFrames); + const productFrames = await detectProductFrames(frames, opts.minConfidence, opts.concurrency); + + return { + status: 'success', + command: 'detect', + dryRun, + videoPath, + totalFramesExtracted: frames.length, + productFrames, + bestSnapshot: productFrames[0], + }; +} + +async function uploadImage(imagePath: string): Promise { + const searchEndpoint = process.env.ONEBOUND_SEARCH_ENDPOINT; + if (!searchEndpoint) throw new Error('ONEBOUND_SEARCH_ENDPOINT not set'); + + const uploadEndpoint = 
process.env.ONEBOUND_UPLOAD_ENDPOINT; + if (!uploadEndpoint) throw new Error('ONEBOUND_UPLOAD_ENDPOINT not set'); + + const imageBuffer = fs.readFileSync(imagePath); + const filename = `video-snapshot-${Date.now()}.jpg`; + + const response = await fetch(uploadEndpoint, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + data: imageBuffer.toString('base64'), + filename, + contentType: 'image/jpeg', + }), + }); + + if (!response.ok) throw new Error(`Upload failed: HTTP ${response.status}`); + const json = await response.json() as { url?: string }; + if (!json.url) throw new Error('Upload response missing url'); + return json.url; +} + +async function runSearch(args: string[], dryRun: boolean): Promise { + const imagePath = args[0]; + if (!imagePath) return { status: 'failed', command: 'search', dryRun, error: 'search requires ' }; + if (!fs.existsSync(imagePath)) return { status: 'failed', command: 'search', dryRun, error: `image not found: ${imagePath}` }; + + const searchEndpoint = process.env.ONEBOUND_SEARCH_ENDPOINT; + if (!searchEndpoint) return { status: 'failed', command: 'search', dryRun, error: 'ONEBOUND_SEARCH_ENDPOINT not set' }; + + if (dryRun) { + return { status: 'success', command: 'search', dryRun, imagePath, searchHttpStatus: 0, searchBody: null }; + } + + // If given a local file, upload it first to get a public URL + let imgid = imagePath; + if (!imagePath.startsWith('http')) { + imgid = await uploadImage(imagePath); + } + + const response = await fetch(searchEndpoint, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ imgid, page: 1 }), + }); + + const searchHttpStatus = response.status; + const body = await response.json(); + + if (!response.ok) { + return { status: 'failed', command: 'search', dryRun, imagePath, searchHttpStatus, error: JSON.stringify(body) }; + } + + return { status: 'success', command: 'search', dryRun, imagePath, searchHttpStatus, 
searchBody: body }; +} + +async function runDetectAndSearch(args: string[], dryRun: boolean): Promise { + const detectResult = await runDetect(args, dryRun) as DetectResult; + if (detectResult.status === 'failed') return detectResult; + + if (!detectResult.bestSnapshot) { + return { ...detectResult, status: 'failed', error: 'no product detected in video' }; + } + + const best = detectResult.bestSnapshot; + // Use cropped image if available, otherwise full frame + const imageForSearch = best.croppedImagePath || best.imagePath; + const searchResult = await runSearch([imageForSearch], dryRun) as SearchResult; + + // Auto-rerank using product description to generate Chinese keyword + let rerankResult: any = undefined; + if (!dryRun && searchResult.status === 'success' && searchResult.searchBody) { + // Save search body to temp file for rerank + const tmpFile = path.join(path.dirname(imageForSearch), `search_body_${Date.now()}.json`); + try { + fs.writeFileSync(tmpFile, JSON.stringify(searchResult.searchBody)); + const rerankArgs = [ + `--image-results=${tmpFile}`, + `--description=${best.description}`, + '--top=10', + ]; + rerankResult = await runRerank(rerankArgs, dryRun); + } catch (e: any) { + rerankResult = { error: e.message }; + } finally { + try { fs.unlinkSync(tmpFile); } catch {} + } + } + + return { + ...detectResult, + command: 'detect-and-search', + searchHttpStatus: searchResult.searchHttpStatus, + searchBody: searchResult.searchBody, + searchError: searchResult.error, + rerank: rerankResult, + } as any; +} + +function parseDetectOptions(videoPath: string, args: string[]): DetectOptions { + const outputDir = getFlag(args, '--output-dir') || path.join( + path.dirname(videoPath), + `snapshots_${path.basename(videoPath, path.extname(videoPath))}_${Date.now()}`, + ); + + return { + videoPath, + intervalSeconds: parseInt(getFlag(args, '--interval') || '1', 10), + maxFrames: parseInt(getFlag(args, '--max-frames') || '60', 10), + outputDir, + minConfidence: 
parseFloat(getFlag(args, '--min-confidence') || '0.7'), + concurrency: parseInt(getFlag(args, '--concurrency') || '5', 10), + }; +} + +function getFlag(args: string[], flag: string): string | undefined { + for (const arg of args) { + if (arg.startsWith(`${flag}=`)) return arg.slice(flag.length + 1); + } + return undefined; +} + +function createVisionModel() { + const apiKey = process.env.VISION_API_KEY; + if (!apiKey) throw new Error('VISION_API_KEY not set'); + const baseURL = process.env.VISION_API_BASE || undefined; + const modelName = process.env.VISION_MODEL || 'gpt-4o-mini'; + const openai = createOpenAI({ apiKey, baseURL }); + return openai(modelName); +} + +async function generateChineseKeyword(description: string): Promise { + const model = createVisionModel(); + const { text } = await generateText({ + model, + prompt: `You are generating a 1688.com (Chinese B2B wholesale) product search keyword. +Rules: +- Output ONLY 2-4 Chinese words — the product category + 1-2 key material/feature words +- Use common Chinese commerce terms, NOT a literal translation +- No English, no punctuation, no explanation +- Short broad terms work better than long specific phrases (e.g. "金属鞋架" not "黑色Z型金属网格鞋架") + +Product description: ${description} + +Output only the search query:`, + }); + return text.trim().replace(/[^\u4e00-\u9fff\u3400-\u4dbf]/g, '').trim(); +} + +async function keywordSearch(keyword: string, page = 1): Promise { + const endpoint = process.env.ONEBOUND_KEYWORD_SEARCH_ENDPOINT; + if (!endpoint) throw new Error('ONEBOUND_KEYWORD_SEARCH_ENDPOINT not set'); + + const res = await fetch(endpoint, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ keyword, page }), + }); + const json = await res.json() as any; + return (json?.data?.items?.item ?? 
[]) as SearchItem[]; +} + +function hasChinese(str: string): boolean { + return /[\u4e00-\u9fff]/.test(str); +} + +function extractKeywordsFromTitles(items: SearchItem[], topN = 5): string { + // Pull the most repeated 2-char Chinese bigrams from top item titles as a fallback keyword + const freq: Record = {}; + for (const item of items.slice(0, topN)) { + const title = item.title || ''; + for (let i = 0; i < title.length - 1; i++) { + const bigram = title.slice(i, i + 2); + if (/[\u4e00-\u9fff]{2}/.test(bigram)) { + freq[bigram] = (freq[bigram] || 0) + 1; + } + } + } + return Object.entries(freq) + .sort((a, b) => b[1] - a[1]) + .slice(0, 3) + .map(([k]) => k) + .join(''); +} + +async function runRerank(args: string[], dryRun: boolean): Promise { + // --image-results= --keyword= --top= + const imageResultsArg = getFlag(args, '--image-results') || args[0]; + const keywordArg = getFlag(args, '--keyword') || args[1]; + const topN = parseInt(getFlag(args, '--top') || '10', 10); + + const description = getFlag(args, '--description') || ''; + + if (!imageResultsArg) return { status: 'failed', command: 'rerank', dryRun, error: 'rerank requires --image-results=' }; + + if (dryRun) return { status: 'success', command: 'rerank', dryRun } as any; + + // Load image search results + let imageItems: SearchItem[]; + try { + const raw = fs.existsSync(imageResultsArg) + ? fs.readFileSync(imageResultsArg, 'utf-8') + : imageResultsArg; + const parsed = JSON.parse(raw); + imageItems = parsed?.data?.items?.item ?? parsed?.items?.item ?? (Array.isArray(parsed) ? 
parsed : []); + } catch { + return { status: 'failed', command: 'rerank', dryRun, error: 'failed to parse image-results JSON' }; + } + + if (!imageItems.length) { + return { status: 'failed', command: 'rerank', dryRun, error: 'no items found in image-results JSON' }; + } + + // Determine Chinese keyword to use + let keyword = keywordArg || ''; + let autoGeneratedKeyword = ''; + + if (!hasChinese(keyword)) { + // Prefer product description for accurate translation; fall back to image titles + const sourceText = description || keyword || extractKeywordsFromTitles(imageItems); + try { + autoGeneratedKeyword = await generateChineseKeyword(sourceText); + } catch { + autoGeneratedKeyword = extractKeywordsFromTitles(imageItems); + } + keyword = autoGeneratedKeyword; + } + + // Keyword search on 1688 + let keywordItems: SearchItem[] = []; + try { + keywordItems = await keywordSearch(keyword); + } catch (e: any) { + return { status: 'failed', command: 'rerank', dryRun, error: `keyword search failed: ${e.message}` }; + } + + // Intersect by num_iid + const keywordIds = new Set(keywordItems.map((i) => String(i.num_iid))); + const intersected = imageItems.filter((i) => keywordIds.has(String(i.num_iid))); + + // If still no intersection, fall back to keyword results (at least they match the category) + const usedFallback = intersected.length === 0; + const results = usedFallback ? keywordItems : intersected; + + // Sort by turn_head descending (click-through rate signal) + const sorted = results + .sort((a, b) => parseFloat(String(b.turn_head ?? '0')) - parseFloat(String(a.turn_head ?? 
'0'))) + .slice(0, topN); + + return { + status: 'success', + command: 'rerank', + dryRun, + keyword, + autoGeneratedKeyword: autoGeneratedKeyword || undefined, + imageResultsCount: imageItems.length, + keywordResultsCount: keywordItems.length, + intersectedCount: intersected.length, + usedFallback, + results: sorted, + } as any; +} + +function parseJsonSafe(text: string): unknown { + try { return JSON.parse(text); } catch { return text; } +} diff --git a/src/product-detector.ts b/src/product-detector.ts new file mode 100644 index 0000000..bd34eb1 --- /dev/null +++ b/src/product-detector.ts @@ -0,0 +1,179 @@ +import { generateObject } from 'ai'; +import { createOpenAI } from '@ai-sdk/openai'; +import { z } from 'zod'; +import type { ExtractedFrame } from './frame-extractor.ts'; +import type { ProductFrame } from './types.ts'; +import { imageToBase64 } from './frame-extractor.ts'; + +// Pass 1: quick filter — discard frames that clearly have no product +const FilterSchema = z.object({ + keep: z.boolean(), + reason: z.enum(['product_visible', 'content_only', 'hands_only', 'blur', 'transition', 'background_only']), +}); + +// Pass 2: comparative ranking across all candidates +const RankingSchema = z.object({ + bestFrameIndex: z.number().int(), + description: z.string(), + reasoning: z.string(), + // normalized 0-1 relative to image dimensions: [x1, y1, x2, y2] + boundingBox: z.tuple([z.number(), z.number(), z.number(), z.number()]), +}); + +const FILTER_PROMPT = `You are filtering frames from a TikTok/Douyin ecommerce product video. + +Keep a frame (keep=true) ONLY if the HERO PRODUCT (the main item being sold) is at least partially visible as a recognizable object. +Discard (keep=false) if: only hands/texture/contents visible, motion blur, black/transition frame, or no product at all. 
+ +reason options: product_visible | content_only | hands_only | blur | transition | background_only`; + +const RANKING_PROMPT = (count: number) => `You are selecting the single best product image from ${count} video frames for ecommerce image search. + +The frames are numbered 0 to ${count - 1} in the order shown. + +Pick the ONE frame where the HERO PRODUCT is: +1. Cleanest — fewest distractions, no hands blocking it, no clutter in foreground +2. Most complete — full product silhouette visible, no edges cropped +3. Most isolated — product stands out from background clearly +4. Empty/minimal load preferred — a product without contents (e.g. an empty rack) beats one stuffed with items if both show the full structure equally + +Return: +- bestFrameIndex: 0-based index of chosen frame +- description: concise search query under 12 words (product type + material + color + key feature) +- reasoning: one sentence explaining why this frame was chosen +- boundingBox: tight bounding box of the HERO PRODUCT ONLY in the chosen frame as [x1, y1, x2, y2] normalized 0.0–1.0 (top-left origin). Exclude hands, background, and unrelated objects. The product is assumed to be near the center.`; + +function createVisionModel() { + const apiKey = process.env.VISION_API_KEY; + if (!apiKey) throw new Error('VISION_API_KEY not set'); + + const provider = createOpenAI({ + apiKey, + baseURL: process.env.VISION_API_BASE, + }); + + return provider(process.env.VISION_MODEL ?? 
'gpt-4o-mini'); +} + +async function filterFrame( + frame: ExtractedFrame, + model: ReturnType>, +): Promise { + const base64Image = imageToBase64(frame.imagePath); + + const { object } = await generateObject({ + model, + schema: FilterSchema, + messages: [{ + role: 'user', + content: [ + { type: 'image', image: `data:image/jpeg;base64,${base64Image}` }, + { type: 'text', text: FILTER_PROMPT }, + ], + }], + }); + + return object.keep; +} + +async function rankCandidates( + candidates: ExtractedFrame[], + model: ReturnType>, +): Promise<{ bestFrame: ExtractedFrame; description: string; reasoning: string; boundingBox: [number, number, number, number] }> { + const imageContent = candidates.map((f) => ({ + type: 'image' as const, + image: `data:image/jpeg;base64,${imageToBase64(f.imagePath)}`, + })); + + const { object } = await generateObject({ + model, + schema: RankingSchema, + mode: 'json', + messages: [{ + role: 'user', + content: [ + ...imageContent, + { type: 'text', text: RANKING_PROMPT(candidates.length) }, + ], + }], + }); + + const idx = Math.max(0, Math.min(object.bestFrameIndex, candidates.length - 1)); + return { + bestFrame: candidates[idx], + description: object.description, + reasoning: object.reasoning, + boundingBox: object.boundingBox, + }; +} + +export async function cropProduct( + imagePath: string, + boundingBox: [number, number, number, number], + outputPath: string, + paddingFactor = 0.05, +): Promise { + const sharp = (await import('sharp')).default; + const meta = await sharp(imagePath).metadata(); + const W = meta.width!; + const H = meta.height!; + + let [x1, y1, x2, y2] = boundingBox; + + // add padding + const pw = (x2 - x1) * paddingFactor; + const ph = (y2 - y1) * paddingFactor; + x1 = Math.max(0, x1 - pw); + y1 = Math.max(0, y1 - ph); + x2 = Math.min(1, x2 + pw); + y2 = Math.min(1, y2 + ph); + + const left = Math.round(x1 * W); + const top = Math.round(y1 * H); + const width = Math.round((x2 - x1) * W); + const height = Math.round((y2 
- y1) * H); + + await sharp(imagePath) + .extract({ left, top, width, height }) + .jpeg({ quality: 95 }) + .toFile(outputPath); + + return outputPath; +} + +export async function detectProductFrames( + frames: ExtractedFrame[], + minConfidence: number, + concurrency: number = 5, +): Promise { + const model = createVisionModel(); + + // Pass 1: parallel filter — discard junk frames + const keepFlags: boolean[] = []; + for (let i = 0; i < frames.length; i += concurrency) { + const chunk = frames.slice(i, i + concurrency); + const flags = await Promise.all( + chunk.map((f) => filterFrame(f, model).catch(() => false)) + ); + keepFlags.push(...flags); + } + + const candidates = frames.filter((_, i) => keepFlags[i]); + if (candidates.length === 0) return []; + + // Pass 2: single comparative call — model sees all candidates at once + const { bestFrame, description, reasoning, boundingBox } = await rankCandidates(candidates, model); + + const croppedPath = bestFrame.imagePath.replace(/\.jpg$/, '_cropped.jpg'); + await cropProduct(bestFrame.imagePath, boundingBox, croppedPath); + + return [{ + frameIndex: bestFrame.frameIndex, + timestampSeconds: bestFrame.timestampSeconds, + imagePath: bestFrame.imagePath, + croppedImagePath: croppedPath, + confidence: 0.95, + description, + boundingHint: reasoning, + }]; +} diff --git a/src/types.ts b/src/types.ts new file mode 100644 index 0000000..5632ac3 --- /dev/null +++ b/src/types.ts @@ -0,0 +1,54 @@ +export type Command = 'detect' | 'search' | 'detect-and-search' | 'rerank' | 'session'; + +export interface SearchItem { + num_iid: number; + title: string; + pic_url: string; + price: string; + promotion_price?: string; + sales?: number; + turn_head?: string; + detail_url: string; +} + +export interface DetectOptions { + videoPath: string; + intervalSeconds: number; + maxFrames: number; + outputDir: string; + minConfidence: number; + concurrency: number; +} + +export interface ProductFrame { + frameIndex: number; + timestampSeconds: 
number; + imagePath: string; + croppedImagePath?: string; + confidence: number; + description: string; + boundingHint?: string; +} + +export interface DetectResult { + status: 'success' | 'failed'; + command: Command; + dryRun: boolean; + videoPath?: string; + totalFramesExtracted?: number; + productFrames?: ProductFrame[]; + bestSnapshot?: ProductFrame; + error?: string; +} + +export interface SearchResult { + status: 'success' | 'failed'; + command: Command; + dryRun: boolean; + imagePath?: string; + searchHttpStatus?: number; + searchBody?: unknown; + error?: string; +} + +export type OutputResult = DetectResult | SearchResult;