Compare commits

..

No commits in common. "main" and "v0.0.2" have entirely different histories.
main ... v0.0.2

5 changed files with 276 additions and 379 deletions

118
SKILL.md
View File

@ -1,97 +1,75 @@
--- ---
name: 1688-logistics-scraper name: 1688-logistics-scraper
description: "Scrape 1688 product pages via Chrome, capture full-page screenshots and detail images for vision-based extraction of weight/size data. Use when the user provides a 1688 product URL and needs logistics specs." description: "Extract product weight/size/logistics data from 1688 product pages via Chrome browser, output structured JSON. Use when the user provides a 1688 product URL and needs logistics specs."
--- ---
# 1688 Logistics Scraper # 1688 Logistics Scraper
Capture 1688 product pages and extract weight/size data via vision. Extract product weight, size, and logistics data from 1688 product pages.
## Run ## Run
```bash ```bash
bun scripts/run.ts scrape <url> [--dry-run] [--port=9222] bun scripts/run.ts scrape <url> [--dry-run]
```
### Examples
```bash
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html'
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dry-run
``` ```
## What It Does ## What It Does
1. Opens the 1688 product URL in the browser (default port 18800) 1. Opens the 1688 product URL in the browser
2. Scrolls through the entire page, capturing full-page screenshots 2. Extracts weight/size data from wherever it appears on the page — product attributes, variant specs, logistics section
3. Downloads all product detail images 3. Downloads detail images (商品详情图片) for analysis — weight/size is often only in images
4. Saves to `/tmp/1688-logistics/<offer-id>/` 4. Outputs structured JSON
## After Running — MUST follow ## Where To Look For Data
Read ALL screenshots and detail images, then output the following JSON structure. This is the final output for API consumption. Weight/size data on 1688 pages hides in multiple places. Check all before giving up:
1. **Product attributes** (商品属性 / 商品参数) — key-value table, most reliable
2. **Variant/SKU specs** — per-variant weight or size
3. **Logistics section** — shipping weight, volume, freight info
4. **Detail images** — downloaded to `/tmp/1688-logistics/<offer-id>/`, read them to find weight/size text baked into images
## Output
```json ```json
{ {
"offerId": "966107271425", "status": "success",
"url": "https://detail.1688.com/offer/966107271425.html", "url": "https://detail.1688.com/offer/...",
"title": "商品标题", "product": {
"weight": { "title": "产品标题",
"value": 0.15, "logistics": {
"unit": "kg", "weight": { "value": 0.5, "unit": "kg", "source": "attributes" },
"source": "商品属性" "dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "attributes" },
"grossWeight": null,
"netWeight": null,
"packageWeight": null,
"volume": null,
"shippingMethod": null,
"shippingCost": null,
"origin": null
},
"variants": [
{ "name": "颜色: 红色", "weight": null, "dimensions": null }
]
}, },
"grossWeight": { "detailImages": ["/tmp/1688-logistics/852504650877/img_001.jpg"],
"value": 0.2, "rawAttributes": { "重量": "0.5kg", "尺寸": "30*20*10cm" }
"unit": "kg",
"source": "商品件重尺"
},
"netWeight": {
"value": 0.15,
"unit": "kg",
"source": "商品属性"
},
"dimensions": {
"length": 10,
"width": 8,
"height": 1.8,
"unit": "cm",
"source": "商品属性"
},
"volume": {
"value": 0.000144,
"unit": "m³",
"source": "商品件重尺"
},
"packageWeight": {
"value": 5.0,
"unit": "kg",
"source": "包装信息"
},
"packageDimensions": {
"length": 40,
"width": 30,
"height": 20,
"unit": "cm",
"source": "包装信息"
},
"unitsPerPackage": 50,
"variants": [
{
"name": "12支装",
"weight": { "value": 0.12, "unit": "kg" },
"dimensions": { "length": 9.5, "width": 6, "height": 2.2, "unit": "cm" }
}
]
} }
``` ```
### Field rules `null` = not found in text. Check `detailImages` — the data may be in the images.
- **All weight values normalized to kg** (克÷1000, 斤×0.5)
- **All dimension values normalized to cm** (mm÷10)
- **`source`**: where on the page the data was found (商品属性 / 商品件重尺 / 包装信息 / 详情图片)
- **`variants`**: only include if weight/size differs per SKU. Omit if all variants share the same specs.
- **Omit fields that are `null`** — do not include fields where no data was found
- **Do not guess.** Only include values actually visible on the page or in images.
## Rules ## Rules
1. If the browser is not running, report the error. Do not try to launch it. 1. If the browser is not running, report the error. Do not try to launch it.
2. No retries. If it fails, report as-is. 2. Check all data sources before reporting `null`.
3. Read ALL screenshots — logistics data can appear anywhere on the page. 3. Normalize units: 克→kg, 毫米→cm. Keep raw values in `rawAttributes`.
4. Read detail images too — weight/size is often baked into product photos. 4. No retries. If it fails, report as-is.
5. Output ONLY the structured JSON above. No extra commentary. 5. Trust page content. Do not guess values.

View File

@ -2,26 +2,8 @@
set -euo pipefail set -euo pipefail
cd "$(dirname "$0")" cd "$(dirname "$0")"
# Auto-install auth-rt if not found.
# The install is attempted only when auth-rt is neither on PATH nor present as
# an executable at ~/.local/bin/auth-rt.
# NOTE(review): the binary is fetched over plain HTTP from a fixed host and no
# checksum or signature is verified before it is made executable.
if ! command -v auth-rt &>/dev/null && [ ! -x "$HOME/.local/bin/auth-rt" ]; then
echo "auth-rt not found, installing..."
# Base URL of the server that hosts both the release binaries and the git repository.
_FORGEJO="http://192.168.0.108:3030"
# Lower-cased kernel name from `uname -s`, used in the release asset name.
_OS="$(uname -s | tr '[:upper:]' '[:lower:]')"
# Map x86_64 -> amd64 and aarch64 -> arm64; any other `uname -m` value is used as-is.
_ARCH="$(uname -m)"; case "$_ARCH" in x86_64) _ARCH="amd64";; aarch64) _ARCH="arm64";; esac
_URL="$_FORGEJO/agent-skills/auth-runtime/releases/download/latest/auth-rt-${_OS}-${_ARCH}"
mkdir -p "$HOME/.local/bin"
# First choice: download the prebuilt binary (-f makes HTTP errors fail; curl's stderr is discarded).
if curl -fsSL "$_URL" -o "$HOME/.local/bin/auth-rt" 2>/dev/null; then
chmod +x "$HOME/.local/bin/auth-rt"
echo "auth-rt installed (downloaded)"
else
# Fallback: update an existing checkout (fast-forward only) or shallow-clone the
# repository, then run its own install.sh.
echo "Download failed, building from source..."
_SRC="$HOME/.local/share/auth-runtime"
if [ -d "$_SRC/.git" ]; then git -C "$_SRC" pull --ff-only
else git clone --depth 1 "$_FORGEJO/agent-skills/auth-runtime.git" "$_SRC"
fi
bash "$_SRC/install.sh"
fi
fi
bun install bun install
echo "1688-logistics-scraper installed." echo "1688-logistics-scraper installed."
echo ""
echo "Prerequisites: Chrome must be running with remote debugging:"
echo " /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222"

View File

@ -12,14 +12,17 @@ Commands:
Examples: Examples:
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html'
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dry-run bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dry-run
bun scripts/run.ts --port=18801 scrape 'https://detail.1688.com/offer/852504650877.html' bun scripts/run.ts --port=9223 scrape 'https://detail.1688.com/offer/852504650877.html'
Prerequisites:
Chrome must be running with --remote-debugging-port=9222
`); `);
} }
async function main(): Promise<void> { async function main(): Promise<void> {
const positionals: string[] = []; const positionals: string[] = [];
let dryRun = false; let dryRun = false;
let port = 18800; let port = 9222;
for (const arg of process.argv.slice(2)) { for (const arg of process.argv.slice(2)) {
if (arg === '--dry-run') { if (arg === '--dry-run') {

View File

@ -1,119 +0,0 @@
/**
* Thin CLI wrapper for auth-runtime.
*
* Copy this file into your skill's src/ directory. It calls the
* `auth-rt` binary (a standalone Go executable), so the skill has
* zero npm/runtime dependency on auth-runtime.
*
* Prerequisites:
* `auth-rt` must be in PATH or at ~/.local/bin/auth-rt
* (install.sh handles this automatically)
*
* Usage:
* import { createSkillClient } from './auth-cli.ts';
* const client = createSkillClient();
* const res = await client.post('/ecom/tasks/scrape', { url: '...' });
*/
import { spawnSync } from 'child_process';
import * as path from 'path';
import * as os from 'os';
// Home directory used for the fallback binary location.
const home = process.env.HOME || os.homedir();

/**
 * Locate the auth-rt executable when no explicit override is configured.
 *
 * Asks `which auth-rt` first; if that exits non-zero or prints nothing, falls
 * back to ~/.local/bin/auth-rt.
 */
function locateAuthRt() {
  const probe = spawnSync('which', ['auth-rt'], { encoding: 'utf-8' });
  // stdout is only consulted when `which` succeeded (it is null on spawn failure).
  const found = probe.status === 0 ? probe.stdout.trim() : '';
  return found || path.join(home, '.local', 'bin', 'auth-rt');
}

// The AUTH_RT_BIN environment variable wins over any discovery.
const AUTH_RT_BIN = process.env.AUTH_RT_BIN || locateAuthRt();
/**
 * Result of a request made through SkillClient.
 *
 * Outside dry-run mode this is whatever JSON the auth-rt CLI prints for a
 * `request` command, parsed without further validation.
 */
export interface ApiResponse {
// Numeric status (dry-run mode always reports 200) — presumably the HTTP status code; TODO confirm.
status: number;
// Response payload as text; dry-run mode puts a JSON-encoded echo of the request here.
body: string;
}
/**
 * Result of SkillClient.session(): the parsed JSON output of `auth-rt session`.
 *
 * Dry-run mode returns only `accessToken` ('<dry-run-token>') and `expiresIn` (900).
 */
export interface SessionResponse {
accessToken: string;
// Token lifetime — presumably in seconds; TODO confirm against auth-rt.
expiresIn: number;
// Optional fields; their meaning is defined by auth-rt and is not visible in this file.
ownerSessionToken?: string;
hookUrl?: string;
hookToken?: string;
}
/** Construction options for SkillClient. */
export interface SkillClientOptions {
// When set, forwarded to auth-rt as `--api-base <value>` on every request.
apiBase?: string;
// When true, no auth-rt process is spawned and canned responses are returned (defaults to false).
dryRun?: boolean;
}
/**
 * Run the auth-rt binary synchronously and return its trimmed stdout.
 *
 * @param args Command-line arguments passed to auth-rt verbatim.
 * @returns Trimmed stdout of the process ('' when it printed nothing).
 * @throws Error when the process cannot be spawned or exceeds the 60 s timeout
 *   (`result.error`), when it is terminated by a signal, or when it exits with
 *   a non-zero status. The message includes trimmed stderr.
 */
function runCli(...args: string[]): string {
  const result = spawnSync(AUTH_RT_BIN, args, {
    encoding: 'utf-8',
    timeout: 60_000,
  });

  if (result.error) {
    throw new Error(`auth-rt spawn failed: ${result.error.message}`);
  }

  if (result.status !== 0) {
    // spawnSync reports status === null when the child was killed by a signal;
    // name the signal instead of printing the misleading "exit null".
    const outcome = result.status === null
      ? `signal ${result.signal ?? 'unknown'}`
      : `exit ${result.status}`;
    throw new Error(`auth-rt failed (${outcome}): ${(result.stderr || '').trim()}`);
  }

  return (result.stdout || '').trim();
}
/**
 * Client that delegates authenticated API calls to the auth-rt CLI.
 *
 * In dry-run mode no process is spawned: session() returns a placeholder
 * token and every request resolves to status 200 with a JSON echo of the
 * method and path.
 */
export class SkillClient {
  private readonly apiBase?: string;
  private readonly dryRun: boolean;

  constructor(options: SkillClientOptions = {}) {
    const { apiBase, dryRun } = options;
    this.apiBase = apiBase;
    this.dryRun = dryRun ?? false;
  }

  /** Fetch session details via `auth-rt session` (placeholder in dry-run mode). */
  async session(): Promise<SessionResponse> {
    return this.dryRun
      ? { accessToken: '<dry-run-token>', expiresIn: 900 }
      : JSON.parse(runCli('session'));
  }

  /** Issue a GET request for `urlPath`. */
  async get(urlPath: string): Promise<ApiResponse> {
    return this.request('GET', urlPath);
  }

  /** Issue a POST request with an optional JSON-serializable body. */
  async post(urlPath: string, body?: unknown): Promise<ApiResponse> {
    return this.request('POST', urlPath, body);
  }

  /** Issue a PUT request with an optional JSON-serializable body. */
  async put(urlPath: string, body?: unknown): Promise<ApiResponse> {
    return this.request('PUT', urlPath, body);
  }

  /** Issue a PATCH request with an optional JSON-serializable body. */
  async patch(urlPath: string, body?: unknown): Promise<ApiResponse> {
    return this.request('PATCH', urlPath, body);
  }

  /** Issue a DELETE request with an optional JSON-serializable body. */
  async delete(urlPath: string, body?: unknown): Promise<ApiResponse> {
    return this.request('DELETE', urlPath, body);
  }

  /**
   * Shared implementation behind the verb helpers: builds the
   * `auth-rt request <method> <path> [--body …] [--api-base …]` command line
   * and parses the CLI's JSON output.
   */
  private async request(method: string, urlPath: string, body?: unknown): Promise<ApiResponse> {
    if (this.dryRun) {
      const echo = { dryRun: true, method, path: urlPath };
      return { status: 200, body: JSON.stringify(echo) };
    }

    // Both null and undefined mean "no body"; any other value is serialized.
    const hasBody = body !== null && body !== undefined;
    const cliArgs = [
      'request',
      method,
      urlPath,
      ...(hasBody ? ['--body', JSON.stringify(body)] : []),
      ...(this.apiBase ? ['--api-base', this.apiBase] : []),
    ];
    return JSON.parse(runCli(...cliArgs));
  }
}
/**
 * Factory for SkillClient.
 *
 * @param options Optional client settings; omitted means the SkillClient defaults.
 * @returns A new SkillClient instance.
 */
export function createSkillClient(options?: SkillClientOptions): SkillClient {
  const client = new SkillClient(options);
  return client;
}

View File

@ -3,16 +3,50 @@ import * as path from 'path';
export type Command = 'scrape'; export type Command = 'scrape';
/** A single measured quantity (weight or volume) together with where it was found. */
export interface LogisticsValue {
// Numeric magnitude, or null when unknown.
value: number | null;
// Unit label for `value` (parseWeight converts its result to 'kg'), or null when unknown.
unit: string | null;
// Origin of the value; the parsers emit '' and run() overwrites it with 'attributes' or 'variant'.
source: string;
}
/** A length × width × height triple together with where it was found. */
export interface Dimensions {
// The three numbers in the order they appear in the matched "A*B*C" text.
length: number | null;
width: number | null;
height: number | null;
// Unit shared by all three sides (parseDimensions converts mm and m to 'cm' and assumes 'cm' when no unit is given).
unit: string | null;
// Origin of the value; the parser emits '' and run() overwrites it with 'attributes' or 'variant'.
source: string;
}
/**
 * Product-level logistics fields of a scrape result.
 *
 * Every field is null until run() finds a matching key in the page's raw attributes.
 */
export interface LogisticsData {
// Generic weight: attribute keys matching WEIGHT_KEYS that are not gross/net/package weight.
weight: LogisticsValue | null;
dimensions: Dimensions | null;
// Filled from attribute keys containing '毛重' (gross weight).
grossWeight: LogisticsValue | null;
// Filled from attribute keys containing '净重' (net weight).
netWeight: LogisticsValue | null;
// Filled from attribute keys containing '包装重量' (package weight).
packageWeight: LogisticsValue | null;
volume: LogisticsValue | null;
// NOTE(review): shippingMethod and shippingCost are never assigned by the run() code visible here.
shippingMethod: string | null;
shippingCost: string | null;
// Raw attribute text for keys containing '产地', '发货地' or '所在地'.
origin: string | null;
}
/** Weight and size parsed from the text of one SKU/variant element on the page. */
export interface VariantInfo {
// Whitespace-collapsed text of the variant element.
name: string;
// Parsed from the variant's text; null when no weight pattern matched.
weight: LogisticsValue | null;
// Parsed from the variant's text; null when no "A*B*C" pattern matched.
dimensions: Dimensions | null;
}
export interface ScrapeResult { export interface ScrapeResult {
status: 'success' | 'failed'; status: 'success' | 'failed';
url: string; url: string;
command: Command; command: Command;
dryRun: boolean; dryRun: boolean;
offerId: string; product?: {
productPackInfo?: unknown; title: string;
windowContext?: unknown; logistics: LogisticsData;
screenshots?: string[]; variants: VariantInfo[];
};
detailImages?: string[]; detailImages?: string[];
rawAttributes?: Record<string, string>;
error?: string; error?: string;
} }
@ -28,10 +62,9 @@ class CdpSession {
private ws!: WebSocket; private ws!: WebSocket;
private msgId = 0; private msgId = 0;
private pending = new Map<number, { resolve: (v: any) => void; reject: (e: Error) => void }>(); private pending = new Map<number, { resolve: (v: any) => void; reject: (e: Error) => void }>();
private eventListeners = new Map<string, Array<(params: any) => void>>();
static async connect(port: number): Promise<CdpSession> { static async connect(port: number): Promise<CdpSession> {
const resp = await fetch(`http://localhost:${port}/json`); const resp = await fetch(`http://127.0.0.1:${port}/json`);
const targets = (await resp.json()) as Array<{ webSocketDebuggerUrl: string; type: string }>; const targets = (await resp.json()) as Array<{ webSocketDebuggerUrl: string; type: string }>;
const page = targets.find(t => t.type === 'page'); const page = targets.find(t => t.type === 'page');
if (!page) throw new Error('No Chrome page tab found. Open a tab first.'); if (!page) throw new Error('No Chrome page tab found. Open a tab first.');
@ -46,20 +79,13 @@ class CdpSession {
this.ws.onopen = () => resolve(); this.ws.onopen = () => resolve();
this.ws.onerror = (e: any) => reject(new Error(`WebSocket error: ${e.message || e}`)); this.ws.onerror = (e: any) => reject(new Error(`WebSocket error: ${e.message || e}`));
this.ws.onmessage = (ev: MessageEvent) => { this.ws.onmessage = (ev: MessageEvent) => {
const msg = JSON.parse(typeof ev.data === 'string' ? ev.data : ev.data.toString()); const msg: CdpResult = JSON.parse(typeof ev.data === 'string' ? ev.data : ev.data.toString());
// Handle command responses
if (msg.id != null && this.pending.has(msg.id)) { if (msg.id != null && this.pending.has(msg.id)) {
const p = this.pending.get(msg.id)!; const p = this.pending.get(msg.id)!;
this.pending.delete(msg.id); this.pending.delete(msg.id);
if (msg.error) p.reject(new Error(msg.error.message)); if (msg.error) p.reject(new Error(msg.error.message));
else p.resolve(msg.result); else p.resolve(msg.result);
} }
// Handle events
if (msg.method && this.eventListeners.has(msg.method)) {
for (const fn of this.eventListeners.get(msg.method)!) {
fn(msg.params);
}
}
}; };
}); });
} }
@ -72,117 +98,145 @@ class CdpSession {
}); });
} }
/**
 * Wait for the next occurrence of a CDP event.
 *
 * @param event CDP event name (the `method` field of an incoming message).
 * @param timeoutMs Maximum time to wait, in milliseconds (default 30000).
 * @returns Promise resolving with the event's params; rejects with
 *   `Timeout waiting for <event>` if the event does not arrive in time.
 */
waitForEvent(event: string, timeoutMs: number = 30000): Promise<any> {
  return new Promise((resolve, reject) => {
    let timeoutHandle: ReturnType<typeof setTimeout>;

    // Stop the timer and unregister this waiter's listener (whichever path finishes first).
    const detach = (): void => {
      clearTimeout(timeoutHandle);
      const registered = this.eventListeners.get(event);
      const pos = registered ? registered.indexOf(onEvent) : -1;
      if (pos >= 0) registered!.splice(pos, 1);
    };

    const onEvent = (params: any): void => {
      detach();
      resolve(params);
    };

    timeoutHandle = setTimeout(() => {
      detach();
      reject(new Error(`Timeout waiting for ${event}`));
    }, timeoutMs);

    // Create the listener list for this event on first use, then enqueue the handler.
    let queue = this.eventListeners.get(event);
    if (!queue) {
      queue = [];
      this.eventListeners.set(event, queue);
    }
    queue.push(onEvent);
  });
}
async evaluate(expression: string): Promise<any> { async evaluate(expression: string): Promise<any> {
const res = await this.send('Runtime.evaluate', { expression, returnByValue: true }); const res = await this.send('Runtime.evaluate', { expression, returnByValue: true });
return res?.result?.value; return res?.result?.value;
} }
/**
 * Capture a screenshot of the current viewport via `Page.captureScreenshot`.
 *
 * @param format Image format passed to CDP (default 'png').
 * @returns The image bytes, decoded from the base64 `data` field of the reply.
 */
async captureScreenshot(format: string = 'png'): Promise<Buffer> {
  // captureBeyondViewport stays false, so only the visible area is captured.
  const params = { format, captureBeyondViewport: false };
  const { data } = await this.send('Page.captureScreenshot', params);
  return Buffer.from(data, 'base64');
}
close() { close() {
try { this.ws.close(); } catch {} try { this.ws.close(); } catch {}
} }
} }
// --- Helpers --- // --- Parsers ---
// Attribute-name fragments used to classify raw page attributes. run() passes them to
// matchKey, which tests whether the lower-cased attribute name contains any fragment.
// Fragments that mark an attribute as a weight.
const WEIGHT_KEYS = ['重量', '毛重', '净重', '单件重量', '包装重量', '产品重量', '单品重量', 'weight'];
// Fragments that mark an attribute as a size / dimensions entry.
const DIMENSION_KEYS = ['尺寸', '规格', '长宽高', '外箱尺寸', '包装尺寸', '产品尺寸', '大小', 'size', 'dimensions'];
// Fragments that mark an attribute as a volume / capacity entry.
const VOLUME_KEYS = ['体积', '容积', 'volume'];
function extractOfferId(url: string): string { function extractOfferId(url: string): string {
return url.match(/offer\/(\d+)/)?.[1] || 'unknown'; return url.match(/offer\/(\d+)/)?.[1] || 'unknown';
} }
async function scrollAndCapture( function parseWeight(raw: string): LogisticsValue | null {
cdp: CdpSession, const m = raw.match(/([\d.]+)\s*(kg|g|克|千克|公斤|斤)/i);
outputDir: string, if (!m) return null;
): Promise<string[]> { let value = parseFloat(m[1]);
fs.mkdirSync(outputDir, { recursive: true }); let unit = m[2].toLowerCase();
const saved: string[] = []; if (unit === 'g' || unit === '克') { value /= 1000; unit = 'kg'; }
if (unit === '千克' || unit === '公斤') unit = 'kg';
// Get page height if (unit === '斤') { value *= 0.5; unit = 'kg'; }
const pageHeight: number = await cdp.evaluate( return { value, unit, source: '' };
'Math.max(document.body.scrollHeight, document.documentElement.scrollHeight)'
) || 0;
const viewportHeight: number = await cdp.evaluate('window.innerHeight') || 900;
// Scroll through the page and capture viewport-sized screenshots
// Use 80% step to overlap slightly and avoid missing content at boundaries
const step = Math.floor(viewportHeight * 0.8);
let scrollY = 0;
let idx = 1;
while (scrollY < pageHeight) {
await cdp.evaluate(`window.scrollTo(0, ${scrollY})`);
await new Promise(r => setTimeout(r, 800)); // wait for lazy-load render
const buf = await cdp.captureScreenshot('png');
const filePath = path.join(outputDir, `page_${String(idx).padStart(3, '0')}.png`);
fs.writeFileSync(filePath, buf);
saved.push(filePath);
scrollY += step;
idx++;
}
return saved;
} }
async function downloadDetailImages( function parseDimensions(raw: string): Dimensions | null {
cdp: CdpSession, const m = raw.match(/([\d.]+)\s*[*xX×]\s*([\d.]+)\s*[*xX×]\s*([\d.]+)\s*(cm|mm|毫米|厘米|m|米)?/i);
outputDir: string, if (!m) return null;
): Promise<string[]> { let [l, w, h] = [parseFloat(m[1]), parseFloat(m[2]), parseFloat(m[3])];
// Get all detail image URLs from the page let unit = (m[4] || 'cm').toLowerCase();
const imgUrls: string[] = JSON.parse(await cdp.evaluate(` if (unit === 'mm' || unit === '毫米') { l /= 10; w /= 10; h /= 10; unit = 'cm'; }
(function() { if (unit === '厘米') unit = 'cm';
const imgs = [], seen = new Set(); if (unit === 'm' || unit === '米') { l *= 100; w *= 100; h *= 100; unit = 'cm'; }
document.querySelectorAll('img').forEach(img => { return { length: l, width: w, height: h, unit, source: '' };
const src = img.src || img.dataset.src || img.dataset.lazySrc || ''; }
if (src && !seen.has(src) && (src.startsWith('http') || src.startsWith('//'))) {
// Filter for product detail images (skip tiny icons/avatars)
if (img.naturalWidth > 200 || img.width > 200 || !img.complete) {
seen.add(src);
imgs.push(src.startsWith('//') ? 'https:' + src : src);
}
}
});
return JSON.stringify(imgs);
})()
`) || '[]');
/**
 * Parse a volume such as "0.5 m³", "500ml" or "2升" out of free text.
 *
 * The first "<number><optional spaces><unit>" occurrence wins. The unit is
 * returned exactly as written in the text; no unit conversion is performed.
 *
 * @param raw Text that may contain a volume.
 * @returns The parsed value with an empty `source` (callers fill it in), or
 *   null when no volume is present.
 */
function parseVolume(raw: string): LogisticsValue | null {
  // Require at least one digit in the number: the looser [\d.]+ also accepted a
  // lone "." (e.g. "No. L"), which parseFloat turned into NaN.
  const m = raw.match(/(\d+(?:\.\d*)?|\.\d+)\s*(m³|cm³|L|ml|升|毫升|立方米|立方厘米)/i);
  if (!m) return null;

  const value = parseFloat(m[1]);
  if (!Number.isFinite(value)) return null;

  return { value, unit: m[2], source: '' };
}
/**
 * Case-insensitive substring test against a list of candidate keys.
 *
 * @param text Text to search in (e.g. an attribute name).
 * @param keys Candidate fragments.
 * @returns true when `text` contains at least one of `keys`, ignoring case.
 */
function matchKey(text: string, keys: string[]): boolean {
  const haystack = text.toLowerCase();
  for (const candidate of keys) {
    if (haystack.includes(candidate.toLowerCase())) {
      return true;
    }
  }
  return false;
}
// --- Page extraction ---
// Script evaluated inside the product page. It collects attribute key/value pairs
// and returns them as a JSON-encoded object string.
//  1. For each candidate selector, every matched element's text is split at its
//     FIRST colon — ASCII ":" or full-width "：" — into key and value. Elements
//     without a colon are ignored. (Splitting on ASCII ":" alone dropped rows
//     written with the full-width colon.)
//  2. Every table row with at least two cells contributes cell[0] -> cell[1].
// Later matches overwrite earlier ones that share the same key.
const JS_EXTRACT_ATTRS = `
(function() {
  const attrs = {};
  const sels = [
    '.detail-attributes-list .attributes-item',
    '.obj-leading .obj-content li',
    '#mod-detail-attributes .attribute-item',
    '.detail-info table tr',
    '[class*="attribute"] li',
    '[class*="param"] li',
    '.offer-attr-list .offer-attr-item',
  ];
  for (const sel of sels) {
    document.querySelectorAll(sel).forEach(el => {
      const text = el.textContent.trim();
      const sep = text.search(/[:：]/);
      if (sep >= 0) attrs[text.slice(0, sep).trim()] = text.slice(sep + 1).trim();
    });
  }
  document.querySelectorAll('table tr, .detail-attributes-list tr').forEach(tr => {
    const cells = tr.querySelectorAll('td, th');
    if (cells.length >= 2) attrs[cells[0].textContent.trim()] = cells[1].textContent.trim();
  });
  return JSON.stringify(attrs);
})()`;
// Script evaluated inside the product page. It collects SKU/variant elements and
// returns a JSON-encoded array of { name, text }:
//   name - the element's text with whitespace runs collapsed to single spaces
//   text - the element's raw textContent (later parsed for weight/dimensions)
// Elements whose collapsed text is empty or 200+ characters long are skipped.
// The selectors overlap (e.g. '.sku-item' also matches '[class*="sku"] [class*="item"]'),
// so each DOM element is recorded at most once via the `seen` set; without it the
// same variant was emitted once per matching selector.
const JS_EXTRACT_VARIANTS = `
(function() {
  const variants = [];
  const seen = new Set();
  const sels = [
    '.sku-item-wrapper .sku-item',
    '[class*="sku"] [class*="item"]',
    '.obj-sku .obj-content li',
    '.unit-detail-spec-operator .spec-item',
  ];
  for (const sel of sels) {
    document.querySelectorAll(sel).forEach(el => {
      if (seen.has(el)) return;
      seen.add(el);
      const name = el.textContent.trim().replace(/\\s+/g, ' ');
      if (name && name.length < 200) variants.push({ name, text: el.textContent });
    });
  }
  return JSON.stringify(variants);
})()`;
// Script evaluated inside the product page. It tries the listed title selectors in
// order and returns the trimmed text of the first matching element that has
// non-empty text; if none qualifies it falls back to document.title (or '').
const JS_EXTRACT_TITLE = `
(function() {
for (const sel of ['.title-text','.detail-title-text','h1[class*="title"]','.mod-detail-title h1','.d-title']) {
const el = document.querySelector(sel);
if (el && el.textContent.trim()) return el.textContent.trim();
}
return document.title || '';
})()`;
// Script evaluated inside the product page. It gathers image URLs from the listed
// detail/description containers and returns them as a JSON-encoded string array.
// For each <img> the URL is taken from src, then data-src, then data-lazy-src.
// Only URLs starting with "http" or "//" are kept; protocol-relative URLs get an
// "https:" prefix. Duplicates are dropped by comparing the URL as read from the
// element (before the prefix is added).
const JS_EXTRACT_IMAGES = `
(function() {
const imgs = [], seen = new Set();
const sels = [
'#desc-lazyload-container img',
'.detail-desc-decorate-richtext img',
'[class*="detail-desc"] img',
'.mod-detail-description img',
'.offer-attr-item img',
'.desc-img-loaded img',
];
for (const sel of sels) {
document.querySelectorAll(sel).forEach(img => {
const src = img.src || img.dataset.src || img.dataset.lazySrc || '';
if (src && !seen.has(src) && (src.startsWith('http') || src.startsWith('//'))) {
seen.add(src);
imgs.push(src.startsWith('//') ? 'https:' + src : src);
}
});
}
return JSON.stringify(imgs);
})()`;
async function downloadImages(urls: string[], outputDir: string): Promise<string[]> {
fs.mkdirSync(outputDir, { recursive: true }); fs.mkdirSync(outputDir, { recursive: true });
const saved: string[] = []; const saved: string[] = [];
for (let i = 0; i < imgUrls.length; i++) { for (let i = 0; i < urls.length; i++) {
try { try {
const resp = await fetch(imgUrls[i]); const resp = await fetch(urls[i]);
if (!resp.ok) continue; if (!resp.ok) continue;
const buf = Buffer.from(await resp.arrayBuffer()); const buf = Buffer.from(await resp.arrayBuffer());
const ext = imgUrls[i].match(/\.(jpg|jpeg|png|webp|gif)/i)?.[1] || 'jpg'; const ext = urls[i].match(/\.(jpg|jpeg|png|webp|gif)/i)?.[1] || 'jpg';
const p = path.join(outputDir, `img_${String(i + 1).padStart(3, '0')}.${ext}`); const p = path.join(outputDir, `img_${String(i + 1).padStart(3, '0')}.${ext}`);
fs.writeFileSync(p, buf); fs.writeFileSync(p, buf);
saved.push(p); saved.push(p);
@ -197,24 +251,30 @@ export async function run(
command: Command, command: Command,
args: string[], args: string[],
dryRun: boolean, dryRun: boolean,
cdpPort: number = 18800, cdpPort: number = 9222,
): Promise<ScrapeResult> { ): Promise<ScrapeResult> {
if (command !== 'scrape') { if (command !== 'scrape') {
return { status: 'failed', url: '', command, dryRun, offerId: '', error: `unknown command: ${command}` }; return { status: 'failed', url: '', command, dryRun, error: `unknown command: ${command}` };
} }
const url = args[0]; const url = args[0];
if (!url) { if (!url) {
return { status: 'failed', url: '', command, dryRun, offerId: '', error: 'scrape requires <url>' }; return { status: 'failed', url: '', command, dryRun, error: 'scrape requires <url>' };
} }
const offerId = extractOfferId(url);
if (dryRun) { if (dryRun) {
return { return {
status: 'success', url, command, dryRun, offerId, status: 'success', url, command, dryRun,
screenshots: [], product: {
title: '<dry-run>',
logistics: {
weight: null, dimensions: null, grossWeight: null, netWeight: null,
packageWeight: null, volume: null, shippingMethod: null, shippingCost: null, origin: null,
},
variants: [],
},
detailImages: [], detailImages: [],
rawAttributes: {},
}; };
} }
@ -224,76 +284,69 @@ export async function run(
await cdp.send('Page.enable'); await cdp.send('Page.enable');
await cdp.send('Runtime.enable'); await cdp.send('Runtime.enable');
await cdp.send('Page.navigate', { url });
// Set wide PC viewport to ensure tables fit without horizontal overflow // Wait for load
await cdp.send('Emulation.setDeviceMetricsOverride', { await new Promise(r => setTimeout(r, 5000));
width: 1920,
height: 1080, const title: string = await cdp.evaluate(JS_EXTRACT_TITLE) || '';
deviceScaleFactor: 2, const rawAttributes: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_ATTRS) || '{}');
mobile: false, const rawVariants: Array<{ name: string; text: string }> = JSON.parse(await cdp.evaluate(JS_EXTRACT_VARIANTS) || '[]');
const imgUrls: string[] = JSON.parse(await cdp.evaluate(JS_EXTRACT_IMAGES) || '[]');
const variants: VariantInfo[] = rawVariants.map(v => {
const weight = parseWeight(v.text);
const dimensions = parseDimensions(v.text);
if (weight) weight.source = 'variant';
if (dimensions) dimensions.source = 'variant';
return { name: v.name, weight, dimensions };
}); });
// Navigate and wait for page load event const logistics: LogisticsData = {
const loadPromise = cdp.waitForEvent('Page.loadEventFired', 30000); weight: null, dimensions: null, grossWeight: null, netWeight: null,
await cdp.send('Page.navigate', { url }); packageWeight: null, volume: null, shippingMethod: null, shippingCost: null, origin: null,
await loadPromise; };
// Wait for networkIdle — poll until no pending requests for 1s for (const [key, val] of Object.entries(rawAttributes)) {
await cdp.evaluate(` if (matchKey(key, ['毛重'])) {
new Promise(resolve => { logistics.grossWeight = parseWeight(val);
let timer; if (logistics.grossWeight) logistics.grossWeight.source = 'attributes';
const reset = () => { clearTimeout(timer); timer = setTimeout(resolve, 1000); }; } else if (matchKey(key, ['净重'])) {
const observer = new PerformanceObserver(() => reset()); logistics.netWeight = parseWeight(val);
observer.observe({ entryTypes: ['resource'] }); if (logistics.netWeight) logistics.netWeight.source = 'attributes';
reset(); } else if (matchKey(key, ['包装重量'])) {
}) logistics.packageWeight = parseWeight(val);
`); if (logistics.packageWeight) logistics.packageWeight.source = 'attributes';
} else if (matchKey(key, WEIGHT_KEYS)) {
// Extract window.context.result.data logistics.weight = parseWeight(val);
let productPackInfo: unknown = null; if (logistics.weight) logistics.weight.source = 'attributes';
let windowContext: unknown = null; }
const ctx = await cdp.evaluate(` if (matchKey(key, DIMENSION_KEYS)) {
(function() { logistics.dimensions = parseDimensions(val);
try { if (logistics.dimensions) logistics.dimensions.source = 'attributes';
const d = window.context && window.context.result && window.context.result.data; }
if (d && d.productPackInfo) { if (matchKey(key, VOLUME_KEYS)) {
return JSON.stringify({ logistics.volume = parseVolume(val);
productPackInfo: d.productPackInfo, if (logistics.volume) logistics.volume.source = 'attributes';
productTitle: d.productTitle || null, }
productAttributes: d.productAttributes || null, if (matchKey(key, ['产地', '发货地', '所在地'])) {
skuSelection: d.skuSelection || null, logistics.origin = val;
}); }
}
} catch(e) {}
return null;
})()
`);
if (ctx) {
const parsed = JSON.parse(ctx);
productPackInfo = parsed.productPackInfo;
windowContext = parsed;
} }
const outputDir = path.join('/tmp', '1688-logistics', offerId); const offerId = extractOfferId(url);
const imgDir = path.join('/tmp', '1688-logistics', offerId);
// Capture full-page screenshots (scrolling) const detailImages = await downloadImages(imgUrls, imgDir);
const screenshotDir = path.join(outputDir, 'screenshots');
const screenshots = await scrollAndCapture(cdp, screenshotDir);
// Download detail images
const imgDir = path.join(outputDir, 'images');
const detailImages = await downloadDetailImages(cdp, imgDir);
return { return {
status: 'success', url, command, dryRun, offerId, status: 'success', url, command, dryRun,
productPackInfo, product: { title, logistics, variants },
windowContext,
screenshots,
detailImages, detailImages,
rawAttributes,
}; };
} catch (error) { } catch (error) {
return { return {
status: 'failed', url, command, dryRun, offerId, status: 'failed', url, command, dryRun,
error: error instanceof Error ? error.message : String(error), error: error instanceof Error ? error.message : String(error),
}; };
} finally { } finally {