Compare commits

..

No commits in common. "main" and "v0.0.5" have entirely different histories.
main ... v0.0.5

2 changed files with 426 additions and 234 deletions

129
SKILL.md
View File

@ -1,97 +1,90 @@
--- ---
name: 1688-logistics-scraper name: 1688-logistics-scraper
description: "Scrape 1688 product pages via Chrome, capture full-page screenshots and detail images for vision-based extraction of weight/size data. Use when the user provides a 1688 product URL and needs logistics specs." description: "Extract product weight/size/logistics data from 1688 product pages via Chrome browser, output structured JSON. Use when the user provides a 1688 product URL and needs logistics specs."
--- ---
# 1688 Logistics Scraper # 1688 Logistics Scraper
Capture 1688 product pages and extract weight/size data via vision. Extract product weight, size, and logistics data from 1688 product pages.
## Run ## Run
```bash ```bash
bun scripts/run.ts scrape <url> [--dry-run] [--port=9222] bun scripts/run.ts scrape <url> [--dry-run]
```
### Examples
```bash
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html'
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dry-run
``` ```
## What It Does ## What It Does
1. Opens the 1688 product URL in the browser (default port 18800) 1. Opens the 1688 product URL in the browser (port 18800)
2. Scrolls through the entire page, capturing full-page screenshots 2. Extracts weight/size data from wherever it appears on the page — product attributes, variant specs, 包装信息, 商品件重尺 table, logistics section
3. Downloads all product detail images 3. Downloads detail images (商品详情图片) for analysis — weight/size is often only in images
4. Saves to `/tmp/1688-logistics/<offer-id>/` 4. Outputs structured JSON
## After Running — MUST follow ## Where To Look For Data
Read ALL screenshots and detail images, then output the following JSON structure. This is the final output for API consumption. Weight/size data on 1688 pages hides in multiple places. Check all before giving up:
1. **Product attributes** (商品属性 / 商品参数) — key-value table, most reliable
2. **商品件重尺 table** — dedicated weight/dimensions/volume table for logistics
3. **包装信息 section** — packaging type, box weight, box dimensions, units per box
4. **Variant/SKU specs** — per-variant weight or size
5. **Logistics section** — shipping weight, volume, freight info
6. **Detail images** — downloaded to `/tmp/1688-logistics/<offer-id>/`, read them to find weight/size text baked into images
## Output
```json ```json
{ {
"offerId": "966107271425", "status": "success",
"url": "https://detail.1688.com/offer/966107271425.html", "url": "https://detail.1688.com/offer/...",
"title": "商品标题", "product": {
"weight": { "title": "产品标题",
"value": 0.15, "logistics": {
"unit": "kg", "weight": { "value": 0.5, "unit": "kg", "source": "attributes" },
"source": "商品属性" "dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "attributes" },
"grossWeight": null,
"netWeight": null,
"packageWeight": { "value": 2.0, "unit": "kg", "source": "packageInfo" },
"volume": null,
"shippingMethod": null,
"shippingCost": null,
"origin": null
}, },
"grossWeight": {
"value": 0.2,
"unit": "kg",
"source": "商品件重尺"
},
"netWeight": {
"value": 0.15,
"unit": "kg",
"source": "商品属性"
},
"dimensions": {
"length": 10,
"width": 8,
"height": 1.8,
"unit": "cm",
"source": "商品属性"
},
"volume": {
"value": 0.000144,
"unit": "m³",
"source": "商品件重尺"
},
"packageWeight": {
"value": 5.0,
"unit": "kg",
"source": "包装信息"
},
"packageDimensions": {
"length": 40,
"width": 30,
"height": 20,
"unit": "cm",
"source": "包装信息"
},
"unitsPerPackage": 50,
"variants": [ "variants": [
{ { "name": "颜色: 红色", "weight": null, "dimensions": null }
"name": "12支装", ],
"weight": { "value": 0.12, "unit": "kg" }, "packageInfo": {
"dimensions": { "length": 9.5, "width": 6, "height": 2.2, "unit": "cm" } "packagingType": "纸箱",
"packagingWeight": { "value": 2.0, "unit": "kg", "source": "packageInfo" },
"packagingDimensions": { "length": 40, "width": 30, "height": 20, "unit": "cm", "source": "packageInfo" },
"unitsPerPackage": 50,
"raw": { "包装方式": "纸箱", "箱规": "40*30*20cm", "装箱数": "50" }
},
"pieceWeightSize": {
"weight": { "value": 0.5, "unit": "kg", "source": "pieceWeightSize" },
"dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "pieceWeightSize" },
"volume": null,
"raw": { "重量": "500g", "尺寸": "30*20*10cm" }
} }
] },
"detailImages": ["/tmp/1688-logistics/852504650877/img_001.jpg"],
"rawAttributes": { "重量": "0.5kg", "尺寸": "30*20*10cm" }
} }
``` ```
### Field rules `null` = not found in text. Check `detailImages` — the data may be in the images.
- **All weight values normalized to kg** (克÷1000, 斤×0.5)
- **All dimension values normalized to cm** (mm÷10)
- **`source`**: where on the page the data was found (商品属性 / 商品件重尺 / 包装信息 / 详情图片)
- **`variants`**: only include if weight/size differs per SKU. Omit if all variants share the same specs.
- **Omit fields that are `null`** — do not include fields where no data was found
- **Do not guess.** Only include values actually visible on the page or in images.
## Rules ## Rules
1. If the browser is not running, report the error. Do not try to launch it. 1. If the browser is not running, report the error. Do not try to launch it.
2. No retries. If it fails, report as-is. 2. Check all data sources before reporting `null`.
3. Read ALL screenshots — logistics data can appear anywhere on the page. 3. Normalize units: 克→kg, 毫米→cm. Keep raw values in `rawAttributes` and `raw` fields.
4. Read detail images too — weight/size is often baked into product photos. 4. No retries. If it fails, report as-is.
5. Output ONLY the structured JSON above. No extra commentary. 5. Trust page content. Do not guess values.

View File

@ -3,16 +3,67 @@ import * as path from 'path';
export type Command = 'scrape'; export type Command = 'scrape';
/**
 * A single scalar measurement (weight or volume) together with a label
 * saying where on the page it was found.
 */
export interface LogisticsValue {
// Numeric magnitude of the measurement. The type allows `null`.
value: number | null;
// Unit label that goes with `value` (e.g. "kg"). The type allows `null`.
unit: string | null;
// Provenance label. The parsers in this file create values with an empty
// string here, and the caller then overwrites it with a tag such as
// 'attributes', 'variant', 'packageInfo' or 'pieceWeightSize'.
source: string;
}
/**
 * A three-axis size (length x width x height) sharing one unit, together
 * with a label saying where on the page it was found.
 */
export interface Dimensions {
// First number of an "L*W*H" style value.
length: number | null;
// Second number of an "L*W*H" style value.
width: number | null;
// Third number of an "L*W*H" style value.
height: number | null;
// Unit label shared by all three axes (e.g. "cm"). The type allows `null`.
unit: string | null;
// Provenance label; starts as an empty string and is filled in by the caller
// with a tag such as 'attributes', 'variant', 'packageInfo' or 'pieceWeightSize'.
source: string;
}
/**
 * Consolidated logistics summary for one product page.
 * Every field is `null` when nothing matching was found.
 */
export interface LogisticsData {
// Generic product weight (any weight-like attribute that is not gross/net/package weight).
weight: LogisticsValue | null;
// Product dimensions.
dimensions: Dimensions | null;
// Gross weight (attribute key containing 毛重).
grossWeight: LogisticsValue | null;
// Net weight (attribute key containing 净重).
netWeight: LogisticsValue | null;
// Packaging weight (attribute key containing 包装重量).
packageWeight: LogisticsValue | null;
// Product volume.
volume: LogisticsValue | null;
// NOTE(review): not assigned anywhere in the code visible in this view — confirm whether it is populated elsewhere.
shippingMethod: string | null;
// NOTE(review): not assigned anywhere in the code visible in this view — confirm whether it is populated elsewhere.
shippingCost: string | null;
// Raw text of an origin-like attribute (key containing 产地 / 发货地 / 所在地).
origin: string | null;
}
/**
 * Weight and size information attached to one SKU / variant entry of the page.
 */
export interface VariantInfo {
// Display text of the variant element.
name: string;
// Weight parsed out of the variant's text, or `null` when no weight pattern was present.
weight: LogisticsValue | null;
// Dimensions parsed out of the variant's text, or `null` when no L*W*H pattern was present.
dimensions: Dimensions | null;
}
/**
 * Data taken from the page's packaging section (包装信息).
 * Parsed fields are `null` when the corresponding key was not found or could not be parsed.
 */
export interface PackageInfo {
// Raw text describing how the product is packed (e.g. "纸箱").
packagingType: string | null;
// Weight of a packed box.
packagingWeight: LogisticsValue | null;
// Outer dimensions of a packed box.
packagingDimensions: Dimensions | null;
// Number of units per box, as an integer.
unitsPerPackage: number | null;
// The unparsed key/value pairs exactly as scraped, kept for reference.
raw: Record<string, string>;
}
/**
 * Data taken from the page's piece weight/size table (商品件重尺).
 * Parsed fields are `null` when the corresponding key was not found or could not be parsed.
 */
export interface PieceWeightSize {
// Per-piece weight.
weight: LogisticsValue | null;
// Per-piece dimensions.
dimensions: Dimensions | null;
// Per-piece volume.
volume: LogisticsValue | null;
// The unparsed key/value pairs exactly as scraped, kept for reference.
raw: Record<string, string>;
}
export interface ScrapeResult { export interface ScrapeResult {
status: 'success' | 'failed'; status: 'success' | 'failed';
url: string; url: string;
command: Command; command: Command;
dryRun: boolean; dryRun: boolean;
offerId: string; product?: {
productPackInfo?: unknown; title: string;
windowContext?: unknown; logistics: LogisticsData;
screenshots?: string[]; variants: VariantInfo[];
packageInfo: PackageInfo;
pieceWeightSize: PieceWeightSize;
};
detailImages?: string[]; detailImages?: string[];
rawAttributes?: Record<string, string>;
error?: string; error?: string;
} }
@ -28,10 +79,9 @@ class CdpSession {
private ws!: WebSocket; private ws!: WebSocket;
private msgId = 0; private msgId = 0;
private pending = new Map<number, { resolve: (v: any) => void; reject: (e: Error) => void }>(); private pending = new Map<number, { resolve: (v: any) => void; reject: (e: Error) => void }>();
private eventListeners = new Map<string, Array<(params: any) => void>>();
static async connect(port: number): Promise<CdpSession> { static async connect(port: number): Promise<CdpSession> {
const resp = await fetch(`http://localhost:${port}/json`); const resp = await fetch(`http://127.0.0.1:${port}/json`);
const targets = (await resp.json()) as Array<{ webSocketDebuggerUrl: string; type: string }>; const targets = (await resp.json()) as Array<{ webSocketDebuggerUrl: string; type: string }>;
const page = targets.find(t => t.type === 'page'); const page = targets.find(t => t.type === 'page');
if (!page) throw new Error('No Chrome page tab found. Open a tab first.'); if (!page) throw new Error('No Chrome page tab found. Open a tab first.');
@ -46,20 +96,13 @@ class CdpSession {
this.ws.onopen = () => resolve(); this.ws.onopen = () => resolve();
this.ws.onerror = (e: any) => reject(new Error(`WebSocket error: ${e.message || e}`)); this.ws.onerror = (e: any) => reject(new Error(`WebSocket error: ${e.message || e}`));
this.ws.onmessage = (ev: MessageEvent) => { this.ws.onmessage = (ev: MessageEvent) => {
const msg = JSON.parse(typeof ev.data === 'string' ? ev.data : ev.data.toString()); const msg: CdpResult = JSON.parse(typeof ev.data === 'string' ? ev.data : ev.data.toString());
// Handle command responses
if (msg.id != null && this.pending.has(msg.id)) { if (msg.id != null && this.pending.has(msg.id)) {
const p = this.pending.get(msg.id)!; const p = this.pending.get(msg.id)!;
this.pending.delete(msg.id); this.pending.delete(msg.id);
if (msg.error) p.reject(new Error(msg.error.message)); if (msg.error) p.reject(new Error(msg.error.message));
else p.resolve(msg.result); else p.resolve(msg.result);
} }
// Handle events
if (msg.method && this.eventListeners.has(msg.method)) {
for (const fn of this.eventListeners.get(msg.method)!) {
fn(msg.params);
}
}
}; };
}); });
} }
@ -72,117 +115,218 @@ class CdpSession {
}); });
} }
waitForEvent(event: string, timeoutMs: number = 30000): Promise<any> {
return new Promise((resolve, reject) => {
const timer = setTimeout(() => {
cleanup();
reject(new Error(`Timeout waiting for ${event}`));
}, timeoutMs);
const handler = (params: any) => {
cleanup();
resolve(params);
};
const cleanup = () => {
clearTimeout(timer);
const listeners = this.eventListeners.get(event);
if (listeners) {
const idx = listeners.indexOf(handler);
if (idx >= 0) listeners.splice(idx, 1);
}
};
if (!this.eventListeners.has(event)) this.eventListeners.set(event, []);
this.eventListeners.get(event)!.push(handler);
});
}
async evaluate(expression: string): Promise<any> { async evaluate(expression: string): Promise<any> {
const res = await this.send('Runtime.evaluate', { expression, returnByValue: true }); const res = await this.send('Runtime.evaluate', { expression, returnByValue: true });
return res?.result?.value; return res?.result?.value;
} }
async captureScreenshot(format: string = 'png'): Promise<Buffer> {
const res = await this.send('Page.captureScreenshot', {
format,
captureBeyondViewport: false,
});
return Buffer.from(res.data, 'base64');
}
close() { close() {
try { this.ws.close(); } catch {} try { this.ws.close(); } catch {}
} }
} }
// --- Helpers --- // --- Parsers ---
// Keyword lists used to classify scraped attribute names. A name is treated as
// belonging to a category when it contains any of the category's keywords
// (the comparison is a case-insensitive substring match, see matchKey).

// Attribute-name fragments that indicate a weight value.
const WEIGHT_KEYS = ['重量', '毛重', '净重', '单件重量', '包装重量', '产品重量', '单品重量', 'weight'];
// Attribute-name fragments that indicate a size / dimensions value.
const DIMENSION_KEYS = ['尺寸', '规格', '长宽高', '外箱尺寸', '包装尺寸', '产品尺寸', '大小', 'size', 'dimensions'];
// Attribute-name fragments that indicate a volume value.
const VOLUME_KEYS = ['体积', '容积', 'volume'];
function extractOfferId(url: string): string { function extractOfferId(url: string): string {
return url.match(/offer\/(\d+)/)?.[1] || 'unknown'; return url.match(/offer\/(\d+)/)?.[1] || 'unknown';
} }
async function scrollAndCapture( function parseWeight(raw: string): LogisticsValue | null {
cdp: CdpSession, const m = raw.match(/([\d.]+)\s*(kg|g|克|千克|公斤|斤)/i);
outputDir: string, if (!m) return null;
): Promise<string[]> { let value = parseFloat(m[1]);
fs.mkdirSync(outputDir, { recursive: true }); let unit = m[2].toLowerCase();
const saved: string[] = []; if (unit === 'g' || unit === '克') { value /= 1000; unit = 'kg'; }
if (unit === '千克' || unit === '公斤') unit = 'kg';
// Get page height if (unit === '斤') { value *= 0.5; unit = 'kg'; }
const pageHeight: number = await cdp.evaluate( return { value, unit, source: '' };
'Math.max(document.body.scrollHeight, document.documentElement.scrollHeight)'
) || 0;
const viewportHeight: number = await cdp.evaluate('window.innerHeight') || 900;
// Scroll through the page and capture viewport-sized screenshots
// Use 80% step to overlap slightly and avoid missing content at boundaries
const step = Math.floor(viewportHeight * 0.8);
let scrollY = 0;
let idx = 1;
while (scrollY < pageHeight) {
await cdp.evaluate(`window.scrollTo(0, ${scrollY})`);
await new Promise(r => setTimeout(r, 800)); // wait for lazy-load render
const buf = await cdp.captureScreenshot('png');
const filePath = path.join(outputDir, `page_${String(idx).padStart(3, '0')}.png`);
fs.writeFileSync(filePath, buf);
saved.push(filePath);
scrollY += step;
idx++;
} }
return saved; function parseDimensions(raw: string): Dimensions | null {
const m = raw.match(/([\d.]+)\s*[*xX×]\s*([\d.]+)\s*[*xX×]\s*([\d.]+)\s*(cm|mm|毫米|厘米|m|米)?/i);
if (!m) return null;
let [l, w, h] = [parseFloat(m[1]), parseFloat(m[2]), parseFloat(m[3])];
let unit = (m[4] || 'cm').toLowerCase();
if (unit === 'mm' || unit === '毫米') { l /= 10; w /= 10; h /= 10; unit = 'cm'; }
if (unit === '厘米') unit = 'cm';
if (unit === 'm' || unit === '米') { l *= 100; w *= 100; h *= 100; unit = 'cm'; }
return { length: l, width: w, height: h, unit, source: '' };
} }
async function downloadDetailImages( function parseVolume(raw: string): LogisticsValue | null {
cdp: CdpSession, const m = raw.match(/([\d.]+)\s*(m³|cm³|L|ml|升|毫升|立方米|立方厘米)/i);
outputDir: string, if (!m) return null;
): Promise<string[]> { return { value: parseFloat(m[1]), unit: m[2], source: '' };
// Get all detail image URLs from the page }
const imgUrls: string[] = JSON.parse(await cdp.evaluate(`
/**
 * Reports whether `text` contains at least one of the given keywords.
 *
 * The comparison ignores letter case on both sides and is a plain substring
 * test, so a keyword matches anywhere inside `text`.
 *
 * @param text - The label to inspect (e.g. a scraped attribute name).
 * @param keys - Candidate keywords; an empty list never matches.
 * @returns `true` as soon as one keyword is found inside `text`, otherwise `false`.
 */
function matchKey(text: string, keys: string[]): boolean {
  const haystack = text.toLowerCase();
  for (const candidate of keys) {
    if (haystack.includes(candidate.toLowerCase())) {
      return true;
    }
  }
  return false;
}
// --- Page extraction ---
/**
 * Browser-side script (an IIFE kept as a string) that collects the page's
 * key/value attributes and returns them as a JSON-encoded object.
 *
 * It works in two passes over the DOM:
 *  1. For each selector in its list, every matching element's trimmed text is
 *     split on ':'; the part before the first ':' becomes the key and the rest
 *     (re-joined with ':') becomes the value. Elements without a ':' are ignored.
 *  2. For every table row with at least two cells, the first cell's text is the
 *     key and the second cell's text is the value.
 * Later writes overwrite earlier ones when the same key is seen again.
 *
 * NOTE(review): only the ASCII ':' is treated as a separator; text using a
 * full-width colon would not be split — confirm against real pages.
 */
const JS_EXTRACT_ATTRS = `
(function() {
const attrs = {};
const sels = [
'.detail-attributes-list .attributes-item',
'.obj-leading .obj-content li',
'#mod-detail-attributes .attribute-item',
'.detail-info table tr',
'[class*="attribute"] li',
'[class*="param"] li',
'.offer-attr-list .offer-attr-item',
];
for (const sel of sels) {
document.querySelectorAll(sel).forEach(el => {
const parts = el.textContent.trim().split(/[:]/);
if (parts.length >= 2) attrs[parts[0].trim()] = parts.slice(1).join(':').trim();
});
}
document.querySelectorAll('table tr, .detail-attributes-list tr').forEach(tr => {
const cells = tr.querySelectorAll('td, th');
if (cells.length >= 2) attrs[cells[0].textContent.trim()] = cells[1].textContent.trim();
});
return JSON.stringify(attrs);
})()`;
/**
 * Browser-side script (an IIFE kept as a string) that lists the page's
 * SKU / variant elements and returns them as a JSON-encoded array of
 * `{ name, text }` objects.
 *
 * For each selector in its list, every matching element contributes one entry:
 * `name` is the element text trimmed with whitespace runs collapsed to a single
 * space, and `text` is the element's unmodified textContent. Entries whose
 * `name` is empty or 200+ characters long are dropped.
 *
 * The script does no de-duplication, so an element that matches more than one
 * selector is emitted once per matching selector.
 */
const JS_EXTRACT_VARIANTS = `
(function() {
const variants = [];
const sels = [
'.sku-item-wrapper .sku-item',
'[class*="sku"] [class*="item"]',
'.obj-sku .obj-content li',
'.unit-detail-spec-operator .spec-item',
];
for (const sel of sels) {
document.querySelectorAll(sel).forEach(el => {
const name = el.textContent.trim().replace(/\\s+/g, ' ');
if (name && name.length < 200) variants.push({ name, text: el.textContent });
});
}
return JSON.stringify(variants);
})()`;
/**
 * Browser-side script (an IIFE kept as a string) that returns the product
 * title as a plain string (not JSON-encoded).
 *
 * It tries a fixed list of title selectors in order and returns the trimmed
 * text of the first element that exists and is non-empty. When none match it
 * falls back to `document.title`, or to an empty string if that is empty too.
 */
const JS_EXTRACT_TITLE = `
(function() {
for (const sel of ['.title-text','.detail-title-text','h1[class*="title"]','.mod-detail-title h1','.d-title']) {
const el = document.querySelector(sel);
if (el && el.textContent.trim()) return el.textContent.trim();
}
return document.title || '';
})()`;
const JS_EXTRACT_IMAGES = `
(function() { (function() {
const imgs = [], seen = new Set(); const imgs = [], seen = new Set();
document.querySelectorAll('img').forEach(img => { const sels = [
'#desc-lazyload-container img',
'.detail-desc-decorate-richtext img',
'[class*="detail-desc"] img',
'.mod-detail-description img',
'.offer-attr-item img',
'.desc-img-loaded img',
];
for (const sel of sels) {
document.querySelectorAll(sel).forEach(img => {
const src = img.src || img.dataset.src || img.dataset.lazySrc || ''; const src = img.src || img.dataset.src || img.dataset.lazySrc || '';
if (src && !seen.has(src) && (src.startsWith('http') || src.startsWith('//'))) { if (src && !seen.has(src) && (src.startsWith('http') || src.startsWith('//'))) {
// Filter for product detail images (skip tiny icons/avatars)
if (img.naturalWidth > 200 || img.width > 200 || !img.complete) {
seen.add(src); seen.add(src);
imgs.push(src.startsWith('//') ? 'https:' + src : src); imgs.push(src.startsWith('//') ? 'https:' + src : src);
} }
});
}
return JSON.stringify(imgs);
})()`;
const JS_EXTRACT_PACKAGE_INFO = `
(function() {
const data = {};
// 包装信息 section — various selector patterns on 1688
const sels = [
'[class*="package-info"] li',
'[class*="packaging"] li',
'[class*="pack-info"] li',
'[class*="baozhuang"] li',
'.detail-packing li',
];
for (const sel of sels) {
document.querySelectorAll(sel).forEach(el => {
const parts = el.textContent.trim().split(/[:]/);
if (parts.length >= 2) data[parts[0].trim()] = parts.slice(1).join(':').trim();
});
}
// Also look for table rows inside 包装 sections
document.querySelectorAll('table').forEach(table => {
const header = table.previousElementSibling;
if (header && /包装/.test(header.textContent || '')) {
table.querySelectorAll('tr').forEach(tr => {
const cells = tr.querySelectorAll('td, th');
if (cells.length >= 2) data[cells[0].textContent.trim()] = cells[1].textContent.trim();
});
} }
}); });
return JSON.stringify(imgs); // Scan all key-value pairs for 包装 related keys
})() document.querySelectorAll('[class*="attribute"] li, [class*="param"] li, .offer-attr-list .offer-attr-item').forEach(el => {
`) || '[]'); const text = el.textContent.trim();
if (/包装/.test(text)) {
const parts = text.split(/[:]/);
if (parts.length >= 2) data[parts[0].trim()] = parts.slice(1).join(':').trim();
}
});
return JSON.stringify(data);
})()`;
/**
 * Browser-side script (an IIFE kept as a string) that collects key/value pairs
 * from the page's piece weight/size (商品件重尺) and logistics areas and returns
 * them as a JSON-encoded object.
 *
 * It makes three passes, all writing into the same object (later writes
 * overwrite earlier ones for the same key):
 *  1. Table rows under a list of class-name-based selectors: first cell is the
 *     key, second cell is the value (rows with fewer than two cells are skipped).
 *  2. Every table whose immediately preceding sibling element has text
 *     matching 件重尺, 物流 or 运费: rows are read the same way.
 *  3. "item" elements inside logistics / freight containers: the trimmed text
 *     is split on ':'; the part before the first ':' is the key and the rest
 *     (re-joined with ':') is the value.
 *
 * NOTE(review): pass 3 only recognises the ASCII ':' as a separator — confirm
 * whether the target pages use a full-width colon.
 */
const JS_EXTRACT_PIECE_WEIGHT_SIZE = `
(function() {
const data = {};
// 商品件重尺 table — dedicated logistics spec table on 1688
const sels = [
'[class*="piece-weight"] tr',
'[class*="jianzhongchi"] tr',
'[class*="weight-size"] tr',
'[class*="logistics-info"] tr',
'[class*="freight-info"] tr',
];
for (const sel of sels) {
document.querySelectorAll(sel).forEach(tr => {
const cells = tr.querySelectorAll('td, th');
if (cells.length >= 2) data[cells[0].textContent.trim()] = cells[1].textContent.trim();
});
}
// Scan tables preceded by headers containing 件重尺
document.querySelectorAll('table').forEach(table => {
const header = table.previousElementSibling;
if (header && /件重尺|物流|运费/.test(header.textContent || '')) {
table.querySelectorAll('tr').forEach(tr => {
const cells = tr.querySelectorAll('td, th');
if (cells.length >= 2) data[cells[0].textContent.trim()] = cells[1].textContent.trim();
});
}
});
// Also check spans/divs in logistics area
document.querySelectorAll('[class*="logistics"] [class*="item"], [class*="freight"] [class*="item"]').forEach(el => {
const parts = el.textContent.trim().split(/[:]/);
if (parts.length >= 2) data[parts[0].trim()] = parts.slice(1).join(':').trim();
});
return JSON.stringify(data);
})()`;
async function downloadImages(urls: string[], outputDir: string): Promise<string[]> {
fs.mkdirSync(outputDir, { recursive: true }); fs.mkdirSync(outputDir, { recursive: true });
const saved: string[] = []; const saved: string[] = [];
for (let i = 0; i < imgUrls.length; i++) { for (let i = 0; i < urls.length; i++) {
try { try {
const resp = await fetch(imgUrls[i]); const resp = await fetch(urls[i]);
if (!resp.ok) continue; if (!resp.ok) continue;
const buf = Buffer.from(await resp.arrayBuffer()); const buf = Buffer.from(await resp.arrayBuffer());
const ext = imgUrls[i].match(/\.(jpg|jpeg|png|webp|gif)/i)?.[1] || 'jpg'; const ext = urls[i].match(/\.(jpg|jpeg|png|webp|gif)/i)?.[1] || 'jpg';
const p = path.join(outputDir, `img_${String(i + 1).padStart(3, '0')}.${ext}`); const p = path.join(outputDir, `img_${String(i + 1).padStart(3, '0')}.${ext}`);
fs.writeFileSync(p, buf); fs.writeFileSync(p, buf);
saved.push(p); saved.push(p);
@ -200,21 +344,29 @@ export async function run(
cdpPort: number = 18800, cdpPort: number = 18800,
): Promise<ScrapeResult> { ): Promise<ScrapeResult> {
if (command !== 'scrape') { if (command !== 'scrape') {
return { status: 'failed', url: '', command, dryRun, offerId: '', error: `unknown command: ${command}` }; return { status: 'failed', url: '', command, dryRun, error: `unknown command: ${command}` };
} }
const url = args[0]; const url = args[0];
if (!url) { if (!url) {
return { status: 'failed', url: '', command, dryRun, offerId: '', error: 'scrape requires <url>' }; return { status: 'failed', url: '', command, dryRun, error: 'scrape requires <url>' };
} }
const offerId = extractOfferId(url);
if (dryRun) { if (dryRun) {
return { return {
status: 'success', url, command, dryRun, offerId, status: 'success', url, command, dryRun,
screenshots: [], product: {
title: '<dry-run>',
logistics: {
weight: null, dimensions: null, grossWeight: null, netWeight: null,
packageWeight: null, volume: null, shippingMethod: null, shippingCost: null, origin: null,
},
variants: [],
packageInfo: { packagingType: null, packagingWeight: null, packagingDimensions: null, unitsPerPackage: null, raw: {} },
pieceWeightSize: { weight: null, dimensions: null, volume: null, raw: {} },
},
detailImages: [], detailImages: [],
rawAttributes: {},
}; };
} }
@ -224,76 +376,123 @@ export async function run(
await cdp.send('Page.enable'); await cdp.send('Page.enable');
await cdp.send('Runtime.enable'); await cdp.send('Runtime.enable');
// Set wide PC viewport to ensure tables fit without horizontal overflow
await cdp.send('Emulation.setDeviceMetricsOverride', {
width: 1920,
height: 1080,
deviceScaleFactor: 2,
mobile: false,
});
// Navigate and wait for page load event
const loadPromise = cdp.waitForEvent('Page.loadEventFired', 30000);
await cdp.send('Page.navigate', { url }); await cdp.send('Page.navigate', { url });
await loadPromise;
// Wait for networkIdle — poll until no pending requests for 1s // Wait for load
await cdp.evaluate(` await new Promise(r => setTimeout(r, 5000));
new Promise(resolve => {
let timer;
const reset = () => { clearTimeout(timer); timer = setTimeout(resolve, 1000); };
const observer = new PerformanceObserver(() => reset());
observer.observe({ entryTypes: ['resource'] });
reset();
})
`);
// Extract window.context.result.data const title: string = await cdp.evaluate(JS_EXTRACT_TITLE) || '';
let productPackInfo: unknown = null; const rawAttributes: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_ATTRS) || '{}');
let windowContext: unknown = null; const rawVariants: Array<{ name: string; text: string }> = JSON.parse(await cdp.evaluate(JS_EXTRACT_VARIANTS) || '[]');
const ctx = await cdp.evaluate(` const imgUrls: string[] = JSON.parse(await cdp.evaluate(JS_EXTRACT_IMAGES) || '[]');
(function() {
try { const variants: VariantInfo[] = rawVariants.map(v => {
const d = window.context && window.context.result && window.context.result.data; const weight = parseWeight(v.text);
if (d && d.productPackInfo) { const dimensions = parseDimensions(v.text);
return JSON.stringify({ if (weight) weight.source = 'variant';
productPackInfo: d.productPackInfo, if (dimensions) dimensions.source = 'variant';
productTitle: d.productTitle || null, return { name: v.name, weight, dimensions };
productAttributes: d.productAttributes || null,
skuSelection: d.skuSelection || null,
}); });
const logistics: LogisticsData = {
weight: null, dimensions: null, grossWeight: null, netWeight: null,
packageWeight: null, volume: null, shippingMethod: null, shippingCost: null, origin: null,
};
for (const [key, val] of Object.entries(rawAttributes)) {
if (matchKey(key, ['毛重'])) {
logistics.grossWeight = parseWeight(val);
if (logistics.grossWeight) logistics.grossWeight.source = 'attributes';
} else if (matchKey(key, ['净重'])) {
logistics.netWeight = parseWeight(val);
if (logistics.netWeight) logistics.netWeight.source = 'attributes';
} else if (matchKey(key, ['包装重量'])) {
logistics.packageWeight = parseWeight(val);
if (logistics.packageWeight) logistics.packageWeight.source = 'attributes';
} else if (matchKey(key, WEIGHT_KEYS)) {
logistics.weight = parseWeight(val);
if (logistics.weight) logistics.weight.source = 'attributes';
}
if (matchKey(key, DIMENSION_KEYS)) {
logistics.dimensions = parseDimensions(val);
if (logistics.dimensions) logistics.dimensions.source = 'attributes';
}
if (matchKey(key, VOLUME_KEYS)) {
logistics.volume = parseVolume(val);
if (logistics.volume) logistics.volume.source = 'attributes';
}
if (matchKey(key, ['产地', '发货地', '所在地'])) {
logistics.origin = val;
} }
} catch(e) {}
return null;
})()
`);
if (ctx) {
const parsed = JSON.parse(ctx);
productPackInfo = parsed.productPackInfo;
windowContext = parsed;
} }
const outputDir = path.join('/tmp', '1688-logistics', offerId); // Extract 包装信息
const rawPkgInfo: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_PACKAGE_INFO) || '{}');
const packageInfo: PackageInfo = {
packagingType: null,
packagingWeight: null,
packagingDimensions: null,
unitsPerPackage: null,
raw: rawPkgInfo,
};
for (const [key, val] of Object.entries(rawPkgInfo)) {
if (matchKey(key, ['包装方式', '包装类型', '包装形式'])) packageInfo.packagingType = val;
if (matchKey(key, ['包装重量', '箱重'])) {
packageInfo.packagingWeight = parseWeight(val);
if (packageInfo.packagingWeight) packageInfo.packagingWeight.source = 'packageInfo';
}
if (matchKey(key, ['包装尺寸', '外箱尺寸', '箱规'])) {
packageInfo.packagingDimensions = parseDimensions(val);
if (packageInfo.packagingDimensions) packageInfo.packagingDimensions.source = 'packageInfo';
}
if (matchKey(key, ['装箱数', '每箱数量', '入数'])) {
const n = parseInt(val, 10);
if (!isNaN(n)) packageInfo.unitsPerPackage = n;
}
}
// Capture full-page screenshots (scrolling) // Extract 商品件重尺
const screenshotDir = path.join(outputDir, 'screenshots'); const rawPws: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_PIECE_WEIGHT_SIZE) || '{}');
const screenshots = await scrollAndCapture(cdp, screenshotDir); const pieceWeightSize: PieceWeightSize = {
weight: null,
dimensions: null,
volume: null,
raw: rawPws,
};
for (const [key, val] of Object.entries(rawPws)) {
if (matchKey(key, WEIGHT_KEYS)) {
pieceWeightSize.weight = parseWeight(val);
if (pieceWeightSize.weight) pieceWeightSize.weight.source = 'pieceWeightSize';
}
if (matchKey(key, DIMENSION_KEYS)) {
pieceWeightSize.dimensions = parseDimensions(val);
if (pieceWeightSize.dimensions) pieceWeightSize.dimensions.source = 'pieceWeightSize';
}
if (matchKey(key, VOLUME_KEYS)) {
pieceWeightSize.volume = parseVolume(val);
if (pieceWeightSize.volume) pieceWeightSize.volume.source = 'pieceWeightSize';
}
}
// Download detail images // Backfill logistics from pieceWeightSize if not found in attributes
const imgDir = path.join(outputDir, 'images'); if (!logistics.weight && pieceWeightSize.weight) logistics.weight = pieceWeightSize.weight;
const detailImages = await downloadDetailImages(cdp, imgDir); if (!logistics.dimensions && pieceWeightSize.dimensions) logistics.dimensions = pieceWeightSize.dimensions;
if (!logistics.volume && pieceWeightSize.volume) logistics.volume = pieceWeightSize.volume;
if (!logistics.packageWeight && packageInfo.packagingWeight) logistics.packageWeight = packageInfo.packagingWeight;
const offerId = extractOfferId(url);
const imgDir = path.join('/tmp', '1688-logistics', offerId);
const detailImages = await downloadImages(imgUrls, imgDir);
return { return {
status: 'success', url, command, dryRun, offerId, status: 'success', url, command, dryRun,
productPackInfo, product: { title, logistics, variants, packageInfo, pieceWeightSize },
windowContext,
screenshots,
detailImages, detailImages,
rawAttributes,
}; };
} catch (error) { } catch (error) {
return { return {
status: 'failed', url, command, dryRun, offerId, status: 'failed', url, command, dryRun,
error: error instanceof Error ? error.message : String(error), error: error instanceof Error ? error.message : String(error),
}; };
} finally { } finally {