feat: default CDP port 18800, add 包装信息 and 商品件重尺 extraction
register-skill-release / register (push) Successful in 14s Details

- Change default CDP port from 9222 to 18800
- Extract 包装信息 section (packaging type, box weight/dims, units per box)
- Extract 商品件重尺 table (per-piece weight/dimensions/volume)
- Backfill logistics from pieceWeightSize/packageInfo when attributes missing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
ywkj 2026-03-30 08:20:46 +08:00
parent 9d147242e0
commit e48592690b
3 changed files with 173 additions and 15 deletions

View File

@ -22,8 +22,8 @@ bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dr
## What It Does ## What It Does
1. Opens the 1688 product URL in the browser 1. Opens the 1688 product URL in the browser (port 18800)
2. Extracts weight/size data from wherever it appears on the page — product attributes, variant specs, logistics section 2. Extracts weight/size data from wherever it appears on the page — product attributes, variant specs, 包装信息, 商品件重尺 table, logistics section
3. Downloads detail images (商品详情图片) for analysis — weight/size is often only in images 3. Downloads detail images (商品详情图片) for analysis — weight/size is often only in images
4. Outputs structured JSON 4. Outputs structured JSON
@ -32,9 +32,11 @@ bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dr
Weight/size data on 1688 pages hides in multiple places. Check all before giving up: Weight/size data on 1688 pages hides in multiple places. Check all before giving up:
1. **Product attributes** (商品属性 / 商品参数) — key-value table, most reliable 1. **Product attributes** (商品属性 / 商品参数) — key-value table, most reliable
2. **Variant/SKU specs** — per-variant weight or size 2. **商品件重尺 table** — dedicated weight/dimensions/volume table for logistics
3. **Logistics section** — shipping weight, volume, freight info 3. **包装信息 section** — packaging type, box weight, box dimensions, units per box
4. **Detail images** — downloaded to `/tmp/1688-logistics/<offer-id>/`, read them to find weight/size text baked into images 4. **Variant/SKU specs** — per-variant weight or size
5. **Logistics section** — shipping weight, volume, freight info
6. **Detail images** — downloaded to `/tmp/1688-logistics/<offer-id>/`, read them to find weight/size text baked into images
## Output ## Output
@ -49,7 +51,7 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
"dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "attributes" }, "dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "attributes" },
"grossWeight": null, "grossWeight": null,
"netWeight": null, "netWeight": null,
"packageWeight": null, "packageWeight": { "value": 2.0, "unit": "kg", "source": "packageInfo" },
"volume": null, "volume": null,
"shippingMethod": null, "shippingMethod": null,
"shippingCost": null, "shippingCost": null,
@ -57,7 +59,20 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
}, },
"variants": [ "variants": [
{ "name": "颜色: 红色", "weight": null, "dimensions": null } { "name": "颜色: 红色", "weight": null, "dimensions": null }
] ],
"packageInfo": {
"packagingType": "纸箱",
"packagingWeight": { "value": 2.0, "unit": "kg", "source": "packageInfo" },
"packagingDimensions": { "length": 40, "width": 30, "height": 20, "unit": "cm", "source": "packageInfo" },
"unitsPerPackage": 50,
"raw": { "包装方式": "纸箱", "箱重": "2kg", "箱规": "40*30*20cm", "装箱数": "50" }
},
"pieceWeightSize": {
"weight": { "value": 0.5, "unit": "kg", "source": "pieceWeightSize" },
"dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "pieceWeightSize" },
"volume": null,
"raw": { "重量": "500g", "尺寸": "30*20*10cm" }
}
}, },
"detailImages": ["/tmp/1688-logistics/852504650877/img_001.jpg"], "detailImages": ["/tmp/1688-logistics/852504650877/img_001.jpg"],
"rawAttributes": { "重量": "0.5kg", "尺寸": "30*20*10cm" } "rawAttributes": { "重量": "0.5kg", "尺寸": "30*20*10cm" }
@ -70,6 +85,6 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
1. If the browser is not running, report the error. Do not try to launch it. 1. If the browser is not running, report the error. Do not try to launch it.
2. Check all data sources before reporting `null`. 2. Check all data sources before reporting `null`.
3. Normalize units: 克→kg, 毫米→cm. Keep raw values in `rawAttributes`. 3. Normalize units: 克→kg, 毫米→cm. Keep raw values in `rawAttributes` and `raw` fields.
4. No retries. If it fails, report as-is. 4. No retries. If it fails, report as-is.
5. Trust page content. Do not guess values. 5. Trust page content. Do not guess values.

View File

@ -12,17 +12,14 @@ Commands:
Examples: Examples:
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html'
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dry-run bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dry-run
bun scripts/run.ts --port=9223 scrape 'https://detail.1688.com/offer/852504650877.html' bun scripts/run.ts --port=18801 scrape 'https://detail.1688.com/offer/852504650877.html'
Prerequisites:
Chrome must be running with --remote-debugging-port=9222
`); `);
} }
async function main(): Promise<void> { async function main(): Promise<void> {
const positionals: string[] = []; const positionals: string[] = [];
let dryRun = false; let dryRun = false;
let port = 9222; let port = 18800;
for (const arg of process.argv.slice(2)) { for (const arg of process.argv.slice(2)) {
if (arg === '--dry-run') { if (arg === '--dry-run') {

View File

@ -35,6 +35,21 @@ export interface VariantInfo {
dimensions: Dimensions | null; dimensions: Dimensions | null;
} }
/**
 * Structured view of the 包装信息 (packaging information) section of a product page.
 * Every parsed field is null when no matching key was found in `raw` or its value
 * could not be parsed.
 */
export interface PackageInfo {
/** Raw text of the packaging type entry (keys such as 包装方式 / 包装类型 / 包装形式). */
packagingType: string | null;
/** Weight parsed from keys such as 包装重量 / 箱重; its `source` is set to 'packageInfo'. */
packagingWeight: LogisticsValue | null;
/** Dimensions parsed from keys such as 包装尺寸 / 外箱尺寸 / 箱规; `source` is 'packageInfo'. */
packagingDimensions: Dimensions | null;
/** Integer parsed from keys such as 装箱数 / 每箱数量 / 入数. */
unitsPerPackage: number | null;
/** Unparsed key/value text exactly as returned by the in-page extraction script. */
raw: Record<string, string>;
}
/**
 * Structured view of the 商品件重尺 (per-piece weight / size) table of a product page.
 * Every parsed field is null when no matching key was found in `raw` or its value
 * could not be parsed; parsed values carry `source` = 'pieceWeightSize'.
 */
export interface PieceWeightSize {
/** Weight parsed from an entry whose key matches the weight key list. */
weight: LogisticsValue | null;
/** Dimensions parsed from an entry whose key matches the dimension key list. */
dimensions: Dimensions | null;
/** Volume parsed from an entry whose key matches the volume key list. */
volume: LogisticsValue | null;
/** Unparsed key/value text exactly as returned by the in-page extraction script. */
raw: Record<string, string>;
}
export interface ScrapeResult { export interface ScrapeResult {
status: 'success' | 'failed'; status: 'success' | 'failed';
url: string; url: string;
@ -44,6 +59,8 @@ export interface ScrapeResult {
title: string; title: string;
logistics: LogisticsData; logistics: LogisticsData;
variants: VariantInfo[]; variants: VariantInfo[];
packageInfo: PackageInfo;
pieceWeightSize: PieceWeightSize;
}; };
detailImages?: string[]; detailImages?: string[];
rawAttributes?: Record<string, string>; rawAttributes?: Record<string, string>;
@ -228,6 +245,79 @@ const JS_EXTRACT_IMAGES = `
return JSON.stringify(imgs); return JSON.stringify(imgs);
})()`; })()`;
/**
 * Script source evaluated inside the product page (it relies on the page's
 * `document`, so it is not run in Node directly). It collects 包装信息
 * (packaging information) key/value text and evaluates to a JSON-encoded
 * object string mapping label -> value.
 *
 * Three sources are scanned, later ones overwriting earlier ones on key clash:
 *  1. list items under packaging-like class names,
 *  2. rows of any table whose previous sibling element mentions 包装,
 *  3. attribute/param list items whose text mentions 包装.
 *
 * "label: value" text is split on the FIRST colon, accepting both the ASCII
 * colon and the full-width colon (U+FF1A) that Chinese page text commonly
 * uses; the value keeps any further colons untouched. Entries with an empty
 * label are skipped.
 */
const JS_EXTRACT_PACKAGE_INFO = `
(function() {
  const data = {};
  // ASCII colon or full-width colon (U+FF1A).
  const SEP = /[:\\uFF1A]/;

  // Record a "label: value" string, splitting on the first separator only.
  function addPair(text) {
    const t = (text || '').trim();
    const idx = t.search(SEP);
    if (idx < 0) return;
    const key = t.slice(0, idx).trim();
    if (key) data[key] = t.slice(idx + 1).trim();
  }

  // Record a table row: first cell is the label, second cell is the value.
  function addRow(tr) {
    const cells = tr.querySelectorAll('td, th');
    if (cells.length < 2) return;
    const key = (cells[0].textContent || '').trim();
    if (key) data[key] = (cells[1].textContent || '').trim();
  }

  // 包装信息 section — various selector patterns on 1688
  const sels = [
    '[class*="package-info"] li',
    '[class*="packaging"] li',
    '[class*="pack-info"] li',
    '[class*="baozhuang"] li',
    '.detail-packing li',
  ];
  for (const sel of sels) {
    document.querySelectorAll(sel).forEach(function(el) { addPair(el.textContent); });
  }

  // Also look for table rows inside 包装 sections
  document.querySelectorAll('table').forEach(function(table) {
    const header = table.previousElementSibling;
    if (header && /包装/.test(header.textContent || '')) {
      table.querySelectorAll('tr').forEach(function(tr) { addRow(tr); });
    }
  });

  // Scan all key-value pairs for 包装 related keys
  document.querySelectorAll('[class*="attribute"] li, [class*="param"] li, .offer-attr-list .offer-attr-item').forEach(function(el) {
    const text = (el.textContent || '').trim();
    if (/包装/.test(text)) addPair(text);
  });

  return JSON.stringify(data);
})()`;
/**
 * Script source evaluated inside the product page (it relies on the page's
 * `document`, so it is not run in Node directly). It collects 商品件重尺
 * (per-piece weight / size) key/value text and evaluates to a JSON-encoded
 * object string mapping label -> value.
 *
 * Three sources are scanned, later ones overwriting earlier ones on key clash:
 *  1. table rows under weight/size/logistics-like class names,
 *  2. rows of any table whose previous sibling element mentions 件重尺, 物流 or 运费,
 *  3. "item" elements nested under logistics/freight class names, read as
 *     "label: value" text.
 *
 * "label: value" text is split on the FIRST colon, accepting both the ASCII
 * colon and the full-width colon (U+FF1A) that Chinese page text commonly
 * uses; the value keeps any further colons untouched. Entries with an empty
 * label are skipped.
 */
const JS_EXTRACT_PIECE_WEIGHT_SIZE = `
(function() {
  const data = {};
  // ASCII colon or full-width colon (U+FF1A).
  const SEP = /[:\\uFF1A]/;

  // Record a "label: value" string, splitting on the first separator only.
  function addPair(text) {
    const t = (text || '').trim();
    const idx = t.search(SEP);
    if (idx < 0) return;
    const key = t.slice(0, idx).trim();
    if (key) data[key] = t.slice(idx + 1).trim();
  }

  // Record a table row: first cell is the label, second cell is the value.
  function addRow(tr) {
    const cells = tr.querySelectorAll('td, th');
    if (cells.length < 2) return;
    const key = (cells[0].textContent || '').trim();
    if (key) data[key] = (cells[1].textContent || '').trim();
  }

  // 商品件重尺 table — dedicated logistics spec table on 1688
  const sels = [
    '[class*="piece-weight"] tr',
    '[class*="jianzhongchi"] tr',
    '[class*="weight-size"] tr',
    '[class*="logistics-info"] tr',
    '[class*="freight-info"] tr',
  ];
  for (const sel of sels) {
    document.querySelectorAll(sel).forEach(function(tr) { addRow(tr); });
  }

  // Scan tables preceded by headers containing 件重尺
  document.querySelectorAll('table').forEach(function(table) {
    const header = table.previousElementSibling;
    if (header && /件重尺|物流|运费/.test(header.textContent || '')) {
      table.querySelectorAll('tr').forEach(function(tr) { addRow(tr); });
    }
  });

  // Also check spans/divs in logistics area
  document.querySelectorAll('[class*="logistics"] [class*="item"], [class*="freight"] [class*="item"]').forEach(function(el) {
    addPair(el.textContent);
  });

  return JSON.stringify(data);
})()`;
async function downloadImages(urls: string[], outputDir: string): Promise<string[]> { async function downloadImages(urls: string[], outputDir: string): Promise<string[]> {
fs.mkdirSync(outputDir, { recursive: true }); fs.mkdirSync(outputDir, { recursive: true });
const saved: string[] = []; const saved: string[] = [];
@ -251,7 +341,7 @@ export async function run(
command: Command, command: Command,
args: string[], args: string[],
dryRun: boolean, dryRun: boolean,
cdpPort: number = 9222, cdpPort: number = 18800,
): Promise<ScrapeResult> { ): Promise<ScrapeResult> {
if (command !== 'scrape') { if (command !== 'scrape') {
return { status: 'failed', url: '', command, dryRun, error: `unknown command: ${command}` }; return { status: 'failed', url: '', command, dryRun, error: `unknown command: ${command}` };
@ -272,6 +362,8 @@ export async function run(
packageWeight: null, volume: null, shippingMethod: null, shippingCost: null, origin: null, packageWeight: null, volume: null, shippingMethod: null, shippingCost: null, origin: null,
}, },
variants: [], variants: [],
packageInfo: { packagingType: null, packagingWeight: null, packagingDimensions: null, unitsPerPackage: null, raw: {} },
pieceWeightSize: { weight: null, dimensions: null, volume: null, raw: {} },
}, },
detailImages: [], detailImages: [],
rawAttributes: {}, rawAttributes: {},
@ -334,13 +426,67 @@ export async function run(
} }
} }
// Extract 包装信息
const rawPkgInfo: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_PACKAGE_INFO) || '{}');
const packageInfo: PackageInfo = {
packagingType: null,
packagingWeight: null,
packagingDimensions: null,
unitsPerPackage: null,
raw: rawPkgInfo,
};
for (const [key, val] of Object.entries(rawPkgInfo)) {
if (matchKey(key, ['包装方式', '包装类型', '包装形式'])) packageInfo.packagingType = val;
if (matchKey(key, ['包装重量', '箱重'])) {
packageInfo.packagingWeight = parseWeight(val);
if (packageInfo.packagingWeight) packageInfo.packagingWeight.source = 'packageInfo';
}
if (matchKey(key, ['包装尺寸', '外箱尺寸', '箱规'])) {
packageInfo.packagingDimensions = parseDimensions(val);
if (packageInfo.packagingDimensions) packageInfo.packagingDimensions.source = 'packageInfo';
}
if (matchKey(key, ['装箱数', '每箱数量', '入数'])) {
const n = parseInt(val, 10);
if (!isNaN(n)) packageInfo.unitsPerPackage = n;
}
}
// Extract 商品件重尺
const rawPws: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_PIECE_WEIGHT_SIZE) || '{}');
const pieceWeightSize: PieceWeightSize = {
weight: null,
dimensions: null,
volume: null,
raw: rawPws,
};
for (const [key, val] of Object.entries(rawPws)) {
if (matchKey(key, WEIGHT_KEYS)) {
pieceWeightSize.weight = parseWeight(val);
if (pieceWeightSize.weight) pieceWeightSize.weight.source = 'pieceWeightSize';
}
if (matchKey(key, DIMENSION_KEYS)) {
pieceWeightSize.dimensions = parseDimensions(val);
if (pieceWeightSize.dimensions) pieceWeightSize.dimensions.source = 'pieceWeightSize';
}
if (matchKey(key, VOLUME_KEYS)) {
pieceWeightSize.volume = parseVolume(val);
if (pieceWeightSize.volume) pieceWeightSize.volume.source = 'pieceWeightSize';
}
}
// Backfill logistics from pieceWeightSize if not found in attributes
if (!logistics.weight && pieceWeightSize.weight) logistics.weight = pieceWeightSize.weight;
if (!logistics.dimensions && pieceWeightSize.dimensions) logistics.dimensions = pieceWeightSize.dimensions;
if (!logistics.volume && pieceWeightSize.volume) logistics.volume = pieceWeightSize.volume;
if (!logistics.packageWeight && packageInfo.packagingWeight) logistics.packageWeight = packageInfo.packagingWeight;
const offerId = extractOfferId(url); const offerId = extractOfferId(url);
const imgDir = path.join('/tmp', '1688-logistics', offerId); const imgDir = path.join('/tmp', '1688-logistics', offerId);
const detailImages = await downloadImages(imgUrls, imgDir); const detailImages = await downloadImages(imgUrls, imgDir);
return { return {
status: 'success', url, command, dryRun, status: 'success', url, command, dryRun,
product: { title, logistics, variants }, product: { title, logistics, variants, packageInfo, pieceWeightSize },
detailImages, detailImages,
rawAttributes, rawAttributes,
}; };