feat: default CDP port 18800, add 包装信息 and 商品件重尺 extraction
register-skill-release / register (push) Successful in 14s Details

- Change default CDP port from 9222 to 18800
- Extract 包装信息 section (packaging type, box weight/dims, units per box)
- Extract 商品件重尺 table (per-piece weight/dimensions/volume)
- Backfill logistics from pieceWeightSize/packageInfo when attributes missing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
ywkj 2026-03-30 08:20:46 +08:00
parent 9d147242e0
commit e48592690b
3 changed files with 173 additions and 15 deletions

View File

@ -22,8 +22,8 @@ bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dr
## What It Does
1. Opens the 1688 product URL in the browser
2. Extracts weight/size data from wherever it appears on the page — product attributes, variant specs, logistics section
1. Opens the 1688 product URL in the already-running browser over CDP (default port 18800; override with `--port=<n>`)
2. Extracts weight/size data from wherever it appears on the page — product attributes, variant specs, 包装信息, 商品件重尺 table, logistics section
3. Downloads detail images (商品详情图片) for analysis — weight/size is often only in images
4. Outputs structured JSON
@ -32,9 +32,11 @@ bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dr
Weight/size data on 1688 pages hides in multiple places. Check all before giving up:
1. **Product attributes** (商品属性 / 商品参数) — key-value table, most reliable
2. **Variant/SKU specs** — per-variant weight or size
3. **Logistics section** — shipping weight, volume, freight info
4. **Detail images** — downloaded to `/tmp/1688-logistics/<offer-id>/`, read them to find weight/size text baked into images
2. **商品件重尺 table** — dedicated weight/dimensions/volume table for logistics
3. **包装信息 section** — packaging type, box weight, box dimensions, units per box
4. **Variant/SKU specs** — per-variant weight or size
5. **Logistics section** — shipping weight, volume, freight info
6. **Detail images** — downloaded to `/tmp/1688-logistics/<offer-id>/`, read them to find weight/size text baked into images
## Output
@ -49,7 +51,7 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
"dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "attributes" },
"grossWeight": null,
"netWeight": null,
"packageWeight": null,
"packageWeight": { "value": 2.0, "unit": "kg", "source": "packageInfo" },
"volume": null,
"shippingMethod": null,
"shippingCost": null,
@ -57,7 +59,20 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
},
"variants": [
{ "name": "颜色: 红色", "weight": null, "dimensions": null }
]
],
"packageInfo": {
"packagingType": "纸箱",
"packagingWeight": { "value": 2.0, "unit": "kg", "source": "packageInfo" },
"packagingDimensions": { "length": 40, "width": 30, "height": 20, "unit": "cm", "source": "packageInfo" },
"unitsPerPackage": 50,
"raw": { "包装方式": "纸箱", "箱规": "40*30*20cm", "装箱数": "50" }
},
"pieceWeightSize": {
"weight": { "value": 0.5, "unit": "kg", "source": "pieceWeightSize" },
"dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "pieceWeightSize" },
"volume": null,
"raw": { "重量": "500g", "尺寸": "30*20*10cm" }
}
},
"detailImages": ["/tmp/1688-logistics/852504650877/img_001.jpg"],
"rawAttributes": { "重量": "0.5kg", "尺寸": "30*20*10cm" }
@ -70,6 +85,6 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
1. If the browser is not running, report the error. Do not try to launch it.
2. Check all data sources before reporting `null`.
3. Normalize units: 克→kg, 毫米→cm. Keep raw values in `rawAttributes`.
3. Normalize units: 克→kg, 毫米→cm. Keep raw values in `rawAttributes` and `raw` fields.
4. No retries. If it fails, report as-is.
5. Trust page content. Do not guess values.

View File

@ -12,17 +12,14 @@ Commands:
Examples:
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html'
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dry-run
bun scripts/run.ts --port=9223 scrape 'https://detail.1688.com/offer/852504650877.html'
Prerequisites:
Chrome must be running with --remote-debugging-port=9222
bun scripts/run.ts --port=18801 scrape 'https://detail.1688.com/offer/852504650877.html'
`);
}
async function main(): Promise<void> {
const positionals: string[] = [];
let dryRun = false;
let port = 9222;
let port = 18800;
for (const arg of process.argv.slice(2)) {
if (arg === '--dry-run') {

View File

@ -35,6 +35,21 @@ export interface VariantInfo {
dimensions: Dimensions | null;
}
/**
 * Parsed 包装信息 (packaging info) section of a product page.
 *
 * Every parsed field starts as `null` and is only filled when a matching key
 * is present in `raw` and its value can be parsed.
 */
export interface PackageInfo {
  /** Packaging type text as shown on the page (e.g. "纸箱"). */
  packagingType: string | null;
  /** Weight of the packed box, tagged with source "packageInfo". */
  packagingWeight: LogisticsValue | null;
  /** Outer box dimensions, tagged with source "packageInfo". */
  packagingDimensions: Dimensions | null;
  /** Number of units per box, parsed as a base-10 integer. */
  unitsPerPackage: number | null;
  /** Unparsed key/value pairs exactly as scraped from the page. */
  raw: Record<string, string>;
}
/**
 * Parsed 商品件重尺 (per-piece weight / dimensions / volume) table.
 *
 * Parsed fields are `null` until a matching key is found in `raw` and its
 * value parses; parsed values are tagged with source "pieceWeightSize".
 */
export interface PieceWeightSize {
  /** Per-piece weight. */
  weight: LogisticsValue | null;
  /** Per-piece dimensions. */
  dimensions: Dimensions | null;
  /** Per-piece volume. */
  volume: LogisticsValue | null;
  /** Unparsed key/value pairs exactly as scraped from the page. */
  raw: Record<string, string>;
}
export interface ScrapeResult {
status: 'success' | 'failed';
url: string;
@ -44,6 +59,8 @@ export interface ScrapeResult {
title: string;
logistics: LogisticsData;
variants: VariantInfo[];
packageInfo: PackageInfo;
pieceWeightSize: PieceWeightSize;
};
detailImages?: string[];
rawAttributes?: Record<string, string>;
@ -228,6 +245,79 @@ const JS_EXTRACT_IMAGES = `
return JSON.stringify(imgs);
})()`;
/**
 * Browser-side script (evaluated in the page, returns a JSON string) that
 * collects 包装信息 (packaging info) key/value pairs from a 1688 product page.
 *
 * Three sources are merged into one flat object, later sources overwriting
 * earlier ones on key collisions:
 *   1. list items under packaging-related class names,
 *   2. rows of any table whose previous sibling element mentions 包装,
 *   3. attribute/param list items whose text mentions 包装.
 *
 * "key: value" text is split on the FIRST colon, accepting both the ASCII
 * colon and the full-width colon (U+FF1A) that Chinese pages commonly use;
 * the value keeps any further colons untouched. Entries with an empty key
 * are skipped.
 */
const JS_EXTRACT_PACKAGE_INFO = `
(function() {
  const data = {};
  // Store one "key: value" text; the separator may be ':' or the full-width '：'.
  const addPair = (text) => {
    const t = (text || '').trim();
    const m = /[:：]/.exec(t);
    if (!m) return;
    const key = t.slice(0, m.index).trim();
    if (key) data[key] = t.slice(m.index + 1).trim();
  };
  // Store a table row as key (first cell) -> value (second cell).
  const addRow = (tr) => {
    const cells = tr.querySelectorAll('td, th');
    if (cells.length < 2) return;
    const key = (cells[0].textContent || '').trim();
    if (key) data[key] = (cells[1].textContent || '').trim();
  };
  // 包装信息 section — various selector patterns on 1688
  const sels = [
    '[class*="package-info"] li',
    '[class*="packaging"] li',
    '[class*="pack-info"] li',
    '[class*="baozhuang"] li',
    '.detail-packing li',
  ];
  for (const sel of sels) {
    document.querySelectorAll(sel).forEach(el => addPair(el.textContent));
  }
  // Also look for table rows inside 包装 sections
  document.querySelectorAll('table').forEach(table => {
    const header = table.previousElementSibling;
    if (header && /包装/.test(header.textContent || '')) {
      table.querySelectorAll('tr').forEach(addRow);
    }
  });
  // Scan all key-value pairs for 包装 related keys
  document.querySelectorAll('[class*="attribute"] li, [class*="param"] li, .offer-attr-list .offer-attr-item').forEach(el => {
    const text = (el.textContent || '').trim();
    if (/包装/.test(text)) addPair(text);
  });
  return JSON.stringify(data);
})()`;
/**
 * Browser-side script (evaluated in the page, returns a JSON string) that
 * collects the 商品件重尺 (per-piece weight/size) key/value pairs from a 1688
 * product page.
 *
 * Three sources are merged into one flat object, later sources overwriting
 * earlier ones on key collisions:
 *   1. table rows under weight/size/logistics/freight-related class names,
 *   2. rows of any table whose previous sibling element mentions
 *      件重尺, 物流 or 运费,
 *   3. "key: value" items inside logistics/freight containers.
 *
 * "key: value" text is split on the FIRST colon, accepting both the ASCII
 * colon and the full-width colon (U+FF1A); the value keeps any further
 * colons untouched. Entries with an empty key are skipped.
 */
const JS_EXTRACT_PIECE_WEIGHT_SIZE = `
(function() {
  const data = {};
  // Store a table row as key (first cell) -> value (second cell).
  const addRow = (tr) => {
    const cells = tr.querySelectorAll('td, th');
    if (cells.length < 2) return;
    const key = (cells[0].textContent || '').trim();
    if (key) data[key] = (cells[1].textContent || '').trim();
  };
  // Store one "key: value" text; the separator may be ':' or the full-width '：'.
  const addPair = (text) => {
    const t = (text || '').trim();
    const m = /[:：]/.exec(t);
    if (!m) return;
    const key = t.slice(0, m.index).trim();
    if (key) data[key] = t.slice(m.index + 1).trim();
  };
  // 商品件重尺 table — dedicated logistics spec table on 1688
  const sels = [
    '[class*="piece-weight"] tr',
    '[class*="jianzhongchi"] tr',
    '[class*="weight-size"] tr',
    '[class*="logistics-info"] tr',
    '[class*="freight-info"] tr',
  ];
  for (const sel of sels) {
    document.querySelectorAll(sel).forEach(addRow);
  }
  // Scan tables preceded by headers containing 件重尺
  document.querySelectorAll('table').forEach(table => {
    const header = table.previousElementSibling;
    if (header && /件重尺|物流|运费/.test(header.textContent || '')) {
      table.querySelectorAll('tr').forEach(addRow);
    }
  });
  // Also check spans/divs in logistics area
  document.querySelectorAll('[class*="logistics"] [class*="item"], [class*="freight"] [class*="item"]').forEach(el => addPair(el.textContent));
  return JSON.stringify(data);
})()`;
async function downloadImages(urls: string[], outputDir: string): Promise<string[]> {
fs.mkdirSync(outputDir, { recursive: true });
const saved: string[] = [];
@ -251,7 +341,7 @@ export async function run(
command: Command,
args: string[],
dryRun: boolean,
cdpPort: number = 9222,
cdpPort: number = 18800,
): Promise<ScrapeResult> {
if (command !== 'scrape') {
return { status: 'failed', url: '', command, dryRun, error: `unknown command: ${command}` };
@ -272,6 +362,8 @@ export async function run(
packageWeight: null, volume: null, shippingMethod: null, shippingCost: null, origin: null,
},
variants: [],
packageInfo: { packagingType: null, packagingWeight: null, packagingDimensions: null, unitsPerPackage: null, raw: {} },
pieceWeightSize: { weight: null, dimensions: null, volume: null, raw: {} },
},
detailImages: [],
rawAttributes: {},
@ -334,13 +426,67 @@ export async function run(
}
}
// Extract 包装信息
const rawPkgInfo: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_PACKAGE_INFO) || '{}');
const packageInfo: PackageInfo = {
packagingType: null,
packagingWeight: null,
packagingDimensions: null,
unitsPerPackage: null,
raw: rawPkgInfo,
};
for (const [key, val] of Object.entries(rawPkgInfo)) {
if (matchKey(key, ['包装方式', '包装类型', '包装形式'])) packageInfo.packagingType = val;
if (matchKey(key, ['包装重量', '箱重'])) {
packageInfo.packagingWeight = parseWeight(val);
if (packageInfo.packagingWeight) packageInfo.packagingWeight.source = 'packageInfo';
}
if (matchKey(key, ['包装尺寸', '外箱尺寸', '箱规'])) {
packageInfo.packagingDimensions = parseDimensions(val);
if (packageInfo.packagingDimensions) packageInfo.packagingDimensions.source = 'packageInfo';
}
if (matchKey(key, ['装箱数', '每箱数量', '入数'])) {
const n = parseInt(val, 10);
if (!isNaN(n)) packageInfo.unitsPerPackage = n;
}
}
// Extract 商品件重尺
const rawPws: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_PIECE_WEIGHT_SIZE) || '{}');
const pieceWeightSize: PieceWeightSize = {
weight: null,
dimensions: null,
volume: null,
raw: rawPws,
};
for (const [key, val] of Object.entries(rawPws)) {
if (matchKey(key, WEIGHT_KEYS)) {
pieceWeightSize.weight = parseWeight(val);
if (pieceWeightSize.weight) pieceWeightSize.weight.source = 'pieceWeightSize';
}
if (matchKey(key, DIMENSION_KEYS)) {
pieceWeightSize.dimensions = parseDimensions(val);
if (pieceWeightSize.dimensions) pieceWeightSize.dimensions.source = 'pieceWeightSize';
}
if (matchKey(key, VOLUME_KEYS)) {
pieceWeightSize.volume = parseVolume(val);
if (pieceWeightSize.volume) pieceWeightSize.volume.source = 'pieceWeightSize';
}
}
// Backfill logistics from pieceWeightSize if not found in attributes
if (!logistics.weight && pieceWeightSize.weight) logistics.weight = pieceWeightSize.weight;
if (!logistics.dimensions && pieceWeightSize.dimensions) logistics.dimensions = pieceWeightSize.dimensions;
if (!logistics.volume && pieceWeightSize.volume) logistics.volume = pieceWeightSize.volume;
if (!logistics.packageWeight && packageInfo.packagingWeight) logistics.packageWeight = packageInfo.packagingWeight;
const offerId = extractOfferId(url);
const imgDir = path.join('/tmp', '1688-logistics', offerId);
const detailImages = await downloadImages(imgUrls, imgDir);
return {
status: 'success', url, command, dryRun,
product: { title, logistics, variants },
product: { title, logistics, variants, packageInfo, pieceWeightSize },
detailImages,
rawAttributes,
};