feat: default CDP port 18800, add 包装信息 and 商品件重尺 extraction
register-skill-release / register (push) Successful in 14s
Details
register-skill-release / register (push) Successful in 14s
Details
- Change default CDP port from 9222 to 18800
- Extract 包装信息 section (packaging type, box weight/dims, units per box)
- Extract 商品件重尺 table (per-piece weight/dimensions/volume)
- Backfill logistics from pieceWeightSize/packageInfo when attributes missing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
9d147242e0
commit
e48592690b
31
SKILL.md
31
SKILL.md
|
|
@ -22,8 +22,8 @@ bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dr
|
|||
|
||||
## What It Does
|
||||
|
||||
1. Opens the 1688 product URL in the browser
|
||||
2. Extracts weight/size data from wherever it appears on the page — product attributes, variant specs, logistics section
|
||||
1. Opens the 1688 product URL in the already-running browser over CDP (default port 18800; override with `--port=<n>`)
|
||||
2. Extracts weight/size data from wherever it appears on the page — product attributes, variant specs, 包装信息, 商品件重尺 table, logistics section
|
||||
3. Downloads detail images (商品详情图片) for analysis — weight/size is often only in images
|
||||
4. Outputs structured JSON
|
||||
|
||||
|
|
@ -32,9 +32,11 @@ bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dr
|
|||
Weight/size data on 1688 pages hides in multiple places. Check all before giving up:
|
||||
|
||||
1. **Product attributes** (商品属性 / 商品参数) — key-value table, most reliable
|
||||
2. **Variant/SKU specs** — per-variant weight or size
|
||||
3. **Logistics section** — shipping weight, volume, freight info
|
||||
4. **Detail images** — downloaded to `/tmp/1688-logistics/<offer-id>/`, read them to find weight/size text baked into images
|
||||
2. **商品件重尺 table** — dedicated weight/dimensions/volume table for logistics
|
||||
3. **包装信息 section** — packaging type, box weight, box dimensions, units per box
|
||||
4. **Variant/SKU specs** — per-variant weight or size
|
||||
5. **Logistics section** — shipping weight, volume, freight info
|
||||
6. **Detail images** — downloaded to `/tmp/1688-logistics/<offer-id>/`, read them to find weight/size text baked into images
|
||||
|
||||
## Output
|
||||
|
||||
|
|
@ -49,7 +51,7 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
|
|||
"dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "attributes" },
|
||||
"grossWeight": null,
|
||||
"netWeight": null,
|
||||
"packageWeight": null,
|
||||
"packageWeight": { "value": 2.0, "unit": "kg", "source": "packageInfo" },
|
||||
"volume": null,
|
||||
"shippingMethod": null,
|
||||
"shippingCost": null,
|
||||
|
|
@ -57,7 +59,20 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
|
|||
},
|
||||
"variants": [
|
||||
{ "name": "颜色: 红色", "weight": null, "dimensions": null }
|
||||
]
|
||||
],
|
||||
"packageInfo": {
|
||||
"packagingType": "纸箱",
|
||||
"packagingWeight": { "value": 2.0, "unit": "kg", "source": "packageInfo" },
|
||||
"packagingDimensions": { "length": 40, "width": 30, "height": 20, "unit": "cm", "source": "packageInfo" },
|
||||
"unitsPerPackage": 50,
|
||||
"raw": { "包装方式": "纸箱", "箱规": "40*30*20cm", "装箱数": "50" }
|
||||
},
|
||||
"pieceWeightSize": {
|
||||
"weight": { "value": 0.5, "unit": "kg", "source": "pieceWeightSize" },
|
||||
"dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "pieceWeightSize" },
|
||||
"volume": null,
|
||||
"raw": { "重量": "500g", "尺寸": "30*20*10cm" }
|
||||
}
|
||||
},
|
||||
"detailImages": ["/tmp/1688-logistics/852504650877/img_001.jpg"],
|
||||
"rawAttributes": { "重量": "0.5kg", "尺寸": "30*20*10cm" }
|
||||
|
|
@ -70,6 +85,6 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
|
|||
|
||||
1. If the browser is not running, report the error. Do not try to launch it.
|
||||
2. Check all data sources before reporting `null`.
|
||||
3. Normalize units: 克→kg, 毫米→cm. Keep raw values in `rawAttributes`.
|
||||
3. Normalize units: 克→kg, 毫米→cm. Keep raw values in `rawAttributes` and `raw` fields.
|
||||
4. No retries. If it fails, report as-is.
|
||||
5. Trust page content. Do not guess values.
|
||||
|
|
|
|||
|
|
@ -12,17 +12,14 @@ Commands:
|
|||
Examples:
|
||||
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html'
|
||||
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dry-run
|
||||
bun scripts/run.ts --port=9223 scrape 'https://detail.1688.com/offer/852504650877.html'
|
||||
|
||||
Prerequisites:
|
||||
Chrome must be running with --remote-debugging-port=9222
|
||||
bun scripts/run.ts --port=18801 scrape 'https://detail.1688.com/offer/852504650877.html'
|
||||
`);
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const positionals: string[] = [];
|
||||
let dryRun = false;
|
||||
let port = 9222;
|
||||
let port = 18800;
|
||||
|
||||
for (const arg of process.argv.slice(2)) {
|
||||
if (arg === '--dry-run') {
|
||||
|
|
|
|||
150
src/index.ts
150
src/index.ts
|
|
@ -35,6 +35,21 @@ export interface VariantInfo {
|
|||
dimensions: Dimensions | null;
|
||||
}
|
||||
|
||||
/**
 * Packaging details parsed from a listing's 包装信息 section.
 *
 * Each parsed field stays `null` unless a matching label was found on the
 * page; `raw` always carries the unparsed label → value pairs so callers can
 * inspect what was actually scraped.
 */
export interface PackageInfo {
  /** Packaging type text as shown on the page (labels 包装方式 / 包装类型 / 包装形式), e.g. "纸箱". */
  packagingType: string | null;
  /** Weight parsed from the 包装重量 / 箱重 label; its `source` is set to "packageInfo". */
  packagingWeight: LogisticsValue | null;
  /** Dimensions parsed from the 包装尺寸 / 外箱尺寸 / 箱规 label; its `source` is set to "packageInfo". */
  packagingDimensions: Dimensions | null;
  /** Integer parsed from the 装箱数 / 每箱数量 / 入数 label. */
  unitsPerPackage: number | null;
  /** Unparsed label → value pairs exactly as extracted from the page. */
  raw: Record<string, string>;
}
|
||||
|
||||
/**
 * Per-piece logistics figures parsed from a listing's 商品件重尺 table.
 *
 * Each parsed field stays `null` unless a matching label was found on the
 * page; `raw` always carries the unparsed label → value pairs.
 */
export interface PieceWeightSize {
  /** Weight parsed from a weight-labelled row; its `source` is set to "pieceWeightSize". */
  weight: LogisticsValue | null;
  /** Dimensions parsed from a dimension-labelled row; its `source` is set to "pieceWeightSize". */
  dimensions: Dimensions | null;
  /** Volume parsed from a volume-labelled row; its `source` is set to "pieceWeightSize". */
  volume: LogisticsValue | null;
  /** Unparsed label → value pairs exactly as extracted from the page. */
  raw: Record<string, string>;
}
|
||||
|
||||
export interface ScrapeResult {
|
||||
status: 'success' | 'failed';
|
||||
url: string;
|
||||
|
|
@ -44,6 +59,8 @@ export interface ScrapeResult {
|
|||
title: string;
|
||||
logistics: LogisticsData;
|
||||
variants: VariantInfo[];
|
||||
packageInfo: PackageInfo;
|
||||
pieceWeightSize: PieceWeightSize;
|
||||
};
|
||||
detailImages?: string[];
|
||||
rawAttributes?: Record<string, string>;
|
||||
|
|
@ -228,6 +245,79 @@ const JS_EXTRACT_IMAGES = `
|
|||
return JSON.stringify(imgs);
|
||||
})()`;
|
||||
|
||||
/**
 * Browser-side script that collects the 包装信息 (packaging) key/value pairs
 * from the current page.
 *
 * The script is an IIFE that evaluates to a JSON string of a flat
 * `label → value` object. It looks in three places, later matches overwriting
 * earlier ones for the same label:
 *   1. `li` items under packaging-like class names,
 *   2. rows of any `table` whose previous sibling element mentions 包装,
 *   3. attribute/param list items whose text mentions 包装.
 *
 * "label: value" text is split on the FIRST colon, ASCII `:` or full-width
 * `:` (U+FF1A); the full-width form is written as a regex escape so the
 * source stays encoding-safe. Entries with an empty label are skipped.
 */
const JS_EXTRACT_PACKAGE_INFO = `
(function() {
  const data = {};
  // Label/value separator: ASCII colon or full-width colon (U+FF1A).
  const SEP = /[:\\uFF1A]/;
  // Store "label<sep>value" text; ignores text without a separator or label.
  const addPair = (text) => {
    const t = (text || '').trim();
    const i = t.search(SEP);
    if (i < 0) return;
    const key = t.slice(0, i).trim();
    if (key) data[key] = t.slice(i + 1).trim();
  };
  // Store a table row whose first two cells are label and value.
  const addRow = (tr) => {
    const cells = tr.querySelectorAll('td, th');
    if (cells.length < 2) return;
    const key = (cells[0].textContent || '').trim();
    if (key) data[key] = (cells[1].textContent || '').trim();
  };
  // 包装信息 section — various selector patterns on 1688
  const sels = [
    '[class*="package-info"] li',
    '[class*="packaging"] li',
    '[class*="pack-info"] li',
    '[class*="baozhuang"] li',
    '.detail-packing li',
  ];
  for (const sel of sels) {
    document.querySelectorAll(sel).forEach(el => addPair(el.textContent));
  }
  // Also look for table rows inside 包装 sections
  document.querySelectorAll('table').forEach(table => {
    const header = table.previousElementSibling;
    if (header && /包装/.test(header.textContent || '')) {
      table.querySelectorAll('tr').forEach(addRow);
    }
  });
  // Scan all key-value pairs for 包装 related keys
  document.querySelectorAll('[class*="attribute"] li, [class*="param"] li, .offer-attr-list .offer-attr-item').forEach(el => {
    const text = (el.textContent || '').trim();
    if (/包装/.test(text)) addPair(text);
  });
  return JSON.stringify(data);
})()`;
|
||||
|
||||
/**
 * Browser-side script that collects the 商品件重尺 (per-piece weight / size /
 * volume) key/value pairs from the current page.
 *
 * The script is an IIFE that evaluates to a JSON string of a flat
 * `label → value` object. It looks in three places, later matches overwriting
 * earlier ones for the same label:
 *   1. `tr` rows under weight/size/logistics-like class names,
 *   2. rows of any `table` whose previous sibling element mentions
 *      件重尺, 物流 or 运费,
 *   3. "item" elements inside logistics/freight containers, as
 *      "label: value" text.
 *
 * "label: value" text is split on the FIRST colon, ASCII `:` or full-width
 * `:` (U+FF1A); the full-width form is written as a regex escape so the
 * source stays encoding-safe. Entries with an empty label are skipped.
 */
const JS_EXTRACT_PIECE_WEIGHT_SIZE = `
(function() {
  const data = {};
  // Label/value separator: ASCII colon or full-width colon (U+FF1A).
  const SEP = /[:\\uFF1A]/;
  // Store a table row whose first two cells are label and value.
  const addRow = (tr) => {
    const cells = tr.querySelectorAll('td, th');
    if (cells.length < 2) return;
    const key = (cells[0].textContent || '').trim();
    if (key) data[key] = (cells[1].textContent || '').trim();
  };
  // 商品件重尺 table — dedicated logistics spec table on 1688
  const sels = [
    '[class*="piece-weight"] tr',
    '[class*="jianzhongchi"] tr',
    '[class*="weight-size"] tr',
    '[class*="logistics-info"] tr',
    '[class*="freight-info"] tr',
  ];
  for (const sel of sels) {
    document.querySelectorAll(sel).forEach(addRow);
  }
  // Scan tables preceded by headers containing 件重尺
  document.querySelectorAll('table').forEach(table => {
    const header = table.previousElementSibling;
    if (header && /件重尺|物流|运费/.test(header.textContent || '')) {
      table.querySelectorAll('tr').forEach(addRow);
    }
  });
  // Also check spans/divs in logistics area
  document.querySelectorAll('[class*="logistics"] [class*="item"], [class*="freight"] [class*="item"]').forEach(el => {
    const t = (el.textContent || '').trim();
    const i = t.search(SEP);
    if (i < 0) return;
    const key = t.slice(0, i).trim();
    if (key) data[key] = t.slice(i + 1).trim();
  });
  return JSON.stringify(data);
})()`;
|
||||
|
||||
async function downloadImages(urls: string[], outputDir: string): Promise<string[]> {
|
||||
fs.mkdirSync(outputDir, { recursive: true });
|
||||
const saved: string[] = [];
|
||||
|
|
@ -251,7 +341,7 @@ export async function run(
|
|||
command: Command,
|
||||
args: string[],
|
||||
dryRun: boolean,
|
||||
cdpPort: number = 9222,
|
||||
cdpPort: number = 18800,
|
||||
): Promise<ScrapeResult> {
|
||||
if (command !== 'scrape') {
|
||||
return { status: 'failed', url: '', command, dryRun, error: `unknown command: ${command}` };
|
||||
|
|
@ -272,6 +362,8 @@ export async function run(
|
|||
packageWeight: null, volume: null, shippingMethod: null, shippingCost: null, origin: null,
|
||||
},
|
||||
variants: [],
|
||||
packageInfo: { packagingType: null, packagingWeight: null, packagingDimensions: null, unitsPerPackage: null, raw: {} },
|
||||
pieceWeightSize: { weight: null, dimensions: null, volume: null, raw: {} },
|
||||
},
|
||||
detailImages: [],
|
||||
rawAttributes: {},
|
||||
|
|
@ -334,13 +426,67 @@ export async function run(
|
|||
}
|
||||
}
|
||||
|
||||
// Extract 包装信息
|
||||
const rawPkgInfo: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_PACKAGE_INFO) || '{}');
|
||||
const packageInfo: PackageInfo = {
|
||||
packagingType: null,
|
||||
packagingWeight: null,
|
||||
packagingDimensions: null,
|
||||
unitsPerPackage: null,
|
||||
raw: rawPkgInfo,
|
||||
};
|
||||
for (const [key, val] of Object.entries(rawPkgInfo)) {
|
||||
if (matchKey(key, ['包装方式', '包装类型', '包装形式'])) packageInfo.packagingType = val;
|
||||
if (matchKey(key, ['包装重量', '箱重'])) {
|
||||
packageInfo.packagingWeight = parseWeight(val);
|
||||
if (packageInfo.packagingWeight) packageInfo.packagingWeight.source = 'packageInfo';
|
||||
}
|
||||
if (matchKey(key, ['包装尺寸', '外箱尺寸', '箱规'])) {
|
||||
packageInfo.packagingDimensions = parseDimensions(val);
|
||||
if (packageInfo.packagingDimensions) packageInfo.packagingDimensions.source = 'packageInfo';
|
||||
}
|
||||
if (matchKey(key, ['装箱数', '每箱数量', '入数'])) {
|
||||
const n = parseInt(val, 10);
|
||||
if (!isNaN(n)) packageInfo.unitsPerPackage = n;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract 商品件重尺
|
||||
const rawPws: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_PIECE_WEIGHT_SIZE) || '{}');
|
||||
const pieceWeightSize: PieceWeightSize = {
|
||||
weight: null,
|
||||
dimensions: null,
|
||||
volume: null,
|
||||
raw: rawPws,
|
||||
};
|
||||
for (const [key, val] of Object.entries(rawPws)) {
|
||||
if (matchKey(key, WEIGHT_KEYS)) {
|
||||
pieceWeightSize.weight = parseWeight(val);
|
||||
if (pieceWeightSize.weight) pieceWeightSize.weight.source = 'pieceWeightSize';
|
||||
}
|
||||
if (matchKey(key, DIMENSION_KEYS)) {
|
||||
pieceWeightSize.dimensions = parseDimensions(val);
|
||||
if (pieceWeightSize.dimensions) pieceWeightSize.dimensions.source = 'pieceWeightSize';
|
||||
}
|
||||
if (matchKey(key, VOLUME_KEYS)) {
|
||||
pieceWeightSize.volume = parseVolume(val);
|
||||
if (pieceWeightSize.volume) pieceWeightSize.volume.source = 'pieceWeightSize';
|
||||
}
|
||||
}
|
||||
|
||||
// Backfill logistics from pieceWeightSize if not found in attributes
|
||||
if (!logistics.weight && pieceWeightSize.weight) logistics.weight = pieceWeightSize.weight;
|
||||
if (!logistics.dimensions && pieceWeightSize.dimensions) logistics.dimensions = pieceWeightSize.dimensions;
|
||||
if (!logistics.volume && pieceWeightSize.volume) logistics.volume = pieceWeightSize.volume;
|
||||
if (!logistics.packageWeight && packageInfo.packagingWeight) logistics.packageWeight = packageInfo.packagingWeight;
|
||||
|
||||
const offerId = extractOfferId(url);
|
||||
const imgDir = path.join('/tmp', '1688-logistics', offerId);
|
||||
const detailImages = await downloadImages(imgUrls, imgDir);
|
||||
|
||||
return {
|
||||
status: 'success', url, command, dryRun,
|
||||
product: { title, logistics, variants },
|
||||
product: { title, logistics, variants, packageInfo, pieceWeightSize },
|
||||
detailImages,
|
||||
rawAttributes,
|
||||
};
|
||||
|
|
|
|||
Loading…
Reference in New Issue