feat: default CDP port 18800, add 包装信息 and 商品件重尺 extraction
register-skill-release / register (push) Successful in 14s
Details
register-skill-release / register (push) Successful in 14s
Details
- Change default CDP port from 9222 to 18800
- Extract 包装信息 section (packaging type, box weight/dims, units per box)
- Extract 商品件重尺 table (per-piece weight/dimensions/volume)
- Backfill logistics from pieceWeightSize/packageInfo when attributes are missing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
9d147242e0
commit
e48592690b
31
SKILL.md
31
SKILL.md
|
|
@ -22,8 +22,8 @@ bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dr
|
||||||
|
|
||||||
## What It Does
|
## What It Does
|
||||||
|
|
||||||
1. Opens the 1688 product URL in the browser
|
1. Opens the 1688 product URL in the browser (port 18800)
|
||||||
2. Extracts weight/size data from wherever it appears on the page — product attributes, variant specs, logistics section
|
2. Extracts weight/size data from wherever it appears on the page — product attributes, variant specs, 包装信息, 商品件重尺 table, logistics section
|
||||||
3. Downloads detail images (商品详情图片) for analysis — weight/size is often only in images
|
3. Downloads detail images (商品详情图片) for analysis — weight/size is often only in images
|
||||||
4. Outputs structured JSON
|
4. Outputs structured JSON
|
||||||
|
|
||||||
|
|
@ -32,9 +32,11 @@ bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dr
|
||||||
Weight/size data on 1688 pages hides in multiple places. Check all before giving up:
|
Weight/size data on 1688 pages hides in multiple places. Check all before giving up:
|
||||||
|
|
||||||
1. **Product attributes** (商品属性 / 商品参数) — key-value table, most reliable
|
1. **Product attributes** (商品属性 / 商品参数) — key-value table, most reliable
|
||||||
2. **Variant/SKU specs** — per-variant weight or size
|
2. **商品件重尺 table** — dedicated weight/dimensions/volume table for logistics
|
||||||
3. **Logistics section** — shipping weight, volume, freight info
|
3. **包装信息 section** — packaging type, box weight, box dimensions, units per box
|
||||||
4. **Detail images** — downloaded to `/tmp/1688-logistics/<offer-id>/`, read them to find weight/size text baked into images
|
4. **Variant/SKU specs** — per-variant weight or size
|
||||||
|
5. **Logistics section** — shipping weight, volume, freight info
|
||||||
|
6. **Detail images** — downloaded to `/tmp/1688-logistics/<offer-id>/`, read them to find weight/size text baked into images
|
||||||
|
|
||||||
## Output
|
## Output
|
||||||
|
|
||||||
|
|
@ -49,7 +51,7 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
|
||||||
"dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "attributes" },
|
"dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "attributes" },
|
||||||
"grossWeight": null,
|
"grossWeight": null,
|
||||||
"netWeight": null,
|
"netWeight": null,
|
||||||
"packageWeight": null,
|
"packageWeight": { "value": 2.0, "unit": "kg", "source": "packageInfo" },
|
||||||
"volume": null,
|
"volume": null,
|
||||||
"shippingMethod": null,
|
"shippingMethod": null,
|
||||||
"shippingCost": null,
|
"shippingCost": null,
|
||||||
|
|
@ -57,7 +59,20 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
|
||||||
},
|
},
|
||||||
"variants": [
|
"variants": [
|
||||||
{ "name": "颜色: 红色", "weight": null, "dimensions": null }
|
{ "name": "颜色: 红色", "weight": null, "dimensions": null }
|
||||||
]
|
],
|
||||||
|
"packageInfo": {
|
||||||
|
"packagingType": "纸箱",
|
||||||
|
"packagingWeight": { "value": 2.0, "unit": "kg", "source": "packageInfo" },
|
||||||
|
"packagingDimensions": { "length": 40, "width": 30, "height": 20, "unit": "cm", "source": "packageInfo" },
|
||||||
|
"unitsPerPackage": 50,
|
||||||
|
"raw": { "包装方式": "纸箱", "箱重": "2kg", "箱规": "40*30*20cm", "装箱数": "50" }
|
||||||
|
},
|
||||||
|
"pieceWeightSize": {
|
||||||
|
"weight": { "value": 0.5, "unit": "kg", "source": "pieceWeightSize" },
|
||||||
|
"dimensions": { "length": 30, "width": 20, "height": 10, "unit": "cm", "source": "pieceWeightSize" },
|
||||||
|
"volume": null,
|
||||||
|
"raw": { "重量": "500g", "尺寸": "30*20*10cm" }
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"detailImages": ["/tmp/1688-logistics/852504650877/img_001.jpg"],
|
"detailImages": ["/tmp/1688-logistics/852504650877/img_001.jpg"],
|
||||||
"rawAttributes": { "重量": "0.5kg", "尺寸": "30*20*10cm" }
|
"rawAttributes": { "重量": "0.5kg", "尺寸": "30*20*10cm" }
|
||||||
|
|
@ -70,6 +85,6 @@ Weight/size data on 1688 pages hides in multiple places. Check all before giving
|
||||||
|
|
||||||
1. If the browser is not running, report the error. Do not try to launch it.
|
1. If the browser is not running, report the error. Do not try to launch it.
|
||||||
2. Check all data sources before reporting `null`.
|
2. Check all data sources before reporting `null`.
|
||||||
3. Normalize units: 克→kg, 毫米→cm. Keep raw values in `rawAttributes`.
|
3. Normalize units: 克→kg, 毫米→cm. Keep raw values in `rawAttributes` and `raw` fields.
|
||||||
4. No retries. If it fails, report as-is.
|
4. No retries. If it fails, report as-is.
|
||||||
5. Trust page content. Do not guess values.
|
5. Trust page content. Do not guess values.
|
||||||
|
|
|
||||||
|
|
@ -12,17 +12,14 @@ Commands:
|
||||||
Examples:
|
Examples:
|
||||||
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html'
|
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html'
|
||||||
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dry-run
|
bun scripts/run.ts scrape 'https://detail.1688.com/offer/852504650877.html' --dry-run
|
||||||
bun scripts/run.ts --port=9223 scrape 'https://detail.1688.com/offer/852504650877.html'
|
bun scripts/run.ts --port=18801 scrape 'https://detail.1688.com/offer/852504650877.html'
|
||||||
|
|
||||||
Prerequisites:
|
|
||||||
Chrome must be running with --remote-debugging-port=9222
|
|
||||||
`);
|
`);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function main(): Promise<void> {
|
async function main(): Promise<void> {
|
||||||
const positionals: string[] = [];
|
const positionals: string[] = [];
|
||||||
let dryRun = false;
|
let dryRun = false;
|
||||||
let port = 9222;
|
let port = 18800;
|
||||||
|
|
||||||
for (const arg of process.argv.slice(2)) {
|
for (const arg of process.argv.slice(2)) {
|
||||||
if (arg === '--dry-run') {
|
if (arg === '--dry-run') {
|
||||||
|
|
|
||||||
150
src/index.ts
150
src/index.ts
|
|
@ -35,6 +35,21 @@ export interface VariantInfo {
|
||||||
dimensions: Dimensions | null;
|
dimensions: Dimensions | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Packaging details parsed from the 包装信息 section of a 1688 product page.
 *
 * Every parsed field is `null` when the page did not provide it or the value
 * could not be parsed; `raw` always holds the scraped label/value pairs.
 */
export interface PackageInfo {
  /** Packaging type text as shown on the page, e.g. "纸箱". */
  packagingType: string | null;
  /** Weight of the packed box. */
  packagingWeight: LogisticsValue | null;
  /** Dimensions of the packed box. */
  packagingDimensions: Dimensions | null;
  /** Number of units per box. */
  unitsPerPackage: number | null;
  /** Unparsed label -> value pairs exactly as scraped from the page. */
  raw: Record<string, string>;
}
|
||||||
|
|
||||||
|
/**
 * Per-piece logistics figures parsed from the 商品件重尺 table of a 1688
 * product page.
 *
 * Every parsed field is `null` when the page did not provide it or the value
 * could not be parsed; `raw` always holds the scraped label/value pairs.
 */
export interface PieceWeightSize {
  /** Per-piece weight. */
  weight: LogisticsValue | null;
  /** Per-piece dimensions. */
  dimensions: Dimensions | null;
  /** Per-piece volume. */
  volume: LogisticsValue | null;
  /** Unparsed label -> value pairs exactly as scraped from the page. */
  raw: Record<string, string>;
}
|
||||||
|
|
||||||
export interface ScrapeResult {
|
export interface ScrapeResult {
|
||||||
status: 'success' | 'failed';
|
status: 'success' | 'failed';
|
||||||
url: string;
|
url: string;
|
||||||
|
|
@ -44,6 +59,8 @@ export interface ScrapeResult {
|
||||||
title: string;
|
title: string;
|
||||||
logistics: LogisticsData;
|
logistics: LogisticsData;
|
||||||
variants: VariantInfo[];
|
variants: VariantInfo[];
|
||||||
|
packageInfo: PackageInfo;
|
||||||
|
pieceWeightSize: PieceWeightSize;
|
||||||
};
|
};
|
||||||
detailImages?: string[];
|
detailImages?: string[];
|
||||||
rawAttributes?: Record<string, string>;
|
rawAttributes?: Record<string, string>;
|
||||||
|
|
@ -228,6 +245,79 @@ const JS_EXTRACT_IMAGES = `
|
||||||
return JSON.stringify(imgs);
|
return JSON.stringify(imgs);
|
||||||
})()`;
|
})()`;
|
||||||
|
|
||||||
|
/**
 * In-page script (passed to `cdp.evaluate`) that collects the 包装信息
 * (packaging info) label/value pairs from a 1688 product page.
 *
 * Three sources are scanned in order, and a later source overwrites an
 * earlier one when both yield the same label:
 *   1. `li` items under containers whose class name looks packaging-related;
 *   2. rows of any table whose previous sibling element mentions 包装;
 *   3. generic attribute/param list items whose text mentions 包装.
 *
 * The script evaluates to a JSON string of a flat `{ label: value }` object
 * (`"{}"` when nothing matched).
 */
const JS_EXTRACT_PACKAGE_INFO = `
(function() {
  const data = {};
  // Labels are separated from values by either an ASCII colon or the
  // full-width colon (U+FF1A); both must be accepted or the item is dropped.
  const SEP = /[:\uFF1A]/;
  const putKeyValue = (text) => {
    const parts = text.split(SEP);
    // Re-join the remainder so values that themselves contain a colon survive.
    if (parts.length >= 2) data[parts[0].trim()] = parts.slice(1).join(':').trim();
  };
  const putRow = (tr) => {
    const cells = tr.querySelectorAll('td, th');
    if (cells.length >= 2) data[cells[0].textContent.trim()] = cells[1].textContent.trim();
  };

  // 包装信息 section — various selector patterns on 1688
  const sels = [
    '[class*="package-info"] li',
    '[class*="packaging"] li',
    '[class*="pack-info"] li',
    '[class*="baozhuang"] li',
    '.detail-packing li',
  ];
  for (const sel of sels) {
    document.querySelectorAll(sel).forEach(el => putKeyValue(el.textContent.trim()));
  }

  // Also look for table rows inside 包装 sections
  document.querySelectorAll('table').forEach(table => {
    const header = table.previousElementSibling;
    if (header && /包装/.test(header.textContent || '')) {
      table.querySelectorAll('tr').forEach(putRow);
    }
  });

  // Scan all key-value pairs for 包装 related keys
  document.querySelectorAll('[class*="attribute"] li, [class*="param"] li, .offer-attr-list .offer-attr-item').forEach(el => {
    const text = el.textContent.trim();
    if (/包装/.test(text)) putKeyValue(text);
  });

  return JSON.stringify(data);
})()`;
|
||||||
|
|
||||||
|
/**
 * In-page script (passed to `cdp.evaluate`) that collects the 商品件重尺
 * (per-piece weight / size) label/value pairs from a 1688 product page.
 *
 * Three sources are scanned in order, and a later source overwrites an
 * earlier one when both yield the same label:
 *   1. table rows under containers whose class name looks weight/logistics
 *      related;
 *   2. rows of any table whose previous sibling element mentions 件重尺,
 *      物流 or 运费;
 *   3. "item" elements inside logistics/freight containers, split on a colon.
 *
 * The script evaluates to a JSON string of a flat `{ label: value }` object
 * (`"{}"` when nothing matched).
 */
const JS_EXTRACT_PIECE_WEIGHT_SIZE = `
(function() {
  const data = {};
  // Labels are separated from values by either an ASCII colon or the
  // full-width colon (U+FF1A); both must be accepted or the item is dropped.
  const SEP = /[:\uFF1A]/;
  const putRow = (tr) => {
    const cells = tr.querySelectorAll('td, th');
    if (cells.length >= 2) data[cells[0].textContent.trim()] = cells[1].textContent.trim();
  };

  // 商品件重尺 table — dedicated logistics spec table on 1688
  const sels = [
    '[class*="piece-weight"] tr',
    '[class*="jianzhongchi"] tr',
    '[class*="weight-size"] tr',
    '[class*="logistics-info"] tr',
    '[class*="freight-info"] tr',
  ];
  for (const sel of sels) {
    document.querySelectorAll(sel).forEach(putRow);
  }

  // Scan tables preceded by headers containing 件重尺
  document.querySelectorAll('table').forEach(table => {
    const header = table.previousElementSibling;
    if (header && /件重尺|物流|运费/.test(header.textContent || '')) {
      table.querySelectorAll('tr').forEach(putRow);
    }
  });

  // Also check spans/divs in logistics area
  document.querySelectorAll('[class*="logistics"] [class*="item"], [class*="freight"] [class*="item"]').forEach(el => {
    const parts = el.textContent.trim().split(SEP);
    // Re-join the remainder so values that themselves contain a colon survive.
    if (parts.length >= 2) data[parts[0].trim()] = parts.slice(1).join(':').trim();
  });

  return JSON.stringify(data);
})()`;
|
||||||
|
|
||||||
async function downloadImages(urls: string[], outputDir: string): Promise<string[]> {
|
async function downloadImages(urls: string[], outputDir: string): Promise<string[]> {
|
||||||
fs.mkdirSync(outputDir, { recursive: true });
|
fs.mkdirSync(outputDir, { recursive: true });
|
||||||
const saved: string[] = [];
|
const saved: string[] = [];
|
||||||
|
|
@ -251,7 +341,7 @@ export async function run(
|
||||||
command: Command,
|
command: Command,
|
||||||
args: string[],
|
args: string[],
|
||||||
dryRun: boolean,
|
dryRun: boolean,
|
||||||
cdpPort: number = 9222,
|
cdpPort: number = 18800,
|
||||||
): Promise<ScrapeResult> {
|
): Promise<ScrapeResult> {
|
||||||
if (command !== 'scrape') {
|
if (command !== 'scrape') {
|
||||||
return { status: 'failed', url: '', command, dryRun, error: `unknown command: ${command}` };
|
return { status: 'failed', url: '', command, dryRun, error: `unknown command: ${command}` };
|
||||||
|
|
@ -272,6 +362,8 @@ export async function run(
|
||||||
packageWeight: null, volume: null, shippingMethod: null, shippingCost: null, origin: null,
|
packageWeight: null, volume: null, shippingMethod: null, shippingCost: null, origin: null,
|
||||||
},
|
},
|
||||||
variants: [],
|
variants: [],
|
||||||
|
packageInfo: { packagingType: null, packagingWeight: null, packagingDimensions: null, unitsPerPackage: null, raw: {} },
|
||||||
|
pieceWeightSize: { weight: null, dimensions: null, volume: null, raw: {} },
|
||||||
},
|
},
|
||||||
detailImages: [],
|
detailImages: [],
|
||||||
rawAttributes: {},
|
rawAttributes: {},
|
||||||
|
|
@ -334,13 +426,67 @@ export async function run(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extract 包装信息
|
||||||
|
const rawPkgInfo: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_PACKAGE_INFO) || '{}');
|
||||||
|
const packageInfo: PackageInfo = {
|
||||||
|
packagingType: null,
|
||||||
|
packagingWeight: null,
|
||||||
|
packagingDimensions: null,
|
||||||
|
unitsPerPackage: null,
|
||||||
|
raw: rawPkgInfo,
|
||||||
|
};
|
||||||
|
for (const [key, val] of Object.entries(rawPkgInfo)) {
|
||||||
|
if (matchKey(key, ['包装方式', '包装类型', '包装形式'])) packageInfo.packagingType = val;
|
||||||
|
if (matchKey(key, ['包装重量', '箱重'])) {
|
||||||
|
packageInfo.packagingWeight = parseWeight(val);
|
||||||
|
if (packageInfo.packagingWeight) packageInfo.packagingWeight.source = 'packageInfo';
|
||||||
|
}
|
||||||
|
if (matchKey(key, ['包装尺寸', '外箱尺寸', '箱规'])) {
|
||||||
|
packageInfo.packagingDimensions = parseDimensions(val);
|
||||||
|
if (packageInfo.packagingDimensions) packageInfo.packagingDimensions.source = 'packageInfo';
|
||||||
|
}
|
||||||
|
if (matchKey(key, ['装箱数', '每箱数量', '入数'])) {
|
||||||
|
const n = parseInt(val, 10);
|
||||||
|
if (!isNaN(n)) packageInfo.unitsPerPackage = n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract 商品件重尺
|
||||||
|
const rawPws: Record<string, string> = JSON.parse(await cdp.evaluate(JS_EXTRACT_PIECE_WEIGHT_SIZE) || '{}');
|
||||||
|
const pieceWeightSize: PieceWeightSize = {
|
||||||
|
weight: null,
|
||||||
|
dimensions: null,
|
||||||
|
volume: null,
|
||||||
|
raw: rawPws,
|
||||||
|
};
|
||||||
|
for (const [key, val] of Object.entries(rawPws)) {
|
||||||
|
if (matchKey(key, WEIGHT_KEYS)) {
|
||||||
|
pieceWeightSize.weight = parseWeight(val);
|
||||||
|
if (pieceWeightSize.weight) pieceWeightSize.weight.source = 'pieceWeightSize';
|
||||||
|
}
|
||||||
|
if (matchKey(key, DIMENSION_KEYS)) {
|
||||||
|
pieceWeightSize.dimensions = parseDimensions(val);
|
||||||
|
if (pieceWeightSize.dimensions) pieceWeightSize.dimensions.source = 'pieceWeightSize';
|
||||||
|
}
|
||||||
|
if (matchKey(key, VOLUME_KEYS)) {
|
||||||
|
pieceWeightSize.volume = parseVolume(val);
|
||||||
|
if (pieceWeightSize.volume) pieceWeightSize.volume.source = 'pieceWeightSize';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Backfill logistics from pieceWeightSize if not found in attributes
|
||||||
|
if (!logistics.weight && pieceWeightSize.weight) logistics.weight = pieceWeightSize.weight;
|
||||||
|
if (!logistics.dimensions && pieceWeightSize.dimensions) logistics.dimensions = pieceWeightSize.dimensions;
|
||||||
|
if (!logistics.volume && pieceWeightSize.volume) logistics.volume = pieceWeightSize.volume;
|
||||||
|
if (!logistics.packageWeight && packageInfo.packagingWeight) logistics.packageWeight = packageInfo.packagingWeight;
|
||||||
|
|
||||||
const offerId = extractOfferId(url);
|
const offerId = extractOfferId(url);
|
||||||
const imgDir = path.join('/tmp', '1688-logistics', offerId);
|
const imgDir = path.join('/tmp', '1688-logistics', offerId);
|
||||||
const detailImages = await downloadImages(imgUrls, imgDir);
|
const detailImages = await downloadImages(imgUrls, imgDir);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
status: 'success', url, command, dryRun,
|
status: 'success', url, command, dryRun,
|
||||||
product: { title, logistics, variants },
|
product: { title, logistics, variants, packageInfo, pieceWeightSize },
|
||||||
detailImages,
|
detailImages,
|
||||||
rawAttributes,
|
rawAttributes,
|
||||||
};
|
};
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue