This commit is contained in:
zhu
2026-04-30 11:23:31 +08:00
parent 7ca9dabaf9
commit 4c0a1d8151
4 changed files with 461 additions and 2 deletions

View File

@@ -0,0 +1,226 @@
import type { PlatformFieldConfig } from '@/types';
/** Generic result shape produced by a DOM scrape: a string-keyed record of arbitrary values. */
export type DomScrapeResult = Record<string, unknown>;
/** Default wait in milliseconds, used when no explicit delay is given after a click or a page turn so the page can render. */
const DEFAULT_DELAY = 1500;
/**
 * Entry point: scrapes the configured fields from the current document.
 *
 * Delegates to processFields, using document.body as the search root.
 *
 * NOTE(review): the original comment stated that this function is injected
 * into the page through chrome.scripting.executeScript and that its helpers
 * are therefore written inside the function body. In this file the helpers
 * (processFields, sleep, ...) are declared at module level, so presumably they
 * would not be available if only this function were serialized and injected —
 * confirm how this module is actually loaded into the page.
 *
 * @param fields Field configurations to scrape.
 * @returns The scraped data keyed by field label, or null when the document has no body.
 */
export async function scrapeDomFields(fields: PlatformFieldConfig[]): Promise<DomScrapeResult | null> {
  const root = document.body;
  return root ? processFields(fields, root) : null;
}
/**
 * Sleep helper that gives clicks, page turns and asynchronous rendering time to settle.
 * Resolves after `ms` milliseconds, or after DEFAULT_DELAY when `ms` is omitted.
 */
const sleep = (ms?: number) =>
  new Promise((resolve) => {
    window.setTimeout(resolve, ms ?? DEFAULT_DELAY);
  });
/**
 * Extracts the value of a matched element.
 *
 * Resolution order:
 * 1. `config.attr` is set: that attribute's trimmed value ('' when the attribute is absent).
 * 2. `<img>`: the raw `src` attribute.
 * 3. `<a>`: the `href` attribute, made absolute against the current page URL.
 * 4. Anything else: the text content with newlines removed and the ends trimmed.
 *
 * @param el Element to read from; null yields null.
 * @param config Field configuration; only `attr` is consulted here.
 * @returns The extracted string, or null when `el` is null or the img/a attribute is missing.
 */
function extractValue(el: Element | null, config: PlatformFieldConfig): string | null {
  if (!el) {
    return null;
  }
  if (config.attr) {
    return (el.getAttribute(config.attr) || '').trim();
  }
  const tagName = el.tagName.toUpperCase();
  if (tagName === 'IMG') {
    return el.getAttribute('src');
  }
  if (tagName === 'A') {
    const href = el.getAttribute('href');
    // A missing/empty href and links that already start with "http" are returned untouched.
    if (!href || href.startsWith('http')) {
      return href;
    }
    try {
      // Resolve against the full page URL instead of prefixing the origin, so that
      // document-relative paths ("next.html", "../x"), query/fragment-only links,
      // protocol-relative URLs ("//host/x") and other schemes ("mailto:") stay valid.
      return new URL(href, window.location.href).href;
    } catch {
      // Unparseable input: fall back to the plain origin prefix.
      return window.location.origin + href;
    }
  }
  return (el.textContent || '').replace(/\n/g, '').trim();
}
/**
 * Clicks every element matched by the field's `condition` selectors inside `rootDom`.
 *
 * Selectors are handled in order; each selector is queried only after the
 * clicks for the previous one have finished, and every click is followed by a
 * wait of `condition.time` (sleep's default applies when it is unset).
 * Does nothing when the field has no `condition`.
 *
 * @param config Field configuration whose `condition` drives the clicks.
 * @param rootDom DOM scope in which the selectors are evaluated.
 */
async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Promise<void> {
  const condition = config.condition;
  if (!condition) {
    return;
  }
  for (const selector of condition.list) {
    const clickable = rootDom.querySelectorAll<HTMLElement>(selector);
    for (const node of Array.from(clickable)) {
      node.click();
      await sleep(condition.time);
    }
  }
}
/**
 * Recursively scrapes a set of field configurations inside `rootDom`.
 *
 * For every field, in order:
 * - its auto-click condition is run against `rootDom`;
 * - the first element matching `className` is looked up; when none exists the
 *   label is mapped to a "not found" marker string and the field is skipped;
 * - no `type`: the auto-click is run again scoped to the matched element, then
 *   either the nested `keys` are scraped recursively (non-empty `keys`) or a
 *   single value is extracted;
 * - `type === 1`: scraped as a list, searched from `rootDom`;
 * - `type === 2`: scraped as a table, searched from the matched element;
 * - any other `type`: the label is left unset.
 *
 * @param columns Field configurations to process.
 * @param rootDom DOM scope in which selectors are evaluated.
 * @returns An object mapping each field label to its scraped value.
 */
async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
  const collected: DomScrapeResult = {};
  for (const field of columns) {
    await autoClick(field, rootDom);
    const matched = rootDom.querySelector(field.className);
    if (!matched) {
      collected[field.label] = '没找到该元素';
      continue;
    }
    if (field.type === 1) {
      collected[field.label] = await processList(field, rootDom);
    } else if (field.type === 2) {
      collected[field.label] = await processTable(field, matched);
    } else if (!field.type) {
      // Plain or nested field: click again, this time scoped to the matched element.
      await autoClick(field, matched);
      const children = field.keys;
      if (children && children.length > 0) {
        collected[field.label] = await processFields(children, matched);
      } else {
        collected[field.label] = extractValue(matched, field);
      }
    }
  }
  return collected;
}
/**
 * Scrapes every list item matched by `config.className` inside `rootDom`, then
 * keeps turning pages according to `config.pagination`.
 *
 * Each matched element is scraped with `config.keys` (an empty key list when
 * unset). Paging stops when no pagination is configured, when `maxPage` pages
 * have been scraped, when the next button (looked up on the whole document)
 * is missing, or when the next button is disabled.
 *
 * The next button counts as disabled when it carries the configured
 * `disabledClass`, the `disabled` attribute, or `aria-disabled="true"`. All
 * three are checked together: a button disabled only through an attribute
 * would otherwise be clicked without effect and the same page scraped again
 * indefinitely when no `maxPage` is set.
 *
 * @param config List field configuration.
 * @param rootDom DOM scope in which list items are searched.
 * @returns One scraped object per list item, across all visited pages.
 */
async function processList(config: PlatformFieldConfig, rootDom: ParentNode): Promise<DomScrapeResult[]> {
  const allList: DomScrapeResult[] = [];
  let pageCount = 0;
  while (true) {
    pageCount += 1;
    const elements = Array.from(rootDom.querySelectorAll(config.className));
    for (const element of elements) {
      const itemData = await processFields(config.keys ?? [], element);
      allList.push(itemData);
    }
    const pagination = config.pagination;
    if (!pagination) {
      console.log('未配置分页信息,抓取单页后结束。');
      break;
    }
    if (pagination.maxPage && pageCount >= pagination.maxPage) {
      console.log('已达到配置的最大页数,停止。');
      break;
    }
    const nextBtn = document.querySelector<HTMLElement>(pagination.nextBtn);
    if (!nextBtn) {
      console.log('未找到下一页按钮,抓取结束。');
      break;
    }
    const hasDisabledClass = pagination.disabledClass
      ? nextBtn.classList.contains(pagination.disabledClass)
      : false;
    const isDisabled =
      hasDisabledClass ||
      nextBtn.hasAttribute('disabled') ||
      nextBtn.getAttribute('aria-disabled') === 'true';
    if (isDisabled) {
      console.log('下一页按钮已禁用,抓取结束。');
      break;
    }
    nextBtn.click();
    await sleep(pagination.delay);
  }
  return allList;
}
/**
 * Scrapes table rows according to `config.tableParts` / `config.keys`, then
 * keeps turning pages according to `config.pagination`.
 *
 * Per page:
 * - for each table part, its rows are collected from `rootDom` using
 *   `rowSelector`, falling back to `"<select ?? className> tr"`; parts are
 *   keyed by `name ?? label`;
 * - the number of rows is taken from the first part;
 * - for each row index, every key reads from the row at that index in its own
 *   `part` (the first part when unset); a key whose part has no such row is
 *   skipped; keys with nested `keys` are scraped recursively, others yield a
 *   single extracted value.
 *
 * Paging stops when no pagination is configured, when `maxPage` pages have
 * been scraped, when the next button (looked up on the whole document) is
 * missing, or when the next button is disabled.
 *
 * The next button counts as disabled when it carries the configured
 * `disabledClass`, the `disabled` attribute, or `aria-disabled="true"`. All
 * three are checked together: a button disabled only through an attribute
 * would otherwise be clicked without effect and the same page scraped again
 * indefinitely when no `maxPage` is set.
 *
 * @param config Table field configuration.
 * @param rootDom DOM scope in which table rows are searched.
 * @returns One scraped object per table row, across all visited pages.
 */
async function processTable(config: PlatformFieldConfig, rootDom: ParentNode): Promise<DomScrapeResult[]> {
  const allTableData: DomScrapeResult[] = [];
  let pageCount = 0;
  while (true) {
    pageCount += 1;
    // Row nodes of every table part on the current page, keyed by part name.
    const partsNodes: Record<string, Element[]> = {};
    for (const part of config.tableParts ?? []) {
      const partKey = part.name ?? part.label;
      const partSelector = part.select ?? part.className;
      const rowSelector = part.rowSelector ?? `${partSelector} tr`;
      partsNodes[partKey] = Array.from(rootDom.querySelectorAll(rowSelector));
    }
    // The first part decides how many rows the page has.
    const firstPart = config.tableParts?.[0];
    const firstPartKey = firstPart ? firstPart.name ?? firstPart.label : '';
    const rowCount = partsNodes[firstPartKey]?.length || 0;
    for (let index = 0; index < rowCount; index += 1) {
      const rowData: DomScrapeResult = {};
      for (const keyItem of config.keys ?? []) {
        const partKey = keyItem.part ?? firstPartKey;
        const targetRowNode = partsNodes[partKey]?.[index];
        if (!targetRowNode) {
          continue;
        }
        if (keyItem.keys) {
          rowData[keyItem.label] = await processFields(keyItem.keys, targetRowNode);
        } else {
          rowData[keyItem.label] = extractValue(targetRowNode.querySelector(keyItem.className), keyItem);
        }
      }
      allTableData.push(rowData);
    }
    const pagination = config.pagination;
    if (!pagination) {
      console.log('未配置分页信息,抓取单页后结束。');
      break;
    }
    if (pagination.maxPage && pageCount >= pagination.maxPage) {
      console.log('已达到配置的最大页数,停止。');
      break;
    }
    const nextBtn = document.querySelector<HTMLElement>(pagination.nextBtn);
    if (!nextBtn) {
      console.log('未找到下一页按钮,抓取结束。');
      break;
    }
    const hasDisabledClass = pagination.disabledClass
      ? nextBtn.classList.contains(pagination.disabledClass)
      : false;
    const isDisabled =
      hasDisabledClass ||
      nextBtn.hasAttribute('disabled') ||
      nextBtn.getAttribute('aria-disabled') === 'true';
    if (isDisabled) {
      console.log('下一页按钮已禁用,抓取结束。');
      break;
    }
    nextBtn.click();
    await sleep(pagination.delay);
  }
  return allTableData;
}