This commit is contained in:
zhu
2026-05-06 15:34:26 +08:00
parent 40df507300
commit 7e2a83efd6

View File

@@ -1,23 +1,4 @@
import type { PlatformFieldConfig } from '@/types';
// Generic result structure produced by a DOM scrape.
export type DomScrapeResult = Record<string, unknown>;
/**
 * Runs the DOM scrape inside the target web page's context.
 *
 * The original note on this function says it is injected into the page via
 * chrome.scripting.executeScript, and that helper logic must therefore live
 * inside the function so it is not lost with the module scope.
 * NOTE(review): the body calls `processFields`, which is not defined inside
 * this function — confirm it is reachable in the injected context.
 *
 * @param fields Field configurations describing what to scrape.
 * @returns The scraped result, or `null` when the document has no body.
 */
export async function scrapeDomFields(fields: PlatformFieldConfig[]): Promise<DomScrapeResult | null> {
  const pageBody = document.body;
  return pageBody ? processFields(fields, pageBody) : null;
}
import type {PlatformFieldConfig} from '@/types';
// Sleep helper: leaves time for clicks, pagination, and asynchronous rendering to settle.
// Resolves after `ms` milliseconds; falls back to 1500 when `ms` is null or undefined.
const sleep = (ms?: number) => {
  const delay = ms ?? 1500;
  return new Promise((done) => {
    window.setTimeout(done, delay);
  });
};
@@ -26,12 +7,11 @@ const sleep = (ms?: number) => new Promise((resolve) => window.setTimeout(resolv
* 从元素中提取实际值,默认取文本,也支持 attr、图片 src、链接 href。
*/
function extractValue(el: Element | null, config: PlatformFieldConfig): string | null {
if (!el) {
return null;
if (el == null) {
return "未找到"
}
if (config.attr) {
return (el.getAttribute(config.attr) || '').trim();
return (el.getAttribute(config.attr) || "").trim();
}
const tagName = el.tagName.toUpperCase();
@@ -49,19 +29,18 @@ function extractValue(el: Element | null, config: PlatformFieldConfig): string |
}
/**
* Auto click.
* Automatically clicks target elements inside the given DOM scope, driven by the field's `condition` config.
*
* Does nothing when `config.condition` is not set. Otherwise every selector in
* `config.condition.list` is queried under `rootDom`, each match is clicked, and
* the function waits `sleep(config.condition.time)` after each click.
*
* NOTE(review): this span appears to come from a rendered diff without +/- markers,
* so the pre-change and post-change versions of several lines (signature, loop
* header, query, sleep call) sit next to each other — confirm which side is current.
*/
async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Promise<void> {
async function autoClick(config: PlatformFieldConfig, rootDom: Element): Promise<void> {
// No click condition configured: nothing to do.
if (!config.condition) {
return;
}
for (const selector of config.condition.list) {
const targets = Array.from(rootDom.querySelectorAll<HTMLElement>(selector));
for (const condition of config.condition.list) {
let targets: HTMLElement[] = Array.from(rootDom.querySelectorAll(condition))
for (const target of targets) {
target.click();
// Pause after each click; the wait length comes from condition.time.
await sleep(config.condition.time);
await sleep(config?.condition.time);
}
}
}
@@ -69,8 +48,8 @@ async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Prom
/**
* 递归处理字段配置,支持普通字段、嵌套 row、列表和表格。
*/
export async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
const result: DomScrapeResult = {};
export async function processFields(columns: PlatformFieldConfig[], rootDom: Element) {
const result = {} as any;
for (const item of columns) {
await autoClick(item, rootDom);
@@ -107,10 +86,12 @@ export async function processFields(columns: PlatformFieldConfig[], rootDom: Par
}
/**
* 按列表配置抓取所有列表项,并按分页配置继续翻页。
* 提取列表的数据
* @param config 配置
* @param rootDom 父节点
*/
async function processList(config: PlatformFieldConfig, rootDom: ParentNode): Promise<DomScrapeResult[]> {
const allList: DomScrapeResult[] = [];
async function processList(config: PlatformFieldConfig, rootDom: ParentNode) {
const allList = [];
let pageCount = 0;
while (true) {
@@ -159,54 +140,49 @@ async function processList(config: PlatformFieldConfig, rootDom: ParentNode): Pr
/**
* 按表格配置抓取表格行数据,并按分页配置继续翻页。
*/
async function processTable(config: PlatformFieldConfig, rootDom: ParentNode): Promise<DomScrapeResult[]> {
const allTableData: DomScrapeResult[] = [];
async function processTable(config: PlatformFieldConfig, rootDom: ParentNode) {
const allTableData: any[] = [];
let pageCount = 0;
while (true) {
pageCount += 1;
const partsNodes: Record<string, Element[]> = {};
const partsNodes: any = {};
for (const part of config.tableParts ?? []) {
const partKey = part.name ?? part.label;
const partSelector = part.select ?? part.className;
const rowSelector = part.rowSelector ?? `${partSelector} tr`;
partsNodes[partKey] = Array.from(rootDom.querySelectorAll(rowSelector));
}
config.tableParts!.forEach(part => {
partsNodes[part.name as any] = rootDom.querySelectorAll(`${part.select} tr`);
});
const firstPart = config.tableParts?.[0];
const firstPartKey = firstPart ? firstPart.name ?? firstPart.label : '';
const rowCount = partsNodes[firstPartKey]?.length || 0;
// //以第一个part的行数为准进行横向扫描
const rowCount = partsNodes[config.tableParts![0].name!]?.length || 0
for (let index = 0; index < rowCount; index += 1) {
const rowData: DomScrapeResult = {};
for (const keyItem of config.keys ?? []) {
const partKey = keyItem.part ?? firstPartKey;
const targetRowNode = partsNodes[partKey]?.[index];
for (let i = 0; i < rowCount; i++) {
const rowData: any = {};
if (!targetRowNode) {
continue;
}
//遍历keys根据part映射取对应的里面找
for (const keyItem of config.keys!) {
const targetRowNode = partsNodes[keyItem.part!][i];
if (keyItem.keys) {
rowData[keyItem.label] = await processFields(keyItem.keys, targetRowNode);
} else {
rowData[keyItem.label] = extractValue(targetRowNode.querySelector(keyItem.className), keyItem);
if (targetRowNode) {
//提取值
if (keyItem.keys) {
rowData[keyItem.label] = await processFields(keyItem.keys, targetRowNode)
} else {
rowData[keyItem.label] = extractValue(targetRowNode.querySelector(keyItem.className), keyItem);
}
}
}
allTableData.push(rowData);
}
if (!config.pagination) {
console.log('未配置分页信息,抓取单页后结束。');
console.log("未配置分页信息,抓取单页后结束。");
break;
}
if (config.pagination.maxPage && pageCount >= config.pagination.maxPage) {
console.log('已达到配置的最大页数,停止。');
console.log("已达到配置的最大页数,停止。");
break;
}
@@ -231,4 +207,4 @@ async function processTable(config: PlatformFieldConfig, rootDom: ParentNode): P
}
return allTableData;
}
}