229 lines
6.5 KiB
TypeScript
229 lines
6.5 KiB
TypeScript
import type {PlatformFieldConfig} from '@/types';
|
||
|
||
/**
 * Polls `rootDom` until an element matching `selector` shows up, pausing
 * between lookups so asynchronously rendered content has time to appear.
 *
 * @param rootDom Node whose descendants are searched with `querySelector`.
 * @param selector CSS selector of the element to wait for.
 * @param retryCount Maximum number of lookups (defaults to 5).
 * @param intervalMs Pause between two consecutive lookups, in milliseconds
 *                   (defaults to 500).
 * @returns The first matching element, or `null` when every lookup came back empty.
 */
async function waitForElement(
  rootDom: ParentNode,
  selector: string,
  retryCount = 5,
  intervalMs = 500,
): Promise<Element | null> {
  for (let attempt = 0; attempt < retryCount; attempt++) {
    const element = rootDom.querySelector(selector);
    if (element) {
      return element;
    }
    // Pause only when another lookup follows; sleeping after the last
    // attempt would merely delay the `null` result.
    if (attempt < retryCount - 1) {
      await sleep(intervalMs);
    }
  }
  return null;
}
|
||
|
||
|
||
// 睡眠工具,给点击、翻页、异步渲染留出等待时间。
|
||
/**
 * Returns a promise that resolves after `ms` milliseconds, using
 * `window.setTimeout`. When `ms` is omitted the delay is 1500 ms.
 */
const sleep = (ms?: number) => {
  const delay = ms ?? 1500;
  return new Promise((done) => {
    window.setTimeout(done, delay);
  });
};
|
||
|
||
/**
 * Extracts the value a field configuration asks for from a DOM element.
 *
 * Precedence: the attribute named by `config.attr`; otherwise `src` for
 * `<img>`; otherwise the resolved `href` for `<a>`; otherwise the element's
 * text content with line breaks removed and surrounding whitespace trimmed.
 *
 * @param el Element to read from; `null`/`undefined` means the lookup failed.
 * @param config Field configuration; only `attr` is read here.
 * @returns The extracted string, the placeholder "未找到" when `el` is missing,
 *          or `null` when an `<img>`/`<a>` lacks its `src`/`href` attribute.
 */
function extractValue(el: Element | null, config: PlatformFieldConfig): string | null {
  if (el == null) {
    return "未找到";
  }

  if (config.attr) {
    return (el.getAttribute(config.attr) ?? '').trim();
  }

  const tagName = el.tagName.toUpperCase();

  if (tagName === 'IMG') {
    return el.getAttribute('src');
  }

  if (tagName === 'A') {
    const href = el.getAttribute('href');
    // A missing/empty href, or one that already carries a scheme
    // (http:, https:, mailto:, javascript:, ...), is returned untouched.
    if (!href || /^[a-z][a-z\d+.-]*:/i.test(href)) {
      return href;
    }
    // Resolve against the current page so that path-relative ("detail/1")
    // and protocol-relative ("//host/x") links come out correct, which plain
    // `origin + href` concatenation does not guarantee.
    try {
      return new URL(href, window.location.href).href;
    } catch {
      // Unparseable href: hand back the raw attribute rather than throwing.
      return href;
    }
  }

  return (el.textContent ?? '').replace(/\n/g, '').trim();
}
|
||
|
||
/**
 * Auto-click helper.
 *
 * When the field has a `condition`, every selector in `condition.list` is
 * looked up inside `rootDom`; each matching element is clicked in turn, and
 * after each click the function waits `sleep(condition.time)`.
 * Fields without a `condition` are left alone.
 *
 * @param config Field configuration; only `condition` (`list`, `time`) is read.
 * @param rootDom Element whose descendants are searched for click targets.
 */
async function autoClick(config: PlatformFieldConfig, rootDom: Element): Promise<void> {
  const clickRule = config.condition;
  if (!clickRule) {
    return;
  }

  for (const selector of clickRule.list) {
    const matches = Array.from(rootDom.querySelectorAll<HTMLElement>(selector));
    for (const node of matches) {
      node.click();
      // Wait after each click before moving on to the next target.
      await sleep(clickRule.time);
    }
  }
}
|
||
|
||
/**
 * Recursively resolves a list of field configurations against `rootDom`,
 * covering plain fields, nested field groups, lists and tables.
 *
 * For every column: run `autoClick` on `rootDom`, then wait for the element
 * matching `item.className`. If it never appears, the label is set to the
 * placeholder '没找到该元素'. Otherwise:
 *  - no `type`: `autoClick` runs again scoped to the found element, then the
 *    value is either the recursive result for `item.keys` (when non-empty) or
 *    `extractValue` of the element;
 *  - `type === 1`: the value comes from `processList(item, rootDom)`;
 *  - `type === 2`: the value comes from `processTable(item, element)`;
 *  - any other `type`: the label is not written.
 *
 * @param columns Field configurations to resolve.
 * @param rootDom Element that scopes every lookup.
 * @returns An object keyed by each column's `label`.
 */
export async function processFields(columns: PlatformFieldConfig[], rootDom: Element) {
  const result = {} as any;

  for (const item of columns) {
    await autoClick(item, rootDom);

    const element = await waitForElement(rootDom, item.className);
    if (!element) {
      result[item.label] = '没找到该元素';
      continue;
    }

    if (!item.type) {
      // Second click pass, this time scoped to the element that was found.
      await autoClick(item, element);
      result[item.label] =
        item.keys && item.keys.length > 0
          ? await processFields(item.keys, element)
          : extractValue(element, item);
      continue;
    }

    switch (item.type) {
      case 1:
        result[item.label] = await processList(item, rootDom);
        break;
      case 2:
        result[item.label] = await processTable(item, element);
        break;
      default:
        break;
    }
  }

  return result;
}
|
||
|
||
/**
 * Collects list data, page by page.
 *
 * On each page every element under `rootDom` matching `config.className` is
 * passed to `processFields` together with `config.keys` (or an empty list).
 * Paging stops when there is no pagination config, when `maxPage` is reached,
 * when the next-page button cannot be found in `document`, or when that
 * button is disabled (by `disabledClass` if configured, otherwise by the
 * `disabled` attribute). Otherwise the button is clicked and the function
 * waits `sleep(pagination.delay)` before scanning again.
 *
 * @param config Field configuration; reads `className`, `keys`, `pagination`.
 * @param rootDom Parent node searched for list items.
 * @returns One result object per list item, across all visited pages.
 */
async function processList(config: PlatformFieldConfig, rootDom: ParentNode) {
  const collected = [];

  for (let page = 1; ; page += 1) {
    const rows = rootDom.querySelectorAll(config.className);
    for (const row of Array.from(rows)) {
      collected.push(await processFields(config.keys ?? [], row));
    }

    const paging = config.pagination;
    if (!paging) {
      console.log('未配置分页信息,抓取单页后结束。');
      break;
    }

    if (paging.maxPage && page >= paging.maxPage) {
      console.log('已达到配置的最大页数,停止。');
      break;
    }

    // The next-page button is looked up in the whole document, not in rootDom.
    const nextBtn = document.querySelector<HTMLElement>(paging.nextBtn);
    if (!nextBtn) {
      console.log('未找到下一页按钮,抓取结束。');
      break;
    }

    let isDisabled: boolean;
    if (paging.disabledClass) {
      isDisabled = nextBtn.classList.contains(paging.disabledClass);
    } else {
      isDisabled = nextBtn.hasAttribute('disabled');
    }
    if (isDisabled) {
      console.log('下一页按钮已禁用,抓取结束。');
      break;
    }

    nextBtn.click();
    await sleep(paging.delay);
  }

  return collected;
}
|
||
|
||
/**
 * Scrapes table rows according to the table configuration and keeps paging
 * while the pagination settings allow it.
 *
 * Each entry of `config.tableParts` selects one table fragment; its `tr`
 * rows (`${part.select} tr` under `rootDom`) are stored under the part's
 * name. Rows are scanned by index, using the row count of the first part,
 * and every entry of `config.keys` reads from the same-index row of the part
 * named by `keyItem.part`. Keys whose part is unknown, or whose part has no
 * row at that index, are skipped instead of raising a TypeError.
 *
 * Paging stops when there is no pagination config, when `maxPage` is reached,
 * when the next-page button cannot be found in `document`, or when that
 * button is disabled.
 *
 * @param config Field configuration; reads `tableParts`, `keys`, `pagination`.
 * @param rootDom Node searched for the table parts.
 * @returns One object per scanned row, across all visited pages; an empty
 *          array when no table parts are configured.
 */
async function processTable(config: PlatformFieldConfig, rootDom: ParentNode) {
  const allTableData: any[] = [];
  const tableParts = config.tableParts ?? [];
  const keys = config.keys ?? [];

  const firstPart = tableParts[0];
  if (!firstPart) {
    // Nothing to scan without at least one table part; bail out instead of
    // dereferencing a missing entry.
    console.warn('未配置 tableParts,无法抓取表格。');
    return allTableData;
  }

  let pageCount = 0;

  while (true) {
    pageCount += 1;

    // Query inside the loop so each pass reads the rows currently in the DOM.
    const rowsByPart = new Map<string, NodeListOf<Element>>();
    for (const part of tableParts) {
      rowsByPart.set(String(part.name), rootDom.querySelectorAll(`${part.select} tr`));
    }

    // The first part's row count drives the row-by-row scan.
    const rowCount = rowsByPart.get(String(firstPart.name))?.length ?? 0;

    for (let i = 0; i < rowCount; i++) {
      const rowData: any = {};

      // Each key reads from the i-th row of the part it is mapped to.
      for (const keyItem of keys) {
        const targetRowNode = rowsByPart.get(String(keyItem.part))?.[i];
        if (!targetRowNode) {
          continue;
        }

        if (keyItem.keys) {
          rowData[keyItem.label] = await processFields(keyItem.keys, targetRowNode);
        } else {
          rowData[keyItem.label] = extractValue(
            targetRowNode.querySelector(keyItem.className),
            keyItem,
          );
        }
      }

      allTableData.push(rowData);
    }

    const pagination = config.pagination;
    if (!pagination) {
      console.log("未配置分页信息,抓取单页后结束。");
      break;
    }

    if (pagination.maxPage && pageCount >= pagination.maxPage) {
      console.log("已达到配置的最大页数,停止。");
      break;
    }

    // The next-page button is looked up in the whole document, not in rootDom.
    const nextBtn = document.querySelector<HTMLElement>(pagination.nextBtn);
    if (!nextBtn) {
      console.log('未找到下一页按钮,抓取结束。');
      break;
    }

    const isDisabled = pagination.disabledClass
      ? nextBtn.classList.contains(pagination.disabledClass)
      : nextBtn.hasAttribute('disabled');
    if (isDisabled) {
      console.log('下一页按钮已禁用,抓取结束。');
      break;
    }

    nextBtn.click();
    await sleep(pagination.delay);
  }

  return allTableData;
}
|