Files
store_ai_extension/src/background/domScraper.ts
2026-05-07 09:38:05 +08:00

229 lines
6.5 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import type {PlatformFieldConfig} from '@/types';
/**
 * Polls `rootDom` for the first element matching `selector`, retrying so that
 * asynchronously rendered content has time to appear.
 *
 * Performs up to 5 lookups and pauses 500 ms between consecutive lookups.
 *
 * @param rootDom - Node whose descendants are searched.
 * @param selector - CSS selector handed to `querySelector`.
 * @returns The first matching element, or `null` when no lookup found one.
 */
async function waitForElement(rootDom: ParentNode, selector: string) {
  const maxAttempts = 5;
  const retryDelayMs = 500;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const element = rootDom.querySelector(selector);
    if (element) {
      return element;
    }
    // Wait only when another lookup follows; pausing after the final attempt
    // would merely delay the `null` result.
    if (attempt < maxAttempts) {
      await sleep(retryDelayMs);
    }
  }
  return null;
}
// Delay helper: leaves time for clicks, page turns, and asynchronous rendering
// to settle. Waits `ms` milliseconds, or 1500 ms when `ms` is omitted.
const sleep = (ms?: number) => {
  const delay = ms ?? 1500;
  return new Promise((wake) => {
    window.setTimeout(wake, delay);
  });
};
/**
 * Extracts the value of a scraped element.
 *
 * Resolution order:
 * 1. `el` is null/undefined: the placeholder string "未找到".
 * 2. `config.attr` is set: that attribute's trimmed value ("" when absent).
 * 3. `<img>`: the raw `src` attribute.
 * 4. `<a>`: the `href` attribute; values not starting with "http" are resolved
 *    to an absolute URL against the current page.
 * 5. Anything else: the text content with newlines removed, trimmed.
 *
 * @param el - Element to read, or null when the lookup failed.
 * @param config - Field configuration; only `attr` is read here.
 * @returns The extracted string, or `null` when an `<img>`/`<a>` lacks the
 *   relevant attribute.
 */
function extractValue(el: Element | null, config: PlatformFieldConfig): string | null {
  if (el == null) {
    return "未找到";
  }
  if (config.attr) {
    return (el.getAttribute(config.attr) || "").trim();
  }
  const tagName = el.tagName.toUpperCase();
  if (tagName === 'IMG') {
    return el.getAttribute('src');
  }
  if (tagName === 'A') {
    const href = el.getAttribute('href');
    // Missing/empty hrefs and already-absolute http(s) links pass through untouched.
    if (!href || href.startsWith('http')) {
      return href;
    }
    try {
      // Proper URL resolution handles "/path", "path", "../path", "//host/path",
      // "#hash" and non-http schemes; naive `origin + href` concatenation only
      // worked for root-relative paths.
      return new URL(href, window.location.href).href;
    } catch {
      // Unparseable href: fall back to simple origin-prefixing.
      return window.location.origin + href;
    }
  }
  return (el.textContent || '').replace(/\n/g, '').trim();
}
/**
 * Auto-click helper.
 *
 * When the field has a `condition`, walks `condition.list` in order; for each
 * selector it clicks every match found under `rootDom`, pausing
 * `condition.time` milliseconds after each click. Fields without a
 * `condition` are a no-op.
 *
 * @param config - Field configuration carrying the optional `condition`.
 * @param rootDom - Element whose descendants are searched for click targets.
 */
async function autoClick(config: PlatformFieldConfig, rootDom: Element): Promise<void> {
  const clickRule = config.condition;
  if (!clickRule) {
    return;
  }
  for (const selector of clickRule.list) {
    // Snapshot the matches first so clicks that mutate the DOM do not affect
    // which nodes of this selector get clicked.
    const matches = Array.from(rootDom.querySelectorAll<HTMLElement>(selector));
    for (const node of matches) {
      node.click();
      await sleep(clickRule.time);
    }
  }
}
/**
 * Recursively evaluates field configurations against `rootDom` and returns an
 * object keyed by each field's `label`.
 *
 * For every field: run its auto-click rule within `rootDom`, then wait for the
 * element matching `className`. If it never appears the label maps to
 * '没找到该元素'. Otherwise the value depends on `type`:
 * - falsy `type`: auto-click again within the matched element, then either
 *   recurse into `keys` (nested row) or extract a single value;
 * - `type === 1`: list extraction over `rootDom`;
 * - `type === 2`: table extraction inside the matched element;
 * - any other `type`: the label is left unset.
 *
 * @param columns - Field configurations to evaluate, in order.
 * @param rootDom - Element that scopes every lookup.
 * @returns Object mapping field labels to their extracted values.
 */
export async function processFields(columns: PlatformFieldConfig[], rootDom: Element) {
  const collected = {} as any;
  for (const field of columns) {
    await autoClick(field, rootDom);
    const matched = await waitForElement(rootDom, field.className);
    if (!matched) {
      collected[field.label] = '没找到该元素';
      continue;
    }

    if (field.type === 1) {
      // Lists are matched against the parent scope, not the first hit.
      collected[field.label] = await processList(field, rootDom);
    } else if (field.type === 2) {
      collected[field.label] = await processTable(field, matched);
    } else if (!field.type) {
      await autoClick(field, matched);
      if (field.keys && field.keys.length > 0) {
        collected[field.label] = await processFields(field.keys, matched);
      } else {
        collected[field.label] = extractValue(matched, field);
      }
    }
  }
  return collected;
}
/**
 * Extracts list data, following pagination when it is configured.
 *
 * On each page every element under `rootDom` matching `config.className` is
 * run through `processFields` with `config.keys` (or an empty list). Paging
 * stops when: no pagination is configured, `maxPage` is reached, the next
 * button (looked up on the whole document) is missing, or that button is
 * disabled (by `disabledClass` if configured, otherwise by a `disabled`
 * attribute). Otherwise the button is clicked and the function waits
 * `pagination.delay` before reading the next page.
 *
 * @param config - List field configuration.
 * @param rootDom - Parent node that scopes the item lookup.
 * @returns The extracted items of all visited pages, in order.
 */
async function processList(config: PlatformFieldConfig, rootDom: ParentNode) {
  const collected: any[] = [];
  for (let page = 1; ; page++) {
    const rows = Array.from(rootDom.querySelectorAll(config.className));
    for (const row of rows) {
      collected.push(await processFields(config.keys ?? [], row));
    }

    const paging = config.pagination;
    if (!paging) {
      console.log('未配置分页信息,抓取单页后结束。');
      return collected;
    }
    if (paging.maxPage && page >= paging.maxPage) {
      console.log('已达到配置的最大页数,停止。');
      return collected;
    }

    const nextBtn = document.querySelector<HTMLElement>(paging.nextBtn);
    if (!nextBtn) {
      console.log('未找到下一页按钮,抓取结束。');
      return collected;
    }

    let disabled: boolean;
    if (paging.disabledClass) {
      disabled = nextBtn.classList.contains(paging.disabledClass);
    } else {
      disabled = nextBtn.hasAttribute('disabled');
    }
    if (disabled) {
      console.log('下一页按钮已禁用,抓取结束。');
      return collected;
    }

    nextBtn.click();
    await sleep(paging.delay);
  }
}
/**
 * Scrapes table rows according to the table configuration and keeps turning
 * pages while pagination allows it.
 *
 * Each entry of `config.tableParts` names a region whose `tr` rows are
 * collected via `${part.select} tr`. Rows are scanned horizontally: the row
 * count of the FIRST part decides how many rows are produced, and for row `i`
 * every key reads from the `i`-th row of the part it references (`key.part`).
 * A key with nested `keys` is processed recursively; otherwise its value is
 * extracted from the element matching `key.className` inside that row.
 *
 * A key whose `part` matches no configured table part (or whose part has
 * fewer rows) is skipped for that row instead of throwing. Missing `keys`
 * yields empty row objects; missing or empty `tableParts` yields an empty
 * result.
 *
 * Pagination stops when: none is configured, `maxPage` is reached, the next
 * button (looked up on the whole document) is missing, or it is disabled (by
 * `disabledClass` if configured, otherwise by a `disabled` attribute).
 *
 * @param config - Table field configuration.
 * @param rootDom - Node that scopes the row lookups.
 * @returns One object per scanned row across all visited pages.
 */
async function processTable(config: PlatformFieldConfig, rootDom: ParentNode) {
  const allTableData: any[] = [];
  const tableParts = config.tableParts ?? [];
  const keys = config.keys ?? [];
  const firstPart = tableParts[0];
  if (!firstPart) {
    // Without any table part there are no rows to scan on any page.
    return allTableData;
  }

  let pageCount = 0;
  while (true) {
    pageCount += 1;

    // Re-query every page: the rows are replaced after each page turn.
    const partsNodes: Record<string, NodeListOf<Element> | undefined> = {};
    for (const part of tableParts) {
      partsNodes[String(part.name)] = rootDom.querySelectorAll(`${part.select} tr`);
    }

    // The first part's row count drives the horizontal scan.
    const rowCount = partsNodes[String(firstPart.name)]?.length || 0;
    for (let i = 0; i < rowCount; i++) {
      const rowData: any = {};
      for (const keyItem of keys) {
        // Unknown part name or shorter part: nothing to read for this key.
        const targetRowNode = partsNodes[String(keyItem.part)]?.[i];
        if (!targetRowNode) {
          continue;
        }
        if (keyItem.keys) {
          rowData[keyItem.label] = await processFields(keyItem.keys, targetRowNode);
        } else {
          rowData[keyItem.label] = extractValue(targetRowNode.querySelector(keyItem.className), keyItem);
        }
      }
      allTableData.push(rowData);
    }

    if (!config.pagination) {
      console.log("未配置分页信息,抓取单页后结束。");
      break;
    }
    if (config.pagination.maxPage && pageCount >= config.pagination.maxPage) {
      console.log("已达到配置的最大页数,停止。");
      break;
    }
    const nextBtn = document.querySelector<HTMLElement>(config.pagination.nextBtn);
    if (!nextBtn) {
      console.log('未找到下一页按钮,抓取结束。');
      break;
    }
    const isDisabled = config.pagination.disabledClass
      ? nextBtn.classList.contains(config.pagination.disabledClass)
      : nextBtn.hasAttribute('disabled');
    if (isDisabled) {
      console.log('下一页按钮已禁用,抓取结束。');
      break;
    }
    nextBtn.click();
    await sleep(config.pagination.delay);
  }
  return allTableData;
}