1
This commit is contained in:
@@ -1,23 +1,4 @@
|
||||
import type { PlatformFieldConfig } from '@/types';
|
||||
|
||||
// DOM 抓取后的通用结果结构。
|
||||
export type DomScrapeResult = Record<string, unknown>;
|
||||
|
||||
|
||||
/**
 * Scrapes the configured fields from the DOM of the current page.
 *
 * Intended to be injected into the target page through
 * chrome.scripting.executeScript, so it runs against that page's `document`.
 *
 * NOTE(review): injected functions lose their module scope, yet this function
 * delegates to the module-level `processFields` — confirm that helper is
 * actually reachable wherever this function ends up being executed.
 *
 * @param fields Field configurations describing what to extract.
 * @returns Whatever `processFields` produces for `document.body`, or `null`
 *          when the document has no body element.
 */
export async function scrapeDomFields(fields: PlatformFieldConfig[]): Promise<DomScrapeResult | null> {
  const pageBody = document.body;

  // Without a <body> there is nothing to scrape.
  return pageBody ? processFields(fields, pageBody) : null;
}
|
||||
|
||||
import type {PlatformFieldConfig} from '@/types';
|
||||
|
||||
// 睡眠工具,给点击、翻页、异步渲染留出等待时间。
|
||||
/**
 * Returns a promise that resolves after a delay, used to wait after simulated
 * clicks before continuing.
 *
 * @param ms Delay in milliseconds; falls back to 1500 when null/undefined.
 */
const sleep = (ms?: number) => {
  const delay = ms ?? 1500;

  return new Promise((resolve) => {
    window.setTimeout(resolve, delay);
  });
};
|
||||
@@ -26,12 +7,11 @@ const sleep = (ms?: number) => new Promise((resolve) => window.setTimeout(resolv
|
||||
* 从元素中提取实际值,默认取文本,也支持 attr、图片 src、链接 href。
|
||||
*/
|
||||
function extractValue(el: Element | null, config: PlatformFieldConfig): string | null {
|
||||
if (!el) {
|
||||
return null;
|
||||
if (el == null) {
|
||||
return "未找到"
|
||||
}
|
||||
|
||||
if (config.attr) {
|
||||
return (el.getAttribute(config.attr) || '').trim();
|
||||
return (el.getAttribute(config.attr) || "").trim();
|
||||
}
|
||||
|
||||
const tagName = el.tagName.toUpperCase();
|
||||
@@ -49,19 +29,18 @@ function extractValue(el: Element | null, config: PlatformFieldConfig): string |
|
||||
}
|
||||
|
||||
/**
|
||||
* 自动点击
|
||||
* 根据字段 condition 配置在指定 DOM 范围内自动点击目标元素。
|
||||
*/
|
||||
async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Promise<void> {
|
||||
async function autoClick(config: PlatformFieldConfig, rootDom: Element): Promise<void> {
|
||||
if (!config.condition) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const selector of config.condition.list) {
|
||||
const targets = Array.from(rootDom.querySelectorAll<HTMLElement>(selector));
|
||||
|
||||
for (const condition of config.condition.list) {
|
||||
let targets: HTMLElement[] = Array.from(rootDom.querySelectorAll(condition))
|
||||
for (const target of targets) {
|
||||
target.click();
|
||||
await sleep(config.condition.time);
|
||||
await sleep(config?.condition.time);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -69,8 +48,8 @@ async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Prom
|
||||
/**
|
||||
* 递归处理字段配置,支持普通字段、嵌套 row、列表和表格。
|
||||
*/
|
||||
export async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
|
||||
const result: DomScrapeResult = {};
|
||||
export async function processFields(columns: PlatformFieldConfig[], rootDom: Element) {
|
||||
const result = {} as any;
|
||||
|
||||
for (const item of columns) {
|
||||
await autoClick(item, rootDom);
|
||||
@@ -107,10 +86,12 @@ export async function processFields(columns: PlatformFieldConfig[], rootDom: Par
|
||||
}
|
||||
|
||||
/**
|
||||
* 按列表配置抓取所有列表项,并按分页配置继续翻页。
|
||||
* 提取列表的数据
|
||||
* @param config 配置
|
||||
* @param rootDom 父节点
|
||||
*/
|
||||
async function processList(config: PlatformFieldConfig, rootDom: ParentNode): Promise<DomScrapeResult[]> {
|
||||
const allList: DomScrapeResult[] = [];
|
||||
async function processList(config: PlatformFieldConfig, rootDom: ParentNode) {
|
||||
const allList = [];
|
||||
let pageCount = 0;
|
||||
|
||||
while (true) {
|
||||
@@ -159,54 +140,49 @@ async function processList(config: PlatformFieldConfig, rootDom: ParentNode): Pr
|
||||
/**
|
||||
* 按表格配置抓取表格行数据,并按分页配置继续翻页。
|
||||
*/
|
||||
async function processTable(config: PlatformFieldConfig, rootDom: ParentNode): Promise<DomScrapeResult[]> {
|
||||
const allTableData: DomScrapeResult[] = [];
|
||||
async function processTable(config: PlatformFieldConfig, rootDom: ParentNode) {
|
||||
const allTableData: any[] = [];
|
||||
let pageCount = 0;
|
||||
|
||||
while (true) {
|
||||
pageCount += 1;
|
||||
|
||||
const partsNodes: Record<string, Element[]> = {};
|
||||
const partsNodes: any = {};
|
||||
|
||||
for (const part of config.tableParts ?? []) {
|
||||
const partKey = part.name ?? part.label;
|
||||
const partSelector = part.select ?? part.className;
|
||||
const rowSelector = part.rowSelector ?? `${partSelector} tr`;
|
||||
partsNodes[partKey] = Array.from(rootDom.querySelectorAll(rowSelector));
|
||||
}
|
||||
config.tableParts!.forEach(part => {
|
||||
partsNodes[part.name as any] = rootDom.querySelectorAll(`${part.select} tr`);
|
||||
});
|
||||
|
||||
const firstPart = config.tableParts?.[0];
|
||||
const firstPartKey = firstPart ? firstPart.name ?? firstPart.label : '';
|
||||
const rowCount = partsNodes[firstPartKey]?.length || 0;
|
||||
// //以第一个part的行数为准,进行横向扫描
|
||||
const rowCount = partsNodes[config.tableParts![0].name!]?.length || 0
|
||||
|
||||
for (let index = 0; index < rowCount; index += 1) {
|
||||
const rowData: DomScrapeResult = {};
|
||||
|
||||
for (const keyItem of config.keys ?? []) {
|
||||
const partKey = keyItem.part ?? firstPartKey;
|
||||
const targetRowNode = partsNodes[partKey]?.[index];
|
||||
for (let i = 0; i < rowCount; i++) {
|
||||
const rowData: any = {};
|
||||
|
||||
if (!targetRowNode) {
|
||||
continue;
|
||||
}
|
||||
//遍历keys,根据part映射,取对应的里面找
|
||||
for (const keyItem of config.keys!) {
|
||||
const targetRowNode = partsNodes[keyItem.part!][i];
|
||||
|
||||
if (keyItem.keys) {
|
||||
rowData[keyItem.label] = await processFields(keyItem.keys, targetRowNode);
|
||||
} else {
|
||||
rowData[keyItem.label] = extractValue(targetRowNode.querySelector(keyItem.className), keyItem);
|
||||
if (targetRowNode) {
|
||||
//提取值
|
||||
if (keyItem.keys) {
|
||||
rowData[keyItem.label] = await processFields(keyItem.keys, targetRowNode)
|
||||
} else {
|
||||
rowData[keyItem.label] = extractValue(targetRowNode.querySelector(keyItem.className), keyItem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
allTableData.push(rowData);
|
||||
}
|
||||
|
||||
if (!config.pagination) {
|
||||
console.log('未配置分页信息,抓取单页后结束。');
|
||||
console.log("未配置分页信息,抓取单页后结束。");
|
||||
break;
|
||||
}
|
||||
|
||||
if (config.pagination.maxPage && pageCount >= config.pagination.maxPage) {
|
||||
console.log('已达到配置的最大页数,停止。');
|
||||
console.log("已达到配置的最大页数,停止。");
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -231,4 +207,4 @@ async function processTable(config: PlatformFieldConfig, rootDom: ParentNode): P
|
||||
}
|
||||
|
||||
return allTableData;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user