1
This commit is contained in:
@@ -0,0 +1,226 @@
|
||||
import type { PlatformFieldConfig } from '@/types';
|
||||
|
||||
/** Generic result structure produced by a DOM scrape: values keyed by string. */
export type DomScrapeResult = Record<string, unknown>;
|
||||
|
||||
/**
 * Default wait time in milliseconds, used after a click or a page turn to
 * give the page time to render.
 */
const DEFAULT_DELAY = 1500;
|
||||
|
||||
/**
 * Entry point: scrapes the configured fields from the current document.
 *
 * The original author's note states that this function is injected into the
 * target page through chrome.scripting.executeScript, and that its helpers
 * are therefore written inside the function body.
 * NOTE(review): in this file the helpers (`processFields` and friends) are
 * declared at module scope, not inside this function. Confirm they are
 * actually reachable in the context where this function runs.
 *
 * @param fields - Field configurations to evaluate against `document.body`.
 * @returns The scraped values keyed by field label, or `null` when the
 *   document has no body element.
 */
export async function scrapeDomFields(fields: PlatformFieldConfig[]): Promise<DomScrapeResult | null> {
  const root = document.body;
  return root ? processFields(fields, root) : null;
}
|
||||
|
||||
|
||||
/**
 * Sleep helper: resolves after `ms` milliseconds (DEFAULT_DELAY when `ms` is
 * omitted), leaving time for clicks, page turns and async rendering.
 */
const sleep = (ms?: number) =>
  new Promise((wake) => {
    const waitMs = ms ?? DEFAULT_DELAY;
    window.setTimeout(wake, waitMs);
  });
|
||||
|
||||
/**
 * Extracts the value a field configuration asks for from a matched element.
 *
 * Resolution order:
 * 1. `config.attr` is set: the trimmed value of that attribute (`''` when the
 *    attribute is absent).
 * 2. `<img>`: the raw `src` attribute.
 * 3. `<a>`: the `href` attribute resolved to an absolute URL.
 * 4. Anything else: the text content with newlines removed, then trimmed.
 *
 * @param el - Element to read from; `null` yields `null`.
 * @param config - Field configuration; only `attr` is consulted here.
 * @returns The extracted string, or `null` when there is no element or the
 *   `src`/`href` attribute is missing.
 */
function extractValue(el: Element | null, config: PlatformFieldConfig): string | null {
  if (!el) {
    return null;
  }

  if (config.attr) {
    return (el.getAttribute(config.attr) ?? '').trim();
  }

  switch (el.tagName.toUpperCase()) {
    case 'IMG':
      return el.getAttribute('src');

    case 'A': {
      const href = el.getAttribute('href');

      // Missing/empty hrefs and already-absolute http(s) URLs pass through untouched.
      if (!href || /^https?:\/\//i.test(href)) {
        return href;
      }

      // Resolve against the current page URL rather than concatenating with the
      // origin: concatenation corrupts path-relative links ("page.html"),
      // protocol-relative links ("//cdn.example.com/x") and non-http schemes
      // ("mailto:", "javascript:").
      try {
        return new URL(href, window.location.href).href;
      } catch {
        // Unparseable href: hand back the raw attribute instead of failing the scrape.
        return href;
      }
    }

    default:
      return (el.textContent ?? '').replace(/\n/g, '').trim();
  }
}
|
||||
|
||||
/**
 * Clicks the elements named by a field's `condition` configuration.
 *
 * For each selector in `condition.list`, in order, every element matching it
 * inside `rootDom` is clicked, with a pause of `condition.time` after each
 * click. Does nothing when the field has no `condition`.
 *
 * @param config - Field configuration whose `condition` drives the clicks.
 * @param rootDom - DOM scope the selectors are resolved against.
 */
async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Promise<void> {
  const { condition } = config;

  if (condition) {
    for (const selector of condition.list) {
      // Matches are collected per selector, after the clicks for earlier selectors have run.
      const clickables = Array.from(rootDom.querySelectorAll<HTMLElement>(selector));

      for (const clickable of clickables) {
        clickable.click();
        await sleep(condition.time);
      }
    }
  }
}
|
||||
|
||||
/**
 * Walks a list of field configurations and collects the scraped values keyed
 * by each field's `label`. Supports plain fields, nested groups (`keys`),
 * lists (`type === 1`) and tables (`type === 2`), recursing where needed.
 *
 * A field whose selector matches nothing is recorded with a placeholder
 * string. A field whose `type` is truthy but neither 1 nor 2 produces no entry.
 *
 * @param columns - Field configurations to evaluate, in order.
 * @param rootDom - DOM scope the field selectors are resolved against.
 * @returns Object mapping field labels to their scraped values.
 */
async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
  const output: DomScrapeResult = {};

  for (const field of columns) {
    // Run the field's configured clicks in the current scope before looking the element up.
    await autoClick(field, rootDom);

    const matched = rootDom.querySelector(field.className);

    if (matched === null) {
      output[field.label] = '没找到该元素';
      continue;
    }

    if (!field.type) {
      // Second click pass, now scoped to the matched element.
      // NOTE(review): targets inside `matched` were already clicked by the
      // root-scoped pass above, so they are clicked twice — confirm intended.
      await autoClick(field, matched);

      const children = field.keys;
      output[field.label] =
        children && children.length > 0 ? await processFields(children, matched) : extractValue(matched, field);
    } else if (field.type === 1) {
      // Lists re-query every item from the current scope, not from the first match.
      output[field.label] = await processList(field, rootDom);
    } else if (field.type === 2) {
      // Tables are scraped relative to the matched element.
      output[field.label] = await processTable(field, matched);
    }
  }

  return output;
}
|
||||
|
||||
/**
 * Scrapes every list item matching `config.className` inside `rootDom`,
 * evaluating `config.keys` against each item, then follows the pagination
 * configuration to continue on subsequent pages.
 *
 * Paging stops when no pagination is configured, when `maxPage` is reached,
 * when the next-page button cannot be found, or when that button is disabled
 * (it carries `disabledClass` if one is configured, otherwise a `disabled`
 * attribute). The next-page button is looked up on `document`, not on
 * `rootDom`.
 *
 * @param config - List field configuration (`className`, `keys`, `pagination`).
 * @param rootDom - DOM scope the list items are queried from.
 * @returns One result object per list item, across all visited pages.
 */
async function processList(config: PlatformFieldConfig, rootDom: ParentNode): Promise<DomScrapeResult[]> {
  const collected: DomScrapeResult[] = [];

  // `page` is the 1-based number of the page currently being scraped.
  for (let page = 1; ; page += 1) {
    const itemNodes = Array.from(rootDom.querySelectorAll(config.className));

    for (const itemNode of itemNodes) {
      collected.push(await processFields(config.keys ?? [], itemNode));
    }

    const paging = config.pagination;

    if (!paging) {
      console.log('未配置分页信息,抓取单页后结束。');
      break;
    }

    if (paging.maxPage && page >= paging.maxPage) {
      console.log('已达到配置的最大页数,停止。');
      break;
    }

    const nextButton = document.querySelector<HTMLElement>(paging.nextBtn);

    if (!nextButton) {
      console.log('未找到下一页按钮,抓取结束。');
      break;
    }

    let nextDisabled: boolean;
    if (paging.disabledClass) {
      nextDisabled = nextButton.classList.contains(paging.disabledClass);
    } else {
      nextDisabled = nextButton.hasAttribute('disabled');
    }

    if (nextDisabled) {
      console.log('下一页按钮已禁用,抓取结束。');
      break;
    }

    nextButton.click();
    // Give the next page time to render before scraping it.
    await sleep(paging.delay);
  }

  return collected;
}
|
||||
|
||||
/**
 * Scrapes table rows according to a table field configuration, then follows
 * the pagination configuration to continue on subsequent pages.
 *
 * Each entry of `config.tableParts` names a group of rows: its key is
 * `name ?? label`, and its rows are found with `rowSelector`, defaulting to
 * `"<select ?? className> tr"`. The number of rows in the FIRST part decides
 * how many rows are emitted per page. For each row index, every entry of
 * `config.keys` reads from the row at that index in its `part` (defaulting to
 * the first part); entries whose part has no row at that index are skipped.
 *
 * A key with a non-empty `keys` array is treated as a nested group and handed
 * to `processFields`; any other key is read directly with `extractValue`.
 * An empty `keys` array counts as "no nested keys", matching `processFields`.
 *
 * Paging stops when no pagination is configured, when `maxPage` is reached,
 * when the next-page button cannot be found, or when that button is disabled.
 * The next-page button is looked up on `document`, not on `rootDom`.
 *
 * @param config - Table field configuration (`tableParts`, `keys`, `pagination`).
 * @param rootDom - DOM scope the row selectors are resolved against.
 * @returns One result object per table row, across all visited pages.
 */
async function processTable(config: PlatformFieldConfig, rootDom: ParentNode): Promise<DomScrapeResult[]> {
  const allTableData: DomScrapeResult[] = [];

  // These do not change between pages, so resolve them once.
  const parts = config.tableParts ?? [];
  const columns = config.keys ?? [];
  const firstPart = parts[0];
  const firstPartKey = firstPart ? firstPart.name ?? firstPart.label : '';

  let pageCount = 0;

  while (true) {
    pageCount += 1;

    // A Map keeps part names such as "constructor" from colliding with Object.prototype.
    const rowsByPart = new Map<string, Element[]>();

    for (const part of parts) {
      const partKey = part.name ?? part.label;
      const partSelector = part.select ?? part.className;
      const rowSelector = part.rowSelector ?? `${partSelector} tr`;
      rowsByPart.set(partKey, Array.from(rootDom.querySelectorAll(rowSelector)));
    }

    const rowCount = rowsByPart.get(firstPartKey)?.length ?? 0;

    for (let index = 0; index < rowCount; index += 1) {
      const rowData: DomScrapeResult = {};

      for (const keyItem of columns) {
        const targetRowNode = rowsByPart.get(keyItem.part ?? firstPartKey)?.[index];

        if (!targetRowNode) {
          continue;
        }

        // Only a non-empty `keys` array means "nested group"; an empty array
        // previously produced `{}` instead of the cell value.
        if (keyItem.keys && keyItem.keys.length > 0) {
          rowData[keyItem.label] = await processFields(keyItem.keys, targetRowNode);
        } else {
          rowData[keyItem.label] = extractValue(targetRowNode.querySelector(keyItem.className), keyItem);
        }
      }

      allTableData.push(rowData);
    }

    if (!config.pagination) {
      console.log('未配置分页信息,抓取单页后结束。');
      break;
    }

    if (config.pagination.maxPage && pageCount >= config.pagination.maxPage) {
      console.log('已达到配置的最大页数,停止。');
      break;
    }

    const nextBtn = document.querySelector<HTMLElement>(config.pagination.nextBtn);

    if (!nextBtn) {
      console.log('未找到下一页按钮,抓取结束。');
      break;
    }

    const isDisabled = config.pagination.disabledClass
      ? nextBtn.classList.contains(config.pagination.disabledClass)
      : nextBtn.hasAttribute('disabled');

    if (isDisabled) {
      console.log('下一页按钮已禁用,抓取结束。');
      break;
    }

    nextBtn.click();
    // Give the next page time to render before scraping it.
    await sleep(config.pagination.delay);
  }

  return allTableData;
}
|
||||
Reference in New Issue
Block a user