269 lines
8.8 KiB
JavaScript
269 lines
8.8 KiB
JavaScript
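/**
 * Field configuration reference
 *
 * type: 0 = plain element (default), 1 = list, 2 = table (with pagination)
 * condition: {
 *   list: []    selectors to click before extracting
 *   time: 2000  how long to wait after each click (ms)
 * }
 * keys: child fields; when type is 0 they are plain key/value fields,
 *       otherwise they describe the fields of each array item
 * tableParts: table only; supports a table that is split across several
 *             <table> elements or segments
 * pagination: pagination settings
 */
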
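/**
 * Data types
 * 1. Plain text or image
 * 2. List
 * 3. Child elements under a row layout (a combination of 1 and 2)
 * 4. List (NOTE: repeats item 2 in the original note; possibly "table" was meant, to be confirmed)
 */
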
(async function () {
|
||
// Extraction config: one entry per value (or collection) to scrape from the page.
// Declared with `const` because the binding is never reassigned.
const column = [
  {
    label: "低星评论", // low-star reviews
    className: ".border-solid.rounded",
    // Selectors clicked, in order, before this entry is extracted;
    // `time` is the pause after each click in milliseconds.
    condition: {
      list: [
        ".flex.items-center.mt-6 div:nth-child(3)",
        ".eds-react-checkbox-group label:nth-child(2)",
        ".eds-react-checkbox-group label:nth-child(3)",
        ".eds-react-checkbox-group label:nth-child(4)",
      ],
      time: 200,
    },
    type: 1, // 1 = list
    // Fields read from every element that matches `className`.
    keys: [
      {
        label: "用户", // user
        className: ".flex.items-center.justify-start .ml-2",
      },
      {
        label: "订单编号", // order number
        className: ".underline.px-1",
      },
      {
        label: "商品名称", // product name
        className: ".min-w-0.font-medium.break-all",
      },
      {
        label: "规格", // variation / spec
        className: ".min-w-0.font-medium.break-all + div",
      },
      {
        label: "评价内容", // review content
        className: ".min-w-0.overflow-hidden",
        condition: {
          list: ["span.cursor-pointer"],
          time: 200,
        },
      },
    ],
    pagination: {
      nextBtn: ".eds-react-pagination-pager__button-next",
      maxPage: 2, // maximum number of pages to crawl
      delay: 2000, // wait after turning the page, in milliseconds
    },
  },
];

/**
 * Pause for a number of milliseconds.
 *
 * Falls back to 1500 ms only when `ms` is null/undefined, so an explicitly
 * configured delay of 0 is honoured instead of being replaced by the default.
 *
 * @param {number} [ms] - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms ?? 1500));

/**
 * Recursively extract the configured fields from a DOM subtree.
 *
 * For every field config the parent is first passed to `autoClick`, then the
 * first element matching `className` is looked up inside `dom`:
 *  - no match: the label is stored with a "not found" marker string;
 *  - falsy `type` (plain element): `autoClick` runs again on the element, then
 *    either the nested `keys` are processed recursively or a single value is
 *    read with `extractValue`;
 *  - `type` 1: delegated to `processList`, which receives the parent `dom`;
 *  - `type` 2: delegated to `processTable`, which receives the matched element;
 *  - any other `type`: nothing is stored for that label.
 *
 * @param {Array<object>} columns - Field configs to extract.
 * @param {Element} dom - Parent DOM node the selectors are resolved against.
 * @returns {Promise<object|null>} Extracted values keyed by label, or null when `dom` is falsy.
 */
async function process(columns, dom) {
  if (!dom) {
    return null;
  }

  const extracted = {};

  for (const field of columns) {
    // Fire the field's pre-extraction clicks against the parent node.
    await autoClick(field, dom);

    const node = dom.querySelector(field.className);
    if (!node) {
      extracted[field.label] = "没找到该元素";
      continue;
    }

    if (field.type) {
      // Loose equality is kept on purpose so a config value such as "1" still matches.
      if (field.type == 1) {
        extracted[field.label] = await processList(field, dom);
      } else if (field.type == 2) {
        extracted[field.label] = await processTable(field, node);
      }
      continue;
    }

    // Plain element: click again, this time scoped to the matched element.
    await autoClick(field, node);

    const hasChildren = field.keys && field.keys.length > 0;
    extracted[field.label] = hasChildren
      ? await process(field.keys, node) // row layout: recurse into the child fields
      : extractValue(node, field); // leaf: read the value directly
  }

  return extracted;
}

/**
 * Click every element matched by a field's `condition.list` selectors.
 *
 * Selectors are processed in order; every match inside `rootDom` is clicked
 * and followed by `sleep(condition.time)`. Nothing happens when the config has
 * no `condition`, when `condition.list` is missing, or when `rootDom` is
 * null/undefined. A single selector string is accepted in place of an array.
 *
 * @param {object} config - Field config; only `config.condition` is read.
 * @param {Element|Document} rootDom - Node the selectors are resolved against.
 * @returns {Promise<void>}
 */
async function autoClick(config, rootDom) {
  const condition = config?.condition;
  if (!condition || !rootDom) {
    return;
  }

  // Normalize `list`: missing means no selectors; a bare string is treated as one
  // selector (iterating it directly would walk it character by character).
  let selectors = condition.list ?? [];
  if (typeof selectors === "string") {
    selectors = [selectors];
  }

  for (const selector of selectors) {
    const targets = rootDom.querySelectorAll(selector);
    for (const target of targets) {
      target.click();
      await sleep(condition.time);
    }
  }
}

/**
 * Read a single value from an element.
 *
 * Resolution order:
 *  1. missing element: the "not found" marker string;
 *  2. `config.attr` set: that attribute, trimmed ("" when absent);
 *  3. `<img>`: its `src` attribute;
 *  4. `<a>`: its `href`; values not starting with "http" are resolved against
 *     the current page URL;
 *  5. anything else: the element text with newlines removed and trimmed.
 *
 * @param {Element|null} el - Element to read from.
 * @param {object} config - Field config; only `config.attr` is read.
 * @returns {string|null} The extracted value; null when an `<img>`/`<a>`
 *   lacks the attribute being read.
 */
function extractValue(el, config) {
  // The null check must come first: callers may pass the raw result of
  // querySelector, and the attribute branch below dereferences `el`.
  if (el == null) {
    return "未找到";
  }

  // Explicit attribute requested (e.g. class, href, src, data-*).
  if (config?.attr) {
    return (el.getAttribute(config.attr) || "").trim();
  }

  const { tagName } = el;
  if (tagName === "IMG") {
    return el.getAttribute("src");
  }

  if (tagName === "A") {
    const href = el.getAttribute("href");
    if (!href || href.startsWith("http")) {
      return href;
    }
    // Resolve through URL so "//host/x", "#hash" and "path/x" come out right;
    // plain concatenation onto the origin only works for "/path" values.
    try {
      return new URL(href, window.location.href).href;
    } catch {
      return window.location.origin + href;
    }
  }

  // Default: cleaned text. Elements without `innerText` fall back to `textContent`.
  const text = el.innerText ?? el.textContent ?? "";
  return text.replace(/\n/g, "").trim();
}

/**
 * Extract every item of a (possibly paginated) list.
 *
 * On each page, all elements matching `config.className` inside `rootDom` are
 * passed through `process` with `config.keys`. Paging stops when:
 *  1. no `pagination` is configured (a single page is scraped);
 *  2. `pagination.maxPage` is set and has been reached;
 *  3. the next-page button cannot be found in the document;
 *  4. the next-page button is disabled, i.e. it carries
 *     `pagination.disabledClass` when that is configured, or else its
 *     `disabled` property is truthy. Without this check the last page would be
 *     scraped repeatedly, and forever when `maxPage` is unset.
 *
 * @param {object} config - List config (`className`, `keys`, optional `pagination`).
 * @param {Element|Document} rootDom - Node the list items are searched in.
 * @returns {Promise<Array<object|null>>} One `process` result per list item, across all pages.
 */
async function processList(config, rootDom) {
  const allList = [];
  let pageCount = 0;

  while (true) {
    pageCount++;

    const elements = Array.from(rootDom.querySelectorAll(config.className));
    for (const element of elements) {
      allList.push(await process(config.keys, element));
    }

    const { pagination } = config;

    // 1. No pagination configured: one page only.
    if (!pagination) {
      console.log("未配置分页信息,抓取单页后结束。");
      break;
    }

    // 2. Page limit reached.
    if (pagination.maxPage && pageCount >= pagination.maxPage) {
      console.log("已达到配置的最大页数,停止。");
      break;
    }

    // 3. The pager is looked up in the whole document, not inside rootDom.
    const nextBtn = document.querySelector(pagination.nextBtn);
    if (!nextBtn) {
      console.log("未找到下一页按钮,抓取结束。");
      break;
    }

    // 4. A disabled button means this was the last page.
    const isDisabled = pagination.disabledClass
      ? nextBtn.classList.contains(pagination.disabledClass)
      : nextBtn.disabled;
    if (isDisabled) {
      console.log("下一页按钮已禁用,抓取结束。");
      break;
    }

    nextBtn.click();
    await sleep(pagination.delay);
  }

  return allList;
}

/**
 * Extract the rows of a (possibly paginated) table.
 *
 * A table may be split across several parts (`config.tableParts`, each with a
 * `name` and a `select` selector). On every page the `tr` rows of each part
 * are collected; the row count of the first part drives the scan, and each key
 * reads its value from the row with the same index in the part named by
 * `key.part`.
 *
 * Robustness rules:
 *  - missing or empty `tableParts`: a warning is logged and `[]` is returned;
 *  - a key without `part` reads from the first part;
 *  - a key naming an unknown part, or a part with no row at that index, is
 *    left out of the row object.
 *
 * Paging stops when no `pagination` is configured, `maxPage` is reached, the
 * next-page button is not found, or the button is disabled (via
 * `pagination.disabledClass` when configured, otherwise its `disabled` property).
 *
 * @param {object} config - Table config (`tableParts`, `keys`, optional `pagination`).
 * @param {Element|Document} rootDom - Node the table parts are searched in.
 * @returns {Promise<Array<object>>} One object per table row, across all pages.
 */
async function processTable(config, rootDom) {
  const allTableData = [];

  const tableParts = Array.isArray(config.tableParts) ? config.tableParts : [];
  if (tableParts.length === 0) {
    console.warn("未配置 tableParts,无法提取表格数据。");
    return allTableData;
  }
  const defaultPart = tableParts[0].name;

  let pageCount = 0;

  while (true) {
    pageCount++;

    // Collect the rows of every table part for the current page.
    const partsNodes = {};
    for (const part of tableParts) {
      partsNodes[part.name] = rootDom.querySelectorAll(`${part.select} tr`);
    }

    // The first part's row count drives the scan across all parts.
    const rowCount = partsNodes[defaultPart]?.length || 0;
    for (let i = 0; i < rowCount; i++) {
      const rowData = {};

      for (const keyItem of config.keys) {
        // An unknown part yields no row, so the key is skipped rather than
        // crashing the whole extraction.
        const targetRowNode = partsNodes[keyItem.part ?? defaultPart]?.[i];
        if (!targetRowNode) {
          continue;
        }

        if (keyItem.keys) {
          rowData[keyItem.label] = await process(keyItem.keys, targetRowNode);
        } else {
          rowData[keyItem.label] = extractValue(
            targetRowNode.querySelector(keyItem.className),
            keyItem,
          );
        }
      }

      allTableData.push(rowData);
    }

    const { pagination } = config;

    // 1. No pagination configured: one page only.
    if (!pagination) {
      console.log("未配置分页信息,抓取单页后结束。");
      break;
    }

    // 2. Page limit reached.
    if (pagination.maxPage && pageCount >= pagination.maxPage) {
      console.log("已达到配置的最大页数,停止。");
      break;
    }

    // 3. The pager is looked up in the whole document, not inside rootDom.
    const nextBtn = document.querySelector(pagination.nextBtn);
    if (!nextBtn) {
      console.log("未找到下一页按钮,抓取结束。");
      break;
    }

    // 4. A disabled button means this was the last page.
    const isDisabled = pagination.disabledClass
      ? nextBtn.classList.contains(pagination.disabledClass)
      : nextBtn.disabled;
    if (isDisabled) {
      console.log("下一页按钮已禁用,抓取结束。");
      break;
    }

    // Go to the next page and wait for it to load.
    nextBtn.click();
    await sleep(pagination.delay);
  }

  return allTableData;
}

let data = await process(column, document.body)
|
||
|
||
|
||
console.log("==== 提取成功 ====");
|
||
console.log(data);
|
||
return data
|
||
})() |