11
This commit is contained in:
147
src/background/task/crawlTask.ts
Normal file
147
src/background/task/crawlTask.ts
Normal file
@@ -0,0 +1,147 @@
|
||||
import {getPlatformById} from "@/config/platforms";
|
||||
import {CrawlTaskState, PlatformStepConfig} from "@/types";
|
||||
import {openSingleTabWindow, scrapeStepInContent, sleep, waitForTabLoaded} from "@/background/task/helper";
|
||||
import {clearCrawlTaskState, getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState} from "./taskState";
|
||||
|
||||
|
||||
// Abort controllers of crawl runs that are currently in flight, keyed by task id.
// startCrawl registers an entry when it launches a run; cancelCrawl looks it up to abort the run.
const activeCrawlControllers = new Map<string, AbortController>();
|
||||
|
||||
/**
 * Creates a new crawl task for a platform and starts running it in the background.
 *
 * Opens a window on the first step's URL, writes the initial task state
 * (first step `running`, all others `pending`) to storage, registers an
 * `AbortController` under the task id so the run can be cancelled, and then
 * launches `runCrawlSteps` without awaiting it.
 *
 * @param platformId Platform id used to look up the platform configuration.
 * @returns The initial task state, or an `{error}` object when the platform
 *          configuration is missing or defines no steps.
 */
export async function startCrawl(platformId: string): Promise<any> {
    const platform = getPlatformById(platformId);
    if (!platform) {
        return {error: '平台配置不存在'};
    }

    // Without a first step there is no URL to open and nothing to crawl.
    const firstStep = platform.steps[0];
    if (!firstStep) {
        return {error: '平台步骤配置为空'};
    }

    // Open the crawl window on the first step's page.
    const windowInfo = await openSingleTabWindow(firstStep.url);

    // Build the initial task state.
    const startedAt = Date.now();
    const nextState: CrawlTaskState = {
        id: `${platform.id}-${startedAt}`,
        windowId: windowInfo.windowId,
        tabId: windowInfo.tabId,
        platformId: platform.id,
        platformName: platform.name,
        startedAt,
        status: 'running',
        currentStepIndex: 0,
        steps: platform.steps.map((item, index) => {
            return {
                name: item.name,
                uniqueKey: item.uniqueKey,
                status: index === 0 ? 'running' : 'pending',
            }
        })
    };

    await setCrawlTaskState(nextState);

    // Register the controller so the run can be aborted by task id.
    const controller = new AbortController();
    activeCrawlControllers.set(nextState.id, controller);

    // Fire-and-forget: the run continues after this function returns.
    // A rejection must be handled here, otherwise it surfaces as an unhandled promise rejection.
    // NOTE(review): the stored task state is left untouched on failure; consider persisting
    // a failure status once the CrawlTaskState status values are confirmed.
    void runCrawlSteps(nextState.id, nextState.tabId!, platform.steps, controller.signal)
        .catch((err: unknown) => {
            console.error(`[crawlTask] crawl ${nextState.id} failed`, err);
        })
        .finally(() => {
            activeCrawlControllers.delete(nextState.id);
        });

    return nextState
}
|
||||
|
||||
/**
 * Step executor: runs the platform steps in order, navigating the tab,
 * waiting for it to load, scraping the page and recording progress in the
 * stored task state.
 *
 * For every step the loop reloads the step URL and scrapes until the scrape
 * reports `ok`. A scrape that reports `interrupt` pauses the task and polls
 * the stored state once per second until its status is no longer `paused`,
 * then reloads the page and tries again. Any other result is retried after a
 * two-second delay.
 *
 * The function returns early, without marking the task completed, as soon as
 * the signal is aborted or one of the helpers reports a falsy result.
 *
 * @param taskId Id of the task whose stored state is updated.
 * @param tabId  Tab that is navigated and scraped.
 * @param steps  Platform step configuration, executed in array order.
 * @param signal Abort signal used to cancel the run.
 */
async function runCrawlSteps(taskId: string, tabId: number, steps: PlatformStepConfig[], signal: AbortSignal) {
    for (let i = 0; i < steps.length; i += 1) {
        const step = steps[i];
        let shouldRetryStep = true;

        // Never write task state for a run that has already been cancelled;
        // the canceller clears the stored state and a late write could touch it again.
        if (signal.aborted) return;

        // Entering a new step: record its index and mark it as running.
        await updateCrawlTaskState(taskId, s => ({
            ...s,
            currentStepIndex: i,
            steps: s.steps.map((stepItem, idx) => ({
                ...stepItem,
                status: idx === i ? 'running' : stepItem.status
            }))
        }));

        while (shouldRetryStep) {
            if (signal.aborted) return;

            // 1. Navigate to the step URL and wait for the tab to load.
            await chrome.tabs.update(tabId, {url: step.url, active: true});
            const loaded = await waitForTabLoaded(tabId, signal);
            // A falsy result ends the run (presumably abort or load failure — see helper).
            if (!loaded) return;

            // 2. Scrape the page (this is also where an interrupt such as a captcha is detected).
            // NOTE(review): the result shape comes from scrapeStepInContent; kept as `any`
            // until that helper exposes a precise return type.
            const res: any = await scrapeStepInContent(tabId, step, signal);
            if (signal.aborted) return;

            // 3. Interrupt (captcha etc.): pause the task and wait for it to be resumed.
            if (res?.interrupt) {
                await updateCrawlTaskState(taskId, s => ({...s, status: 'paused', pause: res.interrupt}));

                // Poll the stored state until it leaves 'paused'.
                while ((await getCrawlTaskState())?.status === 'paused') {
                    if (signal.aborted) return;
                    if (!(await sleep(1000, signal))) return;
                }
                continue; // Resumed: restart the loop body, which reloads the page.
            }

            // 4. Handle the scrape result. A missing result counts as a failed attempt.
            if (res?.ok) {
                await updateCrawlTaskState(taskId, s => ({
                    ...s,
                    steps: s.steps.map((item, idx) =>
                        idx === i ? {...item, status: 'success', result: res.data} : item
                    )
                }));
                shouldRetryStep = false; // Leave the retry loop and move on to the next step.
            } else {
                // Scrape failed: wait, then retry the same step.
                if (!(await sleep(2000, signal))) return;
            }
        }
    }

    // A run cancelled during the last step must not be reported as completed.
    if (signal.aborted) return;

    // All steps finished: mark the task as completed.
    await updateCrawlTaskState(taskId, s => ({...s, status: 'completed'}));
}
|
||||
|
||||
|
||||
/**
 * Cancels the current crawl task and tries to close the window it was crawling in.
 *
 * Does nothing when no task state is stored. Otherwise it aborts the run
 * registered under the task id, clears the stored task state, and issues a
 * best-effort request to close the task's window.
 */
export async function cancelCrawl() {
    const current = await getCrawlTaskState();

    if (current) {
        const taskId = current.id;

        // Abort first so the running executor stops as early as possible,
        // then drop the registry entry for this task.
        activeCrawlControllers.get(taskId)?.abort();
        activeCrawlControllers.delete(taskId);

        // Clear the stored task state.
        await clearCrawlTaskState();

        // Close the crawl window; failures are ignored and the close is not awaited.
        if (current.windowId) {
            chrome.windows.remove(current.windowId).catch(() => {
            });
        }
    }
}
|
||||
|
||||
Reference in New Issue
Block a user