This commit is contained in:
zhu
2026-05-12 15:26:17 +08:00
parent cf7ea741a6
commit c7cb977243
14 changed files with 507 additions and 944 deletions

View File

@@ -0,0 +1,147 @@
import {getPlatformById} from "@/config/platforms";
import {CrawlTaskState, PlatformStepConfig} from "@/types";
import {openSingleTabWindow, scrapeStepInContent, sleep, waitForTabLoaded} from "@/background/task/helper";
import {clearCrawlTaskState, getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState} from "./taskState";
// Abort controllers of crawl tasks that are currently running, keyed by task id.
// startCrawl registers an entry when it launches a run, and cancelCrawl looks the
// entry up to abort that run.
const activeCrawlControllers = new Map<string, AbortController>();
/**
 * Starts a new crawl task for a platform.
 *
 * Opens the URL of the platform's first step in a new window, persists the initial
 * task state through `setCrawlTaskState` (first step `running`, all others `pending`),
 * registers an abort controller for the task, and launches the step runner in the
 * background without awaiting it.
 *
 * The return type stays `any` so existing callers keep compiling.
 *
 * @param platformId id of the platform to crawl
 * @returns the initial task state, or an `{error: string}` object when the task
 *          could not be started
 */
export async function startCrawl(platformId: string): Promise<any> {
    const platform = getPlatformById(platformId);
    if (!platform) {
        return {error: '平台配置不存在'};
    }

    // A platform without steps has no URL to open; fail explicitly instead of
    // crashing on `steps[0].url`.
    const firstStep = platform.steps[0];
    if (!firstStep) {
        return {error: '平台未配置爬取步骤'};
    }

    // Open the dedicated crawl window.
    const {windowId, tabId} = await openSingleTabWindow(firstStep.url);
    if (tabId == null) {
        // Without a tab there is nothing to drive; close the stray window (best effort).
        if (windowId) {
            chrome.windows.remove(windowId).catch(() => {
            });
        }
        return {error: '无法获取爬取标签页'};
    }

    // Build and persist the initial task state.
    const startedAt = Date.now();
    const nextState: CrawlTaskState = {
        id: `${platform.id}-${startedAt}`,
        windowId,
        tabId,
        platformId: platform.id,
        platformName: platform.name,
        startedAt,
        status: 'running',
        currentStepIndex: 0,
        steps: platform.steps.map((item, index) => {
            return {
                name: item.name,
                uniqueKey: item.uniqueKey,
                status: index === 0 ? 'running' : 'pending',
            }
        })
    };
    await setCrawlTaskState(nextState);

    // Register the controller so cancelCrawl can abort this run.
    const controller = new AbortController();
    activeCrawlControllers.set(nextState.id, controller);

    // Fire and forget: the runner reports progress through the stored task state.
    // Failures are logged here so they do not surface as unhandled rejections.
    void runCrawlSteps(nextState.id, tabId, platform.steps, controller.signal)
        .catch((err: unknown) => {
            console.error(`[crawl] task ${nextState.id} stopped unexpectedly`, err);
        })
        .finally(() => {
            activeCrawlControllers.delete(nextState.id);
        });

    return nextState
}
/**
 * Step runner: executes the platform's steps in array order inside the given tab.
 *
 * For each step it marks the step `running` in the stored task state, navigates the
 * tab to the step's URL, waits for the load, and scrapes the page. A step is repeated
 * until scraping reports success. When scraping reports an interrupt (e.g. a captcha),
 * the task is marked `paused` and the stored state is polled once per second until it
 * is no longer `paused`; the step is then reloaded from scratch.
 *
 * The function returns silently, without marking the task completed, as soon as
 * `signal` is aborted or when `waitForTabLoaded` / `sleep` return a falsy value
 * (presumably because the wait was aborted).
 *
 * @param taskId id of the task whose stored state is updated
 * @param tabId tab that is navigated and scraped
 * @param steps platform step configuration, executed in array order
 * @param signal abort signal used to cancel the run
 */
async function runCrawlSteps(taskId: string, tabId: number, steps: PlatformStepConfig[], signal: AbortSignal) {
    for (let i = 0; i < steps.length; i += 1) {
        // Never write progress for a task that has already been cancelled.
        if (signal.aborted) return;
        const step = steps[i];

        // Entering a step: record its index and mark it as running.
        await updateCrawlTaskState(taskId, s => ({
            ...s,
            currentStepIndex: i,
            steps: s.steps.map((stepItem, idx) => ({
                ...stepItem,
                status: idx === i ? 'running' : stepItem.status
            }))
        }));

        // Retry loop: left via `break` on success or `return` on cancellation.
        for (; ;) {
            if (signal.aborted) return;

            // 1. Navigate to the step's page and wait for it to load.
            await chrome.tabs.update(tabId, {url: step.url, active: true});
            const loaded = await waitForTabLoaded(tabId, signal);
            if (!loaded) return;

            // 2. Detect blocking pages / scrape the step.
            const res: any = await scrapeStepInContent(tabId, step, signal);
            if (signal.aborted) return;

            // 3. Interrupt (captcha etc.): pause and wait until the task is resumed.
            if (res?.interrupt) {
                await updateCrawlTaskState(taskId, s => ({...s, status: 'paused', pause: res.interrupt}));
                while ((await getCrawlTaskState())?.status === 'paused') {
                    if (signal.aborted) return;
                    if (!(await sleep(1000, signal))) return;
                }
                continue; // resumed: reload the page and try the step again
            }

            // 4. Success: store the result and move on to the next step.
            if (res?.ok) {
                await updateCrawlTaskState(taskId, s => ({
                    ...s,
                    steps: s.steps.map((item, idx) =>
                        idx === i ? {...item, status: 'success', result: res.data} : item
                    )
                }));
                break;
            }

            // Scrape failed (including an empty response): back off, then retry.
            if (!(await sleep(2000, signal))) return;
        }
    }

    // All steps finished; do not mark a cancelled task as completed.
    if (signal.aborted) return;
    await updateCrawlTaskState(taskId, s => ({...s, status: 'completed'}));
}
/**
 * Cancels the crawl task currently recorded in the stored task state.
 *
 * Aborts the task's controller if one is registered, clears the stored task state,
 * and then makes a best-effort attempt to close the task's window. Does nothing
 * when no task state is stored.
 */
export async function cancelCrawl() {
    const task = await getCrawlTaskState();
    if (task) {
        // Fire the abort signal first so the running step loop stops on its own.
        activeCrawlControllers.get(task.id)?.abort();
        activeCrawlControllers.delete(task.id);

        // Drop the stored task state.
        await clearCrawlTaskState();

        // Close the crawl window; a failure to close is deliberately ignored.
        const {windowId} = task;
        if (windowId) {
            chrome.windows.remove(windowId).catch(() => undefined);
        }
    }
}