360 lines
13 KiB
TypeScript
360 lines
13 KiB
TypeScript
import {getPlatformById} from "@/config/platforms";
|
||
import {CrawlTaskState, PlatformStepConfig} from "@/types";
|
||
import {openSingleTabWindow, scrapeStepInContent, sleep, waitForTabLoaded} from "@/background/task/helper";
|
||
import {clearCrawlTaskState, getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState} from "./taskState";
|
||
import {sendTabMessage} from "@/shared/tab";
|
||
|
||
|
||
// Registry of crawl executors that can still be stopped: task id -> the AbortController whose signal was handed to runCrawlSteps.
const activeCrawlControllers = new Map<string, AbortController>();
|
||
|
||
/**
 * Starts a new crawl task for the given platform.
 *
 * Opens a dedicated window on the first step's URL, writes the initial task
 * state to storage (first step `running`, all others `pending`), registers an
 * AbortController under the task id so the task can be stopped later, and
 * launches the step executor in the background without awaiting it.
 *
 * @param platformId Id of the platform whose step configuration should be crawled.
 * @returns The newly created task state, or `{error: string}` when the
 *          platform is unknown or has no steps configured.
 */
export async function startCrawl(platformId: string): Promise<any> {
    const platform = getPlatformById(platformId);
    if (!platform) {
        return {error: '平台配置不存在'};
    }

    // Without a first step there is no URL to open; fail with an error object instead of a TypeError.
    const firstStep = platform.steps[0];
    if (!firstStep) {
        return {error: '平台未配置爬取步骤'};
    }

    // Open the crawl window.
    const windowInfo = await openSingleTabWindow(firstStep.url);

    // Build the initial task state.
    const startedAt = Date.now();
    const nextState: CrawlTaskState = {
        id: `${platform.id}-${startedAt}`,
        windowId: windowInfo.windowId,
        tabId: windowInfo.tabId,
        platformId: platform.id,
        platformName: platform.name,
        startedAt,
        status: 'running',
        currentStepIndex: 0,
        steps: platform.steps.map((item, index) => {
            return {
                name: item.name,
                uniqueKey: item.uniqueKey,
                status: index === 0 ? 'running' : 'pending',
            }
        })
    };

    await setCrawlTaskState(nextState);

    // Register the controller so the task can be cancelled or paused later.
    const controller = new AbortController();
    activeCrawlControllers.set(nextState.id, controller);

    // Fire and forget: the executor reports progress through the stored task state.
    void runCrawlSteps(nextState.id, nextState.tabId!, platform.steps, controller.signal)
        .catch((err: unknown) => {
            // Surface executor failures instead of leaving an unhandled rejection.
            console.error('[crawl] executor failed', nextState.id, err);
        })
        .finally(() => {
            // Only unregister our own controller; a newer executor may have re-registered this id.
            if (activeCrawlControllers.get(nextState.id) === controller) {
                activeCrawlControllers.delete(nextState.id);
            }
        });

    return nextState
}
|
||
|
||
/**
 * Step executor: walks the platform steps in order and, for each one,
 * navigates the crawl tab, waits for the load, scrapes it and records progress.
 *
 * A step is repeated until it succeeds. After an interrupt (e.g. a captcha)
 * the task is marked `paused` and the loop polls the stored state once per
 * second until the status changes, then reloads the page. After a plain
 * failure it waits 2s and retries; there is no retry limit. Once every step
 * has succeeded the task is marked `completed` and handed to
 * `finalizeCompletedTask`.
 *
 * @param taskId Id of the task whose stored state is updated.
 * @param tabId Tab that is navigated and scraped.
 * @param steps Platform step configuration, executed in array order.
 * @param signal Abort signal; once aborted the executor returns without finalizing.
 * @param startIndex Index of the first step to run; used when resuming a task.
 */
async function runCrawlSteps(taskId: string, tabId: number, steps: PlatformStepConfig[], signal: AbortSignal, startIndex = 0) {
    try {
        for (let i = startIndex; i < steps.length; i += 1) {
            const step = steps[i];

            // Entering a new step: record its index and flag it as running right away.
            await updateCrawlTaskState(taskId, s => ({
                ...s,
                currentStepIndex: i,
                steps: s.steps.map((stepItem, idx) => ({
                    ...stepItem,
                    status: idx === i ? 'running' : stepItem.status
                }))
            }));

            let shouldRetryStep = true;
            while (shouldRetryStep) {
                if (signal.aborted) return;

                // 1. Navigate and wait for the page to load.
                await chrome.tabs.update(tabId, {url: step.url, active: true});
                const loaded = await waitForTabLoaded(tabId, signal);
                if (!loaded) return;

                // 2. Detect blockers / scrape the page.
                const res: any = await scrapeStepInContent(tabId, step, signal);
                if (signal.aborted) return;

                // 3. Interrupt (captcha etc.): pause and wait for the status to change.
                //    Optional chaining: a missing reply is treated as a failed attempt, not a crash.
                if (res?.interrupt) {
                    await updateCrawlTaskState(taskId, s => ({...s, status: 'paused', pause: res.interrupt}));

                    while ((await getCrawlTaskState())?.status === 'paused') {
                        if (signal.aborted) return;
                        if (!(await sleep(1000, signal))) return;
                    }
                    continue; // Resumed: run the while body again, which reloads the page.
                }

                // 4. Handle the scrape result.
                if (res?.ok) {
                    await updateCrawlTaskState(taskId, s => ({
                        ...s,
                        steps: s.steps.map((item, idx) =>
                            idx === i ? {...item, status: 'success', result: res.data} : item
                        )
                    }));
                    shouldRetryStep = false; // Leave the retry loop and move on to the next step.
                } else {
                    // Scrape failed: wait, then retry the same step.
                    if (!(await sleep(2000, signal))) return;
                }
            }
        }

        // Every step finished: mark the task as completed.
        await updateCrawlTaskState(taskId, s => ({...s, status: 'completed'}));

        // Wrap-up (send the data, clear the task record, close the crawl window) is done
        // centrally here so the UI side does not have to keep its own copy in sync.
        await finalizeCompletedTask(taskId, signal);
    } catch (err) {
        // After an abort the tab/window may already be gone, so errors from in-flight
        // calls are expected; end quietly. Anything else is a real failure.
        if (signal.aborted) return;
        throw err;
    }
}
|
||
|
||
|
||
/**
 * Cancels the current crawl task: aborts its executor, clears the stored task
 * state and tries to close the crawl window. Does nothing when no task is stored.
 */
export async function cancelCrawl() {
    const state = await getCrawlTaskState();
    if (!state) return

    // Fire the abort signal first so the executor stops on its own.
    const controller = activeCrawlControllers.get(state.id);
    if (controller) {
        controller.abort();
        activeCrawlControllers.delete(state.id);
    }

    // Clear the stored task state.
    await clearCrawlTaskState();

    // Close the crawl window; best effort, it may already be gone.
    if (state.windowId) {
        chrome.windows.remove(state.windowId).catch(() => {
        });
    }
}
|
||
|
||
/**
 * Handles the crawl window being closed by the user: aborts the current
 * executor and marks the task as paused with reason `window_closed`.
 *
 * This is a pause, not a cancel: progress (steps / results / currentStepIndex)
 * is kept so that a later resume can continue from it.
 *
 * @param windowId Id of the window that was removed; ignored unless it is the task's crawl window.
 */
export async function pauseCrawlOnWindowRemoved(windowId: number): Promise<void> {
    const state = await getCrawlTaskState();
    if (!state) return;
    // A task already paused for an interrupt (captcha/login) is handled too: otherwise its
    // stale window/tab ids would survive and a resume would point the executor at a dead tab.
    // A task already paused for `window_closed` has no windowId and is filtered out below.
    if (state.status !== 'running' && state.status !== 'paused') return;
    if (state.windowId !== windowId) return;

    // With the window gone the executor would only keep hitting "tab does not exist" errors,
    // so abort it now; it is restarted when the user resumes.
    const controller = activeCrawlControllers.get(state.id);
    if (controller) {
        controller.abort();
        activeCrawlControllers.delete(state.id);
    }

    await updateCrawlTaskState(state.id, (s) => ({
        ...s,
        status: 'paused',
        pause: {
            reason: 'window_closed',
            message: '检测到爬取窗口被关闭。点击“继续”后将重新打开窗口,并从上次进度继续爬取。',
        },
        // The window/tab no longer exist; clear the ids so the UI does not try to focus them.
        windowId: undefined,
        tabId: undefined,
    }));
}
|
||
|
||
/**
 * Handles the crawl tab being closed: treated the same way as the crawl window
 * being closed (abort the executor, pause the task with reason `window_closed`).
 *
 * Acts as a fallback for cases where only a tab-removed event is delivered.
 *
 * @param tabId Id of the tab that was removed; ignored unless it is the task's crawl tab.
 */
export async function pauseCrawlOnTabRemoved(tabId: number): Promise<void> {
    const state = await getCrawlTaskState();
    if (!state) return;
    // A task already paused for an interrupt (captcha/login) is handled too: otherwise its
    // stale window/tab ids would survive and a resume would point the executor at a dead tab.
    // A task already paused for `window_closed` has no tabId and is filtered out below.
    if (state.status !== 'running' && state.status !== 'paused') return;
    if (state.tabId !== tabId) return;

    // Same pause logic as for a closed window (windowId may be empty; that does not matter here).
    const controller = activeCrawlControllers.get(state.id);
    if (controller) {
        controller.abort();
        activeCrawlControllers.delete(state.id);
    }

    await updateCrawlTaskState(state.id, (s) => ({
        ...s,
        status: 'paused',
        pause: {
            reason: 'window_closed',
            message: '检测到爬取页面被关闭。点击“继续”后将重新打开窗口,并从上次进度继续爬取。',
        },
        // The tab is gone; clear both ids so nothing tries to reuse them.
        windowId: undefined,
        tabId: undefined,
    }));
}
|
||
|
||
/**
 * Resumes a paused crawl task.
 *
 * - Paused by an interrupt (login / captcha) while the window is still known:
 *   only the status is switched back to `running`; the original executor is
 *   polling for that change and carries on by itself (no restart).
 * - Paused because the window was closed: a new window is opened and a new
 *   executor is started from the first step that has not succeeded yet.
 *
 * @returns `null` when no task is stored; the unchanged state when the task is
 *          not paused or its platform configuration is missing/empty;
 *          otherwise the state after resuming.
 */
export async function resumeCrawl(): Promise<CrawlTaskState | null> {
    const state = await getCrawlTaskState();
    if (!state) return null;

    if (state.status !== 'paused') {
        return state;
    }

    // 1) Interrupt pause with the window still known: just flip the status back.
    if (state.pause?.reason !== 'window_closed' && state.windowId && state.tabId) {
        await updateCrawlTaskState(state.id, (s) => ({...s, status: 'running', pause: undefined}));
        return await getCrawlTaskState();
    }

    // 2) Window was closed: reopen it and continue from the previous progress.
    const platform = getPlatformById(state.platformId);
    if (!platform || platform.steps.length === 0) {
        // Without a usable platform configuration the task can only stay paused.
        return state;
    }

    const resumeIndex = Math.max(0, Math.min(state.currentStepIndex ?? 0, platform.steps.length - 1));

    // If the step at currentStepIndex already succeeded, the pause happened between two
    // steps: look forward for the first step that has not succeeded.
    let startIndex = resumeIndex;
    for (let i = resumeIndex; i < state.steps.length; i += 1) {
        if (state.steps[i]?.status !== 'success') {
            startIndex = i;
            break;
        }
    }

    const openUrl = platform.steps[startIndex]?.url ?? platform.steps[resumeIndex]?.url ?? platform.steps[0].url;
    const windowInfo = await openSingleTabWindow(openUrl);

    const nextState: CrawlTaskState = {
        ...state,
        windowId: windowInfo.windowId,
        tabId: windowInfo.tabId,
        status: 'running',
        pause: undefined,
        currentStepIndex: startIndex,
        steps: state.steps.map((step, idx) => ({
            ...step,
            // Mark the step about to run as running; successful steps are never overwritten.
            status: idx === startIndex && step.status !== 'success' ? 'running' : step.status,
        })),
    };

    // Make sure no earlier executor for this task id is still alive before the state says "running".
    activeCrawlControllers.get(nextState.id)?.abort();

    await setCrawlTaskState(nextState);

    // Restart the executor from startIndex.
    const controller = new AbortController();
    activeCrawlControllers.set(nextState.id, controller);
    void runCrawlSteps(nextState.id, nextState.tabId!, platform.steps, controller.signal, startIndex)
        .catch((err: unknown) => {
            // Surface executor failures instead of leaving an unhandled rejection.
            console.error('[crawl] executor failed', nextState.id, err);
        })
        .finally(() => {
            // The task id is reused across resumes, so an older executor may settle after a
            // newer one has registered; only remove the entry if it is still ours.
            if (activeCrawlControllers.get(nextState.id) === controller) {
                activeCrawlControllers.delete(nextState.id);
            }
        });

    return nextState;
}
|
||
|
||
/**
 * Dismisses the current task (used by the UI to hide the task card): clears the
 * stored task state and, if the crawl window is still known, closes it so
 * nothing is left behind.
 *
 * If an executor is still registered for the task it is aborted first, so in
 * that case dismissing is equivalent to cancelling.
 */
export async function dismissCrawl(): Promise<void> {
    const state = await getCrawlTaskState();
    if (!state) {
        await clearCrawlTaskState();
        return;
    }

    // An executor may still be running; stop it so it does not keep working in the background.
    const controller = activeCrawlControllers.get(state.id);
    if (controller) {
        controller.abort();
        activeCrawlControllers.delete(state.id);
    }

    await clearCrawlTaskState();

    // Close the crawl window; best effort, it may already be gone.
    if (state.windowId) {
        chrome.windows.remove(state.windowId).catch(() => {
        });
    }
}
|
||
|
||
/**
 * Wrap-up after a task completed: send the results, clear the stored task
 * state, then close the crawl window.
 *
 * Per the original notes, external pages (externally_connectable) pick up the
 * completed status and results through the storage broadcast; in addition a
 * `CRAWL_COMPLETED` message is sent to the crawl tab for any in-page listener.
 *
 * Does nothing unless the stored state belongs to `taskId` and is `completed`.
 *
 * @param taskId Id of the task being finalized.
 * @param signal Abort signal of the executor; if it fires during the grace
 *               period the wrap-up stops, leaving cleanup to whoever aborted.
 */
async function finalizeCompletedTask(taskId: string, signal: AbortSignal) {
    const state = await getCrawlTaskState();
    if (!state || state.id !== taskId) return;
    if (state.status !== 'completed') return;

    // 1) Notify the crawl tab. Best effort: the tab may be gone or have no listener,
    //    and that must not prevent the cleanup below.
    if (state.tabId) {
        try {
            void Promise.resolve(
                sendTabMessage(state.tabId, 'CRAWL_COMPLETED', {
                    taskId: state.id,
                    platformId: state.platformId,
                    platformName: state.platformName,
                    startedAt: state.startedAt,
                    result: collectStepResults(state),
                }),
            ).catch(() => undefined);
        } catch {
            // Synchronous send failure: ignored for the same reason.
        }
    }

    // 2) Short grace period so the storage change can be broadcast to external ports
    //    (DIANSHAN_CRAWL_DONE per the original note). Kept short so the window closes promptly.
    //    A falsy result means the signal fired: stop here instead of clearing state that
    //    the aborting caller (or a newer task) now owns.
    if (!(await sleep(300, signal))) return;

    // 3) Clear the task record, unless another task replaced it in the meantime.
    const latest = await getCrawlTaskState();
    if (!latest || latest.id === taskId) {
        await clearCrawlTaskState();
    }

    // 4) Close the crawl window; best effort, it may already be gone.
    if (state.windowId) {
        chrome.windows.remove(state.windowId).catch(() => {
        });
    }
}
|
||
|
||
/**
 * Collects the outcome of every step into one object keyed by the step's
 * `uniqueKey`: `{ [uniqueKey]: { name, status, result, message } }`.
 * A missing `result` or `message` is reported as `null`.
 *
 * Per the original note this shape matches what externalBridge.ts exposes to
 * web pages (not verifiable from this file).
 *
 * @param state Task state whose steps are summarized.
 * @returns A record with one entry per step; if two steps share a `uniqueKey` the later one wins.
 */
function collectStepResults(state: CrawlTaskState): Record<string, unknown> {
    return Object.fromEntries(
        state.steps.map((step) => [
            step.uniqueKey,
            {
                name: step.name,
                status: step.status,
                result: step.result ?? null,
                message: step.message ?? null,
            },
        ]),
    );
}
|
||
|