This commit is contained in:
zhu
2026-05-09 17:48:31 +08:00
parent 30f9467cc8
commit 186840ba23
10 changed files with 330 additions and 118 deletions

View File

@@ -1,5 +1,7 @@
import type {PlatformFieldConfig} from '@/types';
/**
 * Result of one DOM scrape: an object with string keys and values of unknown type.
 * NOTE(review): keys are presumably the configured field names — confirm against the scraper.
 */
export type DomScrapeResult = Record<string, unknown>;
/**
* 等待重试机制
*/
@@ -225,4 +227,4 @@ async function processTable(config: PlatformFieldConfig, rootDom: ParentNode) {
}
return allTableData;
}
}

View File

@@ -1,7 +1,13 @@
import {handleBackgroundCommand, handleWindowRemoved} from './service';
import type {BackgroundCommand} from './types';
import { handleBackgroundCommand, handleInstalled, handleStartup, handleWindowRemoved } from './service';
import type { BackgroundCommand } from './types';
// Extension lifecycle hooks. Each handler is fired without awaiting; `void` marks
// the returned promise as intentionally ignored.
chrome.runtime.onInstalled.addListener(function onInstalled(): void {
  void handleInstalled();
});

chrome.runtime.onStartup.addListener(function onStartup(): void {
  void handleStartup();
});
chrome.runtime.onMessage.addListener((message: BackgroundCommand, _sender, sendResponse) => {
void handleBackgroundMessage(message, sendResponse);
@@ -12,20 +18,19 @@ chrome.windows.onRemoved.addListener((windowId) => {
void handleWindowRemoved(windowId);
});
/**
 * Answers `STORE_AI_PING` messages from external senders with the extension version.
 *
 * External payloads are untrusted, so the message shape is checked before reading
 * `type`; a `null` or primitive payload is ignored instead of throwing.
 * The reply is sent synchronously, so the listener returns `false`: the message
 * channel is not kept open for messages this listener never answers.
 */
chrome.runtime.onMessageExternal.addListener((message: unknown, _sender, sendResponse) => {
  const isPing =
    typeof message === 'object' &&
    message !== null &&
    (message as { type?: unknown }).type === 'STORE_AI_PING';

  if (isPing) {
    // Report liveness together with the manifest version.
    sendResponse({
      success: true,
      version: chrome.runtime.getManifest().version,
    });
  }

  // No asynchronous response is pending, so do not hold the channel open.
  return false;
});
/**
* 统一包装后台消息处理,确保异步错误能回给调用方。
* Wrap background command handling so async errors can still be returned to callers.
*/
async function handleBackgroundMessage(
message: BackgroundCommand,
@@ -36,6 +41,6 @@ async function handleBackgroundMessage(
sendResponse(result);
} catch (error: unknown) {
const messageText = error instanceof Error ? error.message : 'Unknown error';
sendResponse({ok: false, error: messageText});
sendResponse({ ok: false, data: null, error: messageText });
}
}

View File

@@ -2,7 +2,7 @@ import { getPlatformById } from '@/config/platforms';
import type { CrawlPauseInfo, CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
import type { DomScrapeResult } from '../domScraper';
import type { CrawlStateResponse } from '../types';
import { getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState } from './taskState';
import { clearCrawlTaskState, getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState } from './taskState';
interface PageRunnerResponse {
ok: boolean;
@@ -11,6 +11,8 @@ interface PageRunnerResponse {
error?: string;
}
// AbortControllers of crawl runs started in this module, keyed by task id.
// Entries are added when a run starts and removed when its step loop settles.
const activeCrawlControllers = new Map<string, AbortController>();
/**
* 创建新的爬取任务,打开目标平台窗口,并把初始时间轴状态写入 storage。
*/
@@ -47,9 +49,13 @@ export async function startCrawl(platformId: string): Promise<CrawlStateResponse
try {
const windowInfo = await createCrawlWindow(firstStep.url);
const stateWithWindow = { ...nextState, windowId: windowInfo.id };
const controller = new AbortController();
await setCrawlTaskState(stateWithWindow);
void runCrawlSteps(platform, stateWithWindow);
activeCrawlControllers.set(stateWithWindow.id, controller);
void runCrawlSteps(platform, stateWithWindow, controller.signal).finally(() => {
activeCrawlControllers.delete(stateWithWindow.id);
});
return { ok: true, data: stateWithWindow };
} catch (error: unknown) {
@@ -76,21 +82,15 @@ export async function cancelCrawl(): Promise<CrawlStateResponse> {
return { ok: true, data: null };
}
const canceledState: CrawlTaskState = {
...state,
status: 'canceled',
steps: state.steps.map((step, index) =>
index === state.currentStepIndex ? { ...step, status: 'failed', message: '用户已取消' } : step,
),
};
abortActiveCrawl(state.id);
await setCrawlTaskState(canceledState);
await clearCrawlTaskState();
if (state.windowId) {
await chrome.windows.remove(state.windowId).catch(() => undefined);
}
return { ok: true, data: canceledState };
return { ok: true, data: null };
}
/**
@@ -122,10 +122,12 @@ export async function resumeCrawl(): Promise<CrawlStateResponse> {
export async function cancelCrawlWhenWindowRemoved(windowId: number): Promise<void> {
const state = await getCrawlTaskState();
if (state?.windowId !== windowId || state.status !== 'running') {
if (state?.windowId !== windowId || !['running', 'paused'].includes(state.status)) {
return;
}
abortActiveCrawl(state.id);
await setCrawlTaskState({
...state,
status: 'canceled',
@@ -135,10 +137,38 @@ export async function cancelCrawlWhenWindowRemoved(windowId: number): Promise<vo
});
}
/**
 * Marks the stored crawl task as canceled when it is still `running` or `paused`
 * but its crawl window can no longer be found.
 *
 * Does nothing when there is no stored task, when the task is in any other status,
 * or when the recorded window still exists. Otherwise the in-memory run (if any) is
 * aborted and the current step is flagged as failed with a "window closed" message.
 */
export async function cancelStaleCrawlWhenWindowMissing(): Promise<void> {
  const task = await getCrawlTaskState();
  if (!task) {
    return;
  }

  const isActive = task.status === 'running' || task.status === 'paused';
  if (!isActive) {
    return;
  }

  // A task without a recorded window id is treated the same as a missing window.
  if (task.windowId && (await hasWindow(task.windowId))) {
    return;
  }

  abortActiveCrawl(task.id);

  const steps: CrawlTaskState['steps'] = task.steps.map((step, index) => {
    if (index !== task.currentStepIndex) {
      return step;
    }
    return { ...step, status: 'failed', message: '爬取窗口已关闭,任务已取消' };
  });

  await setCrawlTaskState({ ...task, status: 'canceled', steps });
}
/**
 * Aborts the in-memory crawl run registered for `taskId`, if there is one.
 * The registry entry itself is not removed here.
 */
function abortActiveCrawl(taskId: string): void {
  const controller = activeCrawlControllers.get(taskId);
  if (controller) {
    controller.abort();
  }
}
/**
* 按平台 steps 顺序执行页面跳转、DOM 等待、字段抓取和进度更新。
*/
async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskState): Promise<void> {
async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskState, signal: AbortSignal): Promise<void> {
if (!initialState.windowId) {
return;
}
@@ -151,12 +181,12 @@ async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskSt
while (shouldRetryStep) {
const currentState = await getCrawlTaskState();
if (currentState?.id !== initialState.id || currentState.status === 'canceled') {
if (signal.aborted || currentState?.id !== initialState.id || currentState.status === 'canceled') {
return;
}
if (currentState.status === 'paused') {
const resumed = await waitUntilResumed(initialState.id);
const resumed = await waitUntilResumed(initialState.id, signal);
if (!resumed) {
return;
@@ -177,13 +207,21 @@ async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskSt
const tabId = await getWindowActiveTabId(initialState.windowId);
await chrome.tabs.update(tabId, { url: step.url, active: true });
await waitForTabLoaded(tabId);
const tabLoaded = await waitForTabLoaded(tabId, signal);
const response = await scrapeStepInContent(tabId, step);
if (!tabLoaded || signal.aborted) {
return;
}
const response = await scrapeStepInContent(tabId, step, signal);
if (signal.aborted) {
return;
}
if (response.interrupt) {
await pauseForInterrupt(initialState.id, stepIndex, response.interrupt);
const resumed = await waitUntilResumed(initialState.id);
const resumed = await waitUntilResumed(initialState.id, signal);
if (!resumed) {
return;
@@ -258,23 +296,33 @@ async function getWindowActiveTabId(windowId: number): Promise<number> {
/**
* 让 content script 直接在目标页面执行检查和抓取。
*/
async function scrapeStepInContent(tabId: number, step: PlatformStepConfig): Promise<PageRunnerResponse> {
async function scrapeStepInContent(
tabId: number,
step: PlatformStepConfig,
signal: AbortSignal,
): Promise<PageRunnerResponse> {
const startedAt = Date.now();
while (Date.now() - startedAt < 20000) {
if (signal.aborted) {
return { ok: false, error: 'canceled' };
}
const response = await sendPageRunnerMessage(tabId, {
action: 'SCRAPE_STEP',
payload: {
fields: step.fields,
checkSelector: step.checkSelector,
},
});
}, signal);
if (response.ok || response.interrupt || !isPageRunnerNotReadyError(response.error)) {
return response;
}
await sleep(500);
if (!(await sleep(500, signal))) {
return { ok: false, error: 'canceled' };
}
}
return { ok: false, error: '页面脚本未响应,请刷新扩展后重试' };
@@ -283,7 +331,15 @@ async function scrapeStepInContent(tabId: number, step: PlatformStepConfig): Pro
/**
* 给目标页的 content script 发送页面执行消息。
*/
async function sendPageRunnerMessage(tabId: number, message: unknown): Promise<PageRunnerResponse> {
async function sendPageRunnerMessage(tabId: number, message: unknown, signal: AbortSignal): Promise<PageRunnerResponse> {
if (signal.aborted) {
return { ok: false, error: 'canceled' };
}
return raceWithAbort(sendPageRunnerMessageOnce(tabId, message), signal);
}
async function sendPageRunnerMessageOnce(tabId: number, message: unknown): Promise<PageRunnerResponse> {
try {
const response = await chrome.tabs.sendMessage(tabId, message);
@@ -326,8 +382,12 @@ async function pauseForInterrupt(taskId: string, stepIndex: number, interrupt: C
/**
* 暂停后等待用户点继续或取消。
*/
async function waitUntilResumed(taskId: string): Promise<boolean> {
async function waitUntilResumed(taskId: string, signal: AbortSignal): Promise<boolean> {
while (true) {
if (signal.aborted) {
return false;
}
const state = await getCrawlTaskState();
if (!state || state.id !== taskId || state.status === 'canceled' || state.status === 'failed') {
@@ -338,7 +398,9 @@ async function waitUntilResumed(taskId: string): Promise<boolean> {
return true;
}
await sleep(1000);
if (!(await sleep(1000, signal))) {
return false;
}
}
}
@@ -377,30 +439,123 @@ function createCrawlWindow(url: string): Promise<chrome.windows.Window> {
/**
* 等待 tab 完成页面加载。
*/
function waitForTabLoaded(tabId: number): Promise<void> {
function waitForTabLoaded(tabId: number, signal: AbortSignal): Promise<boolean> {
return new Promise((resolve) => {
if (signal.aborted) {
resolve(false);
return;
}
const timeout = globalThis.setTimeout(() => {
chrome.tabs.onUpdated.removeListener(handleUpdated);
resolve();
cleanup();
resolve(true);
}, 15000);
function cleanup() {
globalThis.clearTimeout(timeout);
chrome.tabs.onUpdated.removeListener(handleUpdated);
signal.removeEventListener('abort', handleAbort);
}
function handleAbort() {
cleanup();
resolve(false);
}
function handleUpdated(updatedTabId: number, changeInfo: { status?: string }) {
if (updatedTabId === tabId && changeInfo.status === 'complete') {
globalThis.clearTimeout(timeout);
chrome.tabs.onUpdated.removeListener(handleUpdated);
resolve();
cleanup();
resolve(true);
}
}
chrome.tabs.onUpdated.addListener(handleUpdated);
signal.addEventListener('abort', handleAbort, { once: true });
});
}
/**
* 简单等待工具。
*/
function sleep(ms: number): Promise<void> {
return new Promise((resolve) => {
globalThis.setTimeout(resolve, ms);
/**
 * Reports whether `chrome.windows.get` can find a window with the given id.
 * Any failure of the lookup is reported as `false`.
 */
async function hasWindow(windowId: number): Promise<boolean> {
  let exists = true;
  try {
    await chrome.windows.get(windowId);
  } catch {
    exists = false;
  }
  return exists;
}
/**
 * Settles with the outcome of `promise` unless `signal` aborts first, in which case
 * the result resolves to `{ ok: false, error: 'canceled' }`.
 *
 * The wrapped promise is not cancelled; once the signal has fired, its eventual
 * outcome is simply ignored. When the signal is already aborted on entry, no handlers
 * are attached to `promise` at all.
 *
 * NOTE(review): the cancellation value is cast to `T`, so this helper is only
 * type-safe for result types that accept that shape.
 */
function raceWithAbort<T>(promise: Promise<T>, signal: AbortSignal): Promise<T> {
  const canceled = (): T => ({ ok: false, error: 'canceled' } as T);

  if (signal.aborted) {
    return Promise.resolve(canceled());
  }

  return new Promise<T>((resolve, reject) => {
    let done = false;

    // Runs `settle` at most once and detaches the abort listener first.
    const finish = (settle: () => void): void => {
      if (done) {
        return;
      }
      done = true;
      signal.removeEventListener('abort', onAbort);
      settle();
    };

    const onAbort = (): void => finish(() => resolve(canceled()));

    signal.addEventListener('abort', onAbort, { once: true });
    promise.then(
      (value) => finish(() => resolve(value)),
      (error: unknown) => finish(() => reject(error)),
    );
  });
}
/**
 * Waits for `ms` milliseconds, optionally cut short by an abort signal.
 *
 * @param ms Delay in milliseconds.
 * @param signal Optional signal; aborting it ends the wait early.
 * @returns `true` when the full delay elapsed, `false` when the signal was already
 *          aborted or aborted during the wait.
 */
function sleep(ms: number, signal?: AbortSignal): Promise<boolean> {
  if (signal?.aborted) {
    return Promise.resolve(false);
  }

  return new Promise<boolean>((resolve) => {
    const onAbort = (): void => {
      // Stop the pending timer so it cannot fire after the early exit.
      globalThis.clearTimeout(timer);
      resolve(false);
    };

    const timer = globalThis.setTimeout(() => {
      signal?.removeEventListener('abort', onAbort);
      resolve(true);
    }, ms);

    signal?.addEventListener('abort', onAbort, { once: true });
  });
}

View File

@@ -1,5 +1,5 @@
import type { BackgroundCommand, BackgroundResponse, CrawlStateResponse } from '../types';
import { cancelCrawl, cancelCrawlWhenWindowRemoved, resumeCrawl, startCrawl } from './crawlTask';
import { cancelCrawl, cancelCrawlWhenWindowRemoved, cancelStaleCrawlWhenWindowMissing, resumeCrawl, startCrawl } from './crawlTask';
import { getCrawlTaskState } from './taskState';
/**
@@ -14,6 +14,7 @@ export async function handleInstalled(): Promise<void> {
*/
export async function handleStartup(): Promise<void> {
console.log('[background] startup');
// Cancel a stored task that is still marked running/paused but whose crawl window no longer exists.
await cancelStaleCrawlWhenWindowMissing();
}
/**
@@ -34,6 +35,7 @@ export async function handleBackgroundCommand(
case 'START_CRAWL':
return startCrawl(message.payload.platformId);
case 'GET_CRAWL_STATE':
await cancelStaleCrawlWhenWindowMissing();
return { ok: true, data: await getCrawlTaskState() };
case 'CANCEL_CRAWL':
return cancelCrawl();

View File

@@ -1,27 +1,21 @@
import type { CrawlTaskState } from '@/types';
// Key under which the current crawl task state is stored in chrome.storage.local.
const CRAWL_TASK_STORAGE_KEY = 'crawlTaskState';
/**
 * Reads the current crawl task state from chrome.storage.local.
 *
 * @returns The stored state, or `null` when nothing is stored under the key or the
 *          stored value does not pass the task-state shape check.
 */
export async function getCrawlTaskState(): Promise<CrawlTaskState | null> {
  const { [CRAWL_TASK_STORAGE_KEY]: stored } = await chrome.storage.local.get(CRAWL_TASK_STORAGE_KEY);

  if (!isCrawlTaskState(stored)) {
    return null;
  }
  return stored;
}
/**
 * Writes the latest crawl task state to chrome.storage.local so that the popup and
 * the content script can read it.
 */
export async function setCrawlTaskState(state: CrawlTaskState): Promise<void> {
  const entry: Record<string, CrawlTaskState> = { [CRAWL_TASK_STORAGE_KEY]: state };
  await chrome.storage.local.set(entry);
}
/**
 * Removes the stored crawl task state from chrome.storage.local.
 */
export async function clearCrawlTaskState(): Promise<void> {
  const { local } = chrome.storage;
  await local.remove(CRAWL_TASK_STORAGE_KEY);
}
/**
 * Reads the task state and then applies an immutable update, to avoid overwriting a
 * task that has been canceled or replaced.
 */
export async function updateCrawlTaskState(
taskId: string,
updater: (state: CrawlTaskState) => CrawlTaskState,
@@ -35,9 +29,6 @@ export async function updateCrawlTaskState(
await setCrawlTaskState(updater(state));
}
/**
 * Type guard for values read back from storage.
 *
 * Accepts only non-null objects whose `id` is a string and whose `steps` is an array,
 * because callers key the controller registry by `id` and call `steps.map(...)` on
 * the result. The remaining fields are not validated.
 *
 * @param value Arbitrary value read from storage.
 * @returns `true` when `value` has the minimal shape of a crawl task state.
 */
function isCrawlTaskState(value: unknown): value is CrawlTaskState {
  if (typeof value !== 'object' || value === null) {
    return false;
  }

  const candidate = value as { id?: unknown; steps?: unknown };
  return typeof candidate.id === 'string' && Array.isArray(candidate.steps);
}