1
This commit is contained in:
@@ -1,9 +1,16 @@
|
||||
import { getPlatformById } from '@/config/platforms';
|
||||
import type { CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
|
||||
import { scrapeDomFields, type DomScrapeResult } from '../domScraper';
|
||||
import type { CrawlPauseInfo, CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
|
||||
import type { DomScrapeResult } from '../domScraper';
|
||||
import type { CrawlStateResponse } from '../types';
|
||||
import { getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState } from './taskState';
|
||||
|
||||
/**
 * Reply shape expected from the page-runner content script for one request.
 */
interface PageRunnerResponse {
  /** Whether the request succeeded. */
  ok: boolean;
  /** Scrape result carried by the reply, if any; may be null. */
  data?: DomScrapeResult | null;
  /** Pause details; when present, the crawl is paused until the user resumes or cancels. */
  interrupt?: CrawlPauseInfo;
  /** Error description for an unsuccessful request. */
  error?: string;
}
|
||||
|
||||
/**
|
||||
* Creates a new crawl task, opens the target platform window, and writes the initial timeline state to storage.
|
||||
*/
|
||||
@@ -86,6 +93,29 @@ export async function cancelCrawl(): Promise<CrawlStateResponse> {
|
||||
return { ok: true, data: canceledState };
|
||||
}
|
||||
|
||||
/**
 * Resumes the currently paused crawl task once the user has dealt with a
 * login prompt, captcha, or risk-control check.
 *
 * When there is no stored task, or the stored task is not in the `paused`
 * state, the stored state is returned untouched. Otherwise the task is put
 * back to `running`, its pause info is cleared, and the step at
 * `currentStepIndex` is marked `running` with its message cleared.
 *
 * @returns `ok: true` together with the resulting task state.
 */
export async function resumeCrawl(): Promise<CrawlStateResponse> {
  const state = await getCrawlTaskState();

  // Nothing to resume: report whatever is stored as-is.
  if (!state || state.status !== 'paused') {
    return { ok: true, data: state };
  }

  const resumedState: CrawlTaskState = {
    ...state,
    status: 'running',
    pause: undefined,
    // Only the step that was interrupted changes; all other steps keep their state.
    steps: state.steps.map((step, index) =>
      index === state.currentStepIndex ? { ...step, status: 'running', message: undefined } : step,
    ),
  };

  await setCrawlTaskState(resumedState);

  return { ok: true, data: resumedState };
}
|
||||
|
||||
/**
|
||||
* After a window is closed, marks the current task as canceled if the closed window was the crawl window.
|
||||
*/
|
||||
@@ -114,53 +144,81 @@ async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskSt
|
||||
}
|
||||
|
||||
try {
|
||||
const tabId = await getWindowActiveTabId(initialState.windowId);
|
||||
|
||||
for (let stepIndex = 0; stepIndex < platform.steps.length; stepIndex += 1) {
|
||||
const step = platform.steps[stepIndex];
|
||||
const currentState = await getCrawlTaskState();
|
||||
let shouldRetryStep = true;
|
||||
|
||||
if (currentState?.id !== initialState.id || currentState.status !== 'running') {
|
||||
return;
|
||||
}
|
||||
while (shouldRetryStep) {
|
||||
const currentState = await getCrawlTaskState();
|
||||
|
||||
await updateCrawlTaskState(initialState.id, (state) => ({
|
||||
...state,
|
||||
currentStepIndex: stepIndex,
|
||||
status: 'running',
|
||||
steps: state.steps.map((item, index) => ({
|
||||
...item,
|
||||
status: index === stepIndex ? 'running' : item.status,
|
||||
message: index === stepIndex ? undefined : item.message,
|
||||
})),
|
||||
}));
|
||||
if (currentState?.id !== initialState.id || currentState.status === 'canceled') {
|
||||
return;
|
||||
}
|
||||
|
||||
await chrome.tabs.update(tabId, { url: step.url, active: true });
|
||||
await waitForTabLoaded(tabId);
|
||||
if (currentState.status === 'paused') {
|
||||
const resumed = await waitUntilResumed(initialState.id);
|
||||
|
||||
const isReady = await waitForStepReady(tabId, step);
|
||||
if (!resumed) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (!isReady) {
|
||||
await updateCrawlTaskState(initialState.id, (state) => ({
|
||||
...state,
|
||||
status: 'failed',
|
||||
currentStepIndex: stepIndex,
|
||||
status: 'running',
|
||||
pause: undefined,
|
||||
steps: state.steps.map((item, index) => ({
|
||||
...item,
|
||||
status: index === stepIndex ? 'running' : item.status,
|
||||
message: index === stepIndex ? undefined : item.message,
|
||||
})),
|
||||
}));
|
||||
|
||||
const tabId = await getWindowActiveTabId(initialState.windowId);
|
||||
await chrome.tabs.update(tabId, { url: step.url, active: true });
|
||||
await waitForTabLoaded(tabId);
|
||||
|
||||
const response = await scrapeStepInContent(tabId, step);
|
||||
|
||||
if (response.interrupt) {
|
||||
await pauseForInterrupt(initialState.id, stepIndex, response.interrupt);
|
||||
const resumed = await waitUntilResumed(initialState.id);
|
||||
|
||||
if (!resumed) {
|
||||
return;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
const message = response.error ?? '页面抓取失败';
|
||||
|
||||
await updateCrawlTaskState(initialState.id, (state) => ({
|
||||
...state,
|
||||
status: 'failed',
|
||||
currentStepIndex: stepIndex,
|
||||
steps: state.steps.map((item, index) =>
|
||||
index === stepIndex ? { ...item, status: 'failed', message } : item,
|
||||
),
|
||||
}));
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, response.data);
|
||||
|
||||
await updateCrawlTaskState(initialState.id, (state) => ({
|
||||
...state,
|
||||
steps: state.steps.map((item, index) =>
|
||||
index === stepIndex ? { ...item, status: 'failed', message: '页面关键 DOM 未加载完成' } : item,
|
||||
index === stepIndex
|
||||
? { ...item, status: 'success', message: undefined, result: response.data }
|
||||
: item,
|
||||
),
|
||||
}));
|
||||
return;
|
||||
|
||||
shouldRetryStep = false;
|
||||
}
|
||||
|
||||
const data = await scrapeStepFields(tabId, step);
|
||||
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, data);
|
||||
|
||||
await updateCrawlTaskState(initialState.id, (state) => ({
|
||||
...state,
|
||||
steps: state.steps.map((item, index) =>
|
||||
index === stepIndex ? { ...item, status: 'success', message: undefined } : item,
|
||||
),
|
||||
}));
|
||||
}
|
||||
|
||||
await updateCrawlTaskState(initialState.id, (state) => ({
|
||||
@@ -198,55 +256,90 @@ async function getWindowActiveTabId(windowId: number): Promise<number> {
|
||||
}
|
||||
|
||||
/**
|
||||
* 等待步骤配置中的 checkSelector 出现;第一次超时后刷新页面再重试一次。
|
||||
* 让 content script 直接在目标页面执行检查和抓取。
|
||||
*/
|
||||
async function waitForStepReady(tabId: number, step: PlatformStepConfig): Promise<boolean> {
|
||||
if (await waitForSelector(tabId, step.checkSelector, 5000)) {
|
||||
return true;
|
||||
async function scrapeStepInContent(tabId: number, step: PlatformStepConfig): Promise<PageRunnerResponse> {
|
||||
const startedAt = Date.now();
|
||||
|
||||
while (Date.now() - startedAt < 20000) {
|
||||
const response = await sendPageRunnerMessage(tabId, {
|
||||
action: 'SCRAPE_STEP',
|
||||
payload: {
|
||||
fields: step.fields,
|
||||
checkSelector: step.checkSelector,
|
||||
},
|
||||
});
|
||||
|
||||
if (response.ok || response.interrupt || !isPageRunnerNotReadyError(response.error)) {
|
||||
return response;
|
||||
}
|
||||
|
||||
await sleep(500);
|
||||
}
|
||||
|
||||
await chrome.tabs.reload(tabId);
|
||||
await waitForTabLoaded(tabId);
|
||||
|
||||
return waitForSelector(tabId, step.checkSelector, 5000);
|
||||
return { ok: false, error: '页面脚本未响应,请刷新扩展后重试' };
|
||||
}
|
||||
|
||||
/**
|
||||
* 在目标页面轮询检查指定 selector 是否存在。
|
||||
* 给目标页的 content script 发送页面执行消息。
|
||||
*/
|
||||
async function waitForSelector(tabId: number, selector: string, timeoutMs: number): Promise<boolean> {
|
||||
const startedAt = Date.now();
|
||||
async function sendPageRunnerMessage(tabId: number, message: unknown): Promise<PageRunnerResponse> {
|
||||
try {
|
||||
const response = await chrome.tabs.sendMessage(tabId, message);
|
||||
|
||||
while (Date.now() - startedAt < timeoutMs) {
|
||||
const results = await chrome.scripting.executeScript({
|
||||
target: { tabId },
|
||||
func: (targetSelector: string) => Boolean(document.querySelector(targetSelector)),
|
||||
args: [selector],
|
||||
});
|
||||
if (response && typeof response === 'object') {
|
||||
return response as PageRunnerResponse;
|
||||
}
|
||||
|
||||
if (Boolean(results[0]?.result)) {
|
||||
return { ok: false, error: '页面脚本返回为空' };
|
||||
} catch (error: unknown) {
|
||||
return { ok: false, error: error instanceof Error ? error.message : String(error) };
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Messaging error texts that indicate the content script has not been
 * injected yet (matched case-insensitively). Hoisted so the pattern is not
 * rebuilt on every polling attempt.
 */
const PAGE_RUNNER_NOT_READY_PATTERN =
  /receiving end does not exist|could not establish connection|no receiving end/i;

/**
 * Tells whether an error message only means the content script is not ready yet.
 *
 * @param error - Error text from a failed page-runner message, if any.
 * @returns `true` when the text matches a known "no receiver" messaging error;
 *   `false` for a missing or empty message and for any other error.
 */
function isPageRunnerNotReadyError(error?: string): boolean {
  if (!error) {
    return false;
  }

  return PAGE_RUNNER_NOT_READY_PATTERN.test(error);
}
|
||||
|
||||
/**
 * Pauses the given task because of a login prompt, captcha, or page anomaly.
 *
 * Sets the task status to `paused`, stores the interrupt as its pause info,
 * points `currentStepIndex` at the interrupted step, and shows the interrupt
 * message on that step. The step itself keeps the `running` status.
 *
 * @param taskId - Id of the task to update.
 * @param stepIndex - Index of the step that was interrupted.
 * @param interrupt - Pause details reported for the page.
 */
async function pauseForInterrupt(taskId: string, stepIndex: number, interrupt: CrawlPauseInfo): Promise<void> {
  await updateCrawlTaskState(taskId, (state) => ({
    ...state,
    status: 'paused',
    pause: interrupt,
    currentStepIndex: stepIndex,
    steps: state.steps.map((step, index) =>
      index === stepIndex ? { ...step, status: 'running', message: interrupt.message } : step,
    ),
  }));
}
|
||||
|
||||
/**
 * After a pause, waits until the user either resumes or cancels the task.
 *
 * Polls the stored task state once per second.
 *
 * @param taskId - Id of the task that was paused.
 * @returns `true` once the task is `running` again; `false` when the stored
 *   task is missing, belongs to a different id, or is `canceled` / `failed`.
 */
async function waitUntilResumed(taskId: string): Promise<boolean> {
  // The loop only exits through one of the returns below.
  while (true) {
    const state = await getCrawlTaskState();

    if (!state || state.id !== taskId || state.status === 'canceled' || state.status === 'failed') {
      return false;
    }

    if (state.status === 'running') {
      return true;
    }

    // Single wait per poll; the shared helper replaces the extra inline timer.
    await sleep(1000);
  }
}
|
||||
|
||||
/**
 * Runs `scrapeDomFields` inside the target tab and extracts page data
 * according to the step's `fields` configuration.
 *
 * @param tabId - Tab in which the scraper is executed.
 * @param step - Step configuration whose `fields` are passed to the scraper.
 * @returns The result of the first injection, or `null` when there is none.
 */
async function scrapeStepFields(tabId: number, step: PlatformStepConfig): Promise<DomScrapeResult | null> {
  const results = await chrome.scripting.executeScript({
    target: { tabId },
    func: scrapeDomFields,
    args: [step.fields],
  });

  return results[0]?.result ?? null;
}
|
||||
|
||||
/**
|
||||
@@ -257,7 +350,7 @@ function createCrawlWindow(url: string): Promise<chrome.windows.Window> {
|
||||
chrome.windows.create(
|
||||
{
|
||||
url,
|
||||
type: 'popup',
|
||||
type: 'normal',
|
||||
focused: true,
|
||||
width: 1280,
|
||||
height: 900,
|
||||
@@ -302,3 +395,12 @@ function waitForTabLoaded(tabId: number): Promise<void> {
|
||||
chrome.tabs.onUpdated.addListener(handleUpdated);
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Minimal delay helper.
 *
 * @param ms - Delay in milliseconds before the returned promise resolves.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => {
    globalThis.setTimeout(resolve, ms);
  });
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import type { BackgroundCommand, BackgroundResponse, CrawlStateResponse } from '../types';
|
||||
import { cancelCrawl, cancelCrawlWhenWindowRemoved, startCrawl } from './crawlTask';
|
||||
import { cancelCrawl, cancelCrawlWhenWindowRemoved, resumeCrawl, startCrawl } from './crawlTask';
|
||||
import { getCrawlTaskState } from './taskState';
|
||||
|
||||
/**
|
||||
@@ -37,6 +37,8 @@ export async function handleBackgroundCommand(
|
||||
return { ok: true, data: await getCrawlTaskState() };
|
||||
case 'CANCEL_CRAWL':
|
||||
return cancelCrawl();
|
||||
case 'RESUME_CRAWL':
|
||||
return resumeCrawl();
|
||||
default:
|
||||
return { ok: false, error: '未知的后台指令' };
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user