This commit is contained in:
zhu
2026-05-06 14:04:05 +08:00
parent d78d70bde0
commit 40df507300
17 changed files with 691 additions and 163 deletions

View File

@@ -69,7 +69,7 @@ async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Prom
/**
* 递归处理字段配置,支持普通字段、嵌套 row、列表和表格。
*/
async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
export async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
const result: DomScrapeResult = {};
for (const item of columns) {

View File

@@ -1,9 +1,16 @@
import { getPlatformById } from '@/config/platforms';
import type { CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
import { scrapeDomFields, type DomScrapeResult } from '../domScraper';
import type { CrawlPauseInfo, CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
import type { DomScrapeResult } from '../domScraper';
import type { CrawlStateResponse } from '../types';
import { getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState } from './taskState';
/**
 * Reply shape used for messages sent to the page-side runner through
 * `sendPageRunnerMessage`. The same shape is synthesised locally (with
 * `ok: false` and an `error`) when the message cannot be delivered or the
 * reply is empty.
 */
interface PageRunnerResponse {
// Whether the step succeeded; when false (and no `interrupt` is set) the crawl loop marks the step as failed.
ok: boolean;
// Scraped payload; on success the crawl loop stores it as the step's `result`.
data?: DomScrapeResult | null;
// When present, the crawl loop pauses the task with this info; it is checked before `ok`.
interrupt?: CrawlPauseInfo;
// Failure description; the crawl loop falls back to a default message when it is absent.
error?: string;
}
/**
* 创建新的爬取任务,打开目标平台窗口,并把初始时间轴状态写入 storage。
*/
@@ -86,6 +93,29 @@ export async function cancelCrawl(): Promise<CrawlStateResponse> {
return { ok: true, data: canceledState };
}
/**
 * Resumes the crawl task that is currently paused, typically after the user
 * has dealt with a login prompt, captcha or risk-control page.
 *
 * When there is no stored task, or the stored task is not in the `paused`
 * status, nothing is written and the current state is returned as-is.
 *
 * @returns `ok: true` together with either the untouched state or the newly
 * stored running state.
 */
export async function resumeCrawl(): Promise<CrawlStateResponse> {
  const current = await getCrawlTaskState();

  // Only a paused task can be resumed; everything else is reported back unchanged.
  if (current?.status !== 'paused') {
    return { ok: true, data: current };
  }

  const activeIndex = current.currentStepIndex;

  const resumedState: CrawlTaskState = {
    ...current,
    status: 'running',
    pause: undefined,
    steps: current.steps.map((entry, position) => {
      if (position !== activeIndex) {
        return entry;
      }

      // Clear the pause message that was attached to the interrupted step.
      return { ...entry, status: 'running', message: undefined };
    }),
  };

  await setCrawlTaskState(resumedState);

  return { ok: true, data: resumedState };
}
/**
* 窗口关闭后,如果关闭的是爬取窗口,就把当前任务标记为取消。
*/
@@ -114,53 +144,81 @@ async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskSt
}
try {
const tabId = await getWindowActiveTabId(initialState.windowId);
for (let stepIndex = 0; stepIndex < platform.steps.length; stepIndex += 1) {
const step = platform.steps[stepIndex];
const currentState = await getCrawlTaskState();
let shouldRetryStep = true;
if (currentState?.id !== initialState.id || currentState.status !== 'running') {
return;
}
while (shouldRetryStep) {
const currentState = await getCrawlTaskState();
await updateCrawlTaskState(initialState.id, (state) => ({
...state,
currentStepIndex: stepIndex,
status: 'running',
steps: state.steps.map((item, index) => ({
...item,
status: index === stepIndex ? 'running' : item.status,
message: index === stepIndex ? undefined : item.message,
})),
}));
if (currentState?.id !== initialState.id || currentState.status === 'canceled') {
return;
}
await chrome.tabs.update(tabId, { url: step.url, active: true });
await waitForTabLoaded(tabId);
if (currentState.status === 'paused') {
const resumed = await waitUntilResumed(initialState.id);
const isReady = await waitForStepReady(tabId, step);
if (!resumed) {
return;
}
}
if (!isReady) {
await updateCrawlTaskState(initialState.id, (state) => ({
...state,
status: 'failed',
currentStepIndex: stepIndex,
status: 'running',
pause: undefined,
steps: state.steps.map((item, index) => ({
...item,
status: index === stepIndex ? 'running' : item.status,
message: index === stepIndex ? undefined : item.message,
})),
}));
const tabId = await getWindowActiveTabId(initialState.windowId);
await chrome.tabs.update(tabId, { url: step.url, active: true });
await waitForTabLoaded(tabId);
const response = await scrapeStepInContent(tabId, step);
if (response.interrupt) {
await pauseForInterrupt(initialState.id, stepIndex, response.interrupt);
const resumed = await waitUntilResumed(initialState.id);
if (!resumed) {
return;
}
continue;
}
if (!response.ok) {
const message = response.error ?? '页面抓取失败';
await updateCrawlTaskState(initialState.id, (state) => ({
...state,
status: 'failed',
currentStepIndex: stepIndex,
steps: state.steps.map((item, index) =>
index === stepIndex ? { ...item, status: 'failed', message } : item,
),
}));
return;
}
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, response.data);
await updateCrawlTaskState(initialState.id, (state) => ({
...state,
steps: state.steps.map((item, index) =>
index === stepIndex ? { ...item, status: 'failed', message: '页面关键 DOM 未加载完成' } : item,
index === stepIndex
? { ...item, status: 'success', message: undefined, result: response.data }
: item,
),
}));
return;
shouldRetryStep = false;
}
const data = await scrapeStepFields(tabId, step);
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, data);
await updateCrawlTaskState(initialState.id, (state) => ({
...state,
steps: state.steps.map((item, index) =>
index === stepIndex ? { ...item, status: 'success', message: undefined } : item,
),
}));
}
await updateCrawlTaskState(initialState.id, (state) => ({
@@ -198,55 +256,90 @@ async function getWindowActiveTabId(windowId: number): Promise<number> {
}
/**
* 等待步骤配置中的 checkSelector 出现;第一次超时后刷新页面再重试一次
* 让 content script 直接在目标页面执行检查和抓取
*/
async function waitForStepReady(tabId: number, step: PlatformStepConfig): Promise<boolean> {
if (await waitForSelector(tabId, step.checkSelector, 5000)) {
return true;
async function scrapeStepInContent(tabId: number, step: PlatformStepConfig): Promise<PageRunnerResponse> {
const startedAt = Date.now();
while (Date.now() - startedAt < 20000) {
const response = await sendPageRunnerMessage(tabId, {
action: 'SCRAPE_STEP',
payload: {
fields: step.fields,
checkSelector: step.checkSelector,
},
});
if (response.ok || response.interrupt || !isPageRunnerNotReadyError(response.error)) {
return response;
}
await sleep(500);
}
await chrome.tabs.reload(tabId);
await waitForTabLoaded(tabId);
return waitForSelector(tabId, step.checkSelector, 5000);
return { ok: false, error: '页面脚本未响应,请刷新扩展后重试' };
}
/**
* 目标页面轮询检查指定 selector 是否存在
* 目标页的 content script 发送页面执行消息
*/
async function waitForSelector(tabId: number, selector: string, timeoutMs: number): Promise<boolean> {
const startedAt = Date.now();
async function sendPageRunnerMessage(tabId: number, message: unknown): Promise<PageRunnerResponse> {
try {
const response = await chrome.tabs.sendMessage(tabId, message);
while (Date.now() - startedAt < timeoutMs) {
const results = await chrome.scripting.executeScript({
target: { tabId },
func: (targetSelector: string) => Boolean(document.querySelector(targetSelector)),
args: [selector],
});
if (response && typeof response === 'object') {
return response as PageRunnerResponse;
}
if (Boolean(results[0]?.result)) {
return { ok: false, error: '页面脚本返回为空' };
} catch (error: unknown) {
return { ok: false, error: error instanceof Error ? error.message : String(error) };
}
}
/**
 * Tells whether an error message only indicates that the content script has
 * not finished injecting yet (no listener on the receiving side).
 *
 * @param error - Error text to classify; may be omitted.
 * @returns `true` when the text matches one of the known "no receiver"
 * phrases (case-insensitive); `false` for a missing or empty message.
 */
function isPageRunnerNotReadyError(error?: string): boolean {
  const notReadyPattern = /receiving end does not exist|could not establish connection|no receiving end/i;

  return error ? notReadyPattern.test(error) : false;
}
/**
 * Puts the task into the `paused` status because of a login prompt, captcha
 * or other page anomaly reported for the current step.
 *
 * The interrupted step keeps the `running` status and carries the interrupt's
 * message.
 *
 * @param taskId - Id of the task to update.
 * @param stepIndex - Index of the step that was interrupted.
 * @param interrupt - Pause details stored on the task state.
 */
async function pauseForInterrupt(taskId: string, stepIndex: number, interrupt: CrawlPauseInfo): Promise<void> {
  await updateCrawlTaskState(taskId, (previous) => {
    return {
      ...previous,
      status: 'paused',
      pause: interrupt,
      currentStepIndex: stepIndex,
      steps: previous.steps.map((entry, position) => {
        if (position !== stepIndex) {
          return entry;
        }

        return { ...entry, status: 'running', message: interrupt.message };
      }),
    };
  });
}
/**
 * Waits while the task is paused and reports how the pause ended.
 *
 * The stored task state is polled once per second.
 *
 * @param taskId - Id of the task being waited on.
 * @returns `true` once the stored status is `running` again; `false` when the
 * state is missing, belongs to a different task, or has become `canceled` or
 * `failed`.
 */
async function waitUntilResumed(taskId: string): Promise<boolean> {
  while (true) {
    const state = await getCrawlTaskState();

    if (!state || state.id !== taskId || state.status === 'canceled' || state.status === 'failed') {
      return false;
    }

    if (state.status === 'running') {
      return true;
    }

    // One wait per poll, through the file's sleep() helper. The loop only
    // exits through the returns above, so no trailing return is needed.
    await sleep(1000);
  }
}
/**
 * Runs `scrapeDomFields` inside the target tab via
 * `chrome.scripting.executeScript`, passing the step's field configuration as
 * its single argument.
 *
 * @param tabId - Tab in which the scraper is executed.
 * @param step - Step whose `fields` describe what to extract.
 * @returns The `result` of the first injection result, or `null` when there
 * is none.
 */
async function scrapeStepFields(tabId: number, step: PlatformStepConfig): Promise<DomScrapeResult | null> {
  const [firstInjection] = await chrome.scripting.executeScript({
    target: { tabId },
    func: scrapeDomFields,
    args: [step.fields],
  });

  return firstInjection?.result ?? null;
}
/**
@@ -257,7 +350,7 @@ function createCrawlWindow(url: string): Promise<chrome.windows.Window> {
chrome.windows.create(
{
url,
type: 'popup',
type: 'normal',
focused: true,
width: 1280,
height: 900,
@@ -302,3 +395,12 @@ function waitForTabLoaded(tabId: number): Promise<void> {
chrome.tabs.onUpdated.addListener(handleUpdated);
});
}
/**
 * Minimal delay helper.
 *
 * @param ms - Delay in milliseconds handed to `setTimeout`.
 * @returns A promise that resolves once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    globalThis.setTimeout(() => wake(), ms);
  });
}

View File

@@ -1,5 +1,5 @@
import type { BackgroundCommand, BackgroundResponse, CrawlStateResponse } from '../types';
import { cancelCrawl, cancelCrawlWhenWindowRemoved, startCrawl } from './crawlTask';
import { cancelCrawl, cancelCrawlWhenWindowRemoved, resumeCrawl, startCrawl } from './crawlTask';
import { getCrawlTaskState } from './taskState';
/**
@@ -37,6 +37,8 @@ export async function handleBackgroundCommand(
return { ok: true, data: await getCrawlTaskState() };
case 'CANCEL_CRAWL':
return cancelCrawl();
case 'RESUME_CRAWL':
return resumeCrawl();
default:
return { ok: false, error: '未知的后台指令' };
}

View File

@@ -23,8 +23,14 @@ export interface CancelCrawlCommand {
action: 'CANCEL_CRAWL';
}
// Resumes the crawl task that is currently paused.
export interface ResumeCrawlCommand {
// Message action type: the user has handled the login/captcha, so background may go on and retry the current step.
action: 'RESUME_CRAWL';
}
// Every message type that the popup / content script can send to background.
// Declared exactly once; a second alias with the same name is a duplicate-identifier error.
export type BackgroundCommand =
  | StartCrawlCommand
  | GetCrawlStateCommand
  | CancelCrawlCommand
  | ResumeCrawlCommand;
// background 统一响应结构。
export interface BackgroundResponse<T = unknown> {