|
|
|
|
@@ -1,9 +1,16 @@
|
|
|
|
|
import { getPlatformById } from '@/config/platforms';
|
|
|
|
|
import type { CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
|
|
|
|
|
import { scrapeDomFields, type DomScrapeResult } from '../domScraper';
|
|
|
|
|
import type { CrawlPauseInfo, CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
|
|
|
|
|
import type { DomScrapeResult } from '../domScraper';
|
|
|
|
|
import type { CrawlStateResponse } from '../types';
|
|
|
|
|
import { getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState } from './taskState';
|
|
|
|
|
|
|
|
|
|
/**
 * Reply returned by the content-script page runner for a single step request.
 */
interface PageRunnerResponse {
  /** True when the page runner handled the request successfully. */
  ok: boolean;
  /** Scrape result for the step, when one was produced. */
  data?: DomScrapeResult | null;
  /** Set when the page needs user intervention; the crawl loop pauses the task on it. */
  interrupt?: CrawlPauseInfo;
  /** Failure description accompanying a non-ok reply. */
  error?: string;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 创建新的爬取任务,打开目标平台窗口,并把初始时间轴状态写入 storage。
|
|
|
|
|
*/
|
|
|
|
|
@@ -86,6 +93,29 @@ export async function cancelCrawl(): Promise<CrawlStateResponse> {
|
|
|
|
|
return { ok: true, data: canceledState };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Resumes the currently paused crawl task once the user has dealt with a
 * login prompt, captcha, or risk-control check.
 *
 * When no task is stored, or the stored task is not `paused`, nothing is
 * written and the state is returned untouched.
 *
 * @returns `ok: true` together with the resumed state, or with the unchanged
 *          state when there was nothing to resume.
 */
export async function resumeCrawl(): Promise<CrawlStateResponse> {
  const state = await getCrawlTaskState();

  if (!state || state.status !== 'paused') {
    return { ok: true, data: state };
  }

  const resumedState: CrawlTaskState = {
    ...state,
    status: 'running',
    // Clear the pause details so consumers no longer see an interrupt.
    pause: undefined,
    // Only the step that was interrupted is reset; its pause message is dropped.
    steps: state.steps.map((step, index) =>
      index === state.currentStepIndex ? { ...step, status: 'running', message: undefined } : step,
    ),
  };

  await setCrawlTaskState(resumedState);
  return { ok: true, data: resumedState };
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 窗口关闭后,如果关闭的是爬取窗口,就把当前任务标记为取消。
|
|
|
|
|
*/
|
|
|
|
|
@@ -114,53 +144,81 @@ async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskSt
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
const tabId = await getWindowActiveTabId(initialState.windowId);
|
|
|
|
|
|
|
|
|
|
for (let stepIndex = 0; stepIndex < platform.steps.length; stepIndex += 1) {
|
|
|
|
|
const step = platform.steps[stepIndex];
|
|
|
|
|
const currentState = await getCrawlTaskState();
|
|
|
|
|
let shouldRetryStep = true;
|
|
|
|
|
|
|
|
|
|
if (currentState?.id !== initialState.id || currentState.status !== 'running') {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
while (shouldRetryStep) {
|
|
|
|
|
const currentState = await getCrawlTaskState();
|
|
|
|
|
|
|
|
|
|
await updateCrawlTaskState(initialState.id, (state) => ({
|
|
|
|
|
...state,
|
|
|
|
|
currentStepIndex: stepIndex,
|
|
|
|
|
status: 'running',
|
|
|
|
|
steps: state.steps.map((item, index) => ({
|
|
|
|
|
...item,
|
|
|
|
|
status: index === stepIndex ? 'running' : item.status,
|
|
|
|
|
message: index === stepIndex ? undefined : item.message,
|
|
|
|
|
})),
|
|
|
|
|
}));
|
|
|
|
|
if (currentState?.id !== initialState.id || currentState.status === 'canceled') {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
await chrome.tabs.update(tabId, { url: step.url, active: true });
|
|
|
|
|
await waitForTabLoaded(tabId);
|
|
|
|
|
if (currentState.status === 'paused') {
|
|
|
|
|
const resumed = await waitUntilResumed(initialState.id);
|
|
|
|
|
|
|
|
|
|
const isReady = await waitForStepReady(tabId, step);
|
|
|
|
|
if (!resumed) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!isReady) {
|
|
|
|
|
await updateCrawlTaskState(initialState.id, (state) => ({
|
|
|
|
|
...state,
|
|
|
|
|
status: 'failed',
|
|
|
|
|
currentStepIndex: stepIndex,
|
|
|
|
|
status: 'running',
|
|
|
|
|
pause: undefined,
|
|
|
|
|
steps: state.steps.map((item, index) => ({
|
|
|
|
|
...item,
|
|
|
|
|
status: index === stepIndex ? 'running' : item.status,
|
|
|
|
|
message: index === stepIndex ? undefined : item.message,
|
|
|
|
|
})),
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
const tabId = await getWindowActiveTabId(initialState.windowId);
|
|
|
|
|
await chrome.tabs.update(tabId, { url: step.url, active: true });
|
|
|
|
|
await waitForTabLoaded(tabId);
|
|
|
|
|
|
|
|
|
|
const response = await scrapeStepInContent(tabId, step);
|
|
|
|
|
|
|
|
|
|
if (response.interrupt) {
|
|
|
|
|
await pauseForInterrupt(initialState.id, stepIndex, response.interrupt);
|
|
|
|
|
const resumed = await waitUntilResumed(initialState.id);
|
|
|
|
|
|
|
|
|
|
if (!resumed) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!response.ok) {
|
|
|
|
|
const message = response.error ?? '页面抓取失败';
|
|
|
|
|
|
|
|
|
|
await updateCrawlTaskState(initialState.id, (state) => ({
|
|
|
|
|
...state,
|
|
|
|
|
status: 'failed',
|
|
|
|
|
currentStepIndex: stepIndex,
|
|
|
|
|
steps: state.steps.map((item, index) =>
|
|
|
|
|
index === stepIndex ? { ...item, status: 'failed', message } : item,
|
|
|
|
|
),
|
|
|
|
|
}));
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, response.data);
|
|
|
|
|
|
|
|
|
|
await updateCrawlTaskState(initialState.id, (state) => ({
|
|
|
|
|
...state,
|
|
|
|
|
steps: state.steps.map((item, index) =>
|
|
|
|
|
index === stepIndex ? { ...item, status: 'failed', message: '页面关键 DOM 未加载完成' } : item,
|
|
|
|
|
index === stepIndex
|
|
|
|
|
? { ...item, status: 'success', message: undefined, result: response.data }
|
|
|
|
|
: item,
|
|
|
|
|
),
|
|
|
|
|
}));
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
shouldRetryStep = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const data = await scrapeStepFields(tabId, step);
|
|
|
|
|
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, data);
|
|
|
|
|
|
|
|
|
|
await updateCrawlTaskState(initialState.id, (state) => ({
|
|
|
|
|
...state,
|
|
|
|
|
steps: state.steps.map((item, index) =>
|
|
|
|
|
index === stepIndex ? { ...item, status: 'success', message: undefined } : item,
|
|
|
|
|
),
|
|
|
|
|
}));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
await updateCrawlTaskState(initialState.id, (state) => ({
|
|
|
|
|
@@ -198,55 +256,90 @@ async function getWindowActiveTabId(windowId: number): Promise<number> {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Asks the content script in the target tab to run the readiness check and
 * the scrape for one step.
 *
 * The request is retried every 500 ms for up to 20 s, but only while the
 * failure looks like "content script not injected yet". Any successful reply,
 * any interrupt, and any other error is returned to the caller immediately.
 *
 * @param tabId - Tab hosting the target page.
 * @param step - Step configuration; its `fields` and `checkSelector` are forwarded.
 * @returns The page runner's reply, or a non-ok response once the retry window elapses.
 */
async function scrapeStepInContent(tabId: number, step: PlatformStepConfig): Promise<PageRunnerResponse> {
  const startedAt = Date.now();

  while (Date.now() - startedAt < 20000) {
    const response = await sendPageRunnerMessage(tabId, {
      action: 'SCRAPE_STEP',
      payload: {
        fields: step.fields,
        checkSelector: step.checkSelector,
      },
    });

    // Keep polling only when the receiving end is missing; everything else is final.
    if (response.ok || response.interrupt || !isPageRunnerNotReadyError(response.error)) {
      return response;
    }

    await sleep(500);
  }

  return { ok: false, error: '页面脚本未响应,请刷新扩展后重试' };
}
|
|
|
|
|
|
|
|
|
|
/**
 * Sends a page-runner message to the content script of the given tab.
 *
 * This function never rejects: messaging failures are converted into a non-ok
 * response carrying the error text, so callers can inspect `error` instead of
 * wrapping the call in try/catch.
 *
 * @param tabId - Tab whose content script should receive the message.
 * @param message - Payload passed through to `chrome.tabs.sendMessage`.
 * @returns The content script's reply when it is an object; otherwise a non-ok response.
 */
async function sendPageRunnerMessage(tabId: number, message: unknown): Promise<PageRunnerResponse> {
  try {
    const response = await chrome.tabs.sendMessage(tabId, message);

    // The reply shape is trusted here; only "is it an object at all" is checked.
    if (response && typeof response === 'object') {
      return response as PageRunnerResponse;
    }

    return { ok: false, error: '页面脚本返回为空' };
  } catch (error: unknown) {
    return { ok: false, error: error instanceof Error ? error.message : String(error) };
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Tells whether an error message merely means that the content script has not
 * finished injecting yet (no listener on the receiving side).
 *
 * @param error - Error text from a failed message send; may be absent.
 * @returns True when the text matches one of the known "no receiver" messages
 *          (case-insensitive); false for missing or empty text and for any other error.
 */
function isPageRunnerNotReadyError(error?: string): boolean {
  if (!error) {
    return false;
  }

  return /receiving end does not exist|could not establish connection|no receiving end/i.test(error);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Puts the task into the `paused` state because of a login prompt, captcha,
 * or abnormal page.
 *
 * @param taskId - Id of the task to update.
 * @param stepIndex - Index of the step that was interrupted.
 * @param interrupt - Pause details; stored on the task and its message shown on the step.
 */
async function pauseForInterrupt(taskId: string, stepIndex: number, interrupt: CrawlPauseInfo): Promise<void> {
  await updateCrawlTaskState(taskId, (state) => ({
    ...state,
    status: 'paused',
    pause: interrupt,
    currentStepIndex: stepIndex,
    // The interrupted step keeps the 'running' status and only gains the interrupt message.
    steps: state.steps.map((step, index) =>
      index === stepIndex ? { ...step, status: 'running', message: interrupt.message } : step,
    ),
  }));
}
|
|
|
|
|
|
|
|
|
|
/**
 * After a pause, polls the stored task state until the user resumes or the
 * task stops being resumable.
 *
 * @param taskId - Id of the task that is waiting.
 * @returns True once the task is `running` again; false when the stored task
 *          is missing, belongs to a different id, or is `canceled` / `failed`.
 */
async function waitUntilResumed(taskId: string): Promise<boolean> {
  while (true) {
    const state = await getCrawlTaskState();

    if (!state || state.id !== taskId || state.status === 'canceled' || state.status === 'failed') {
      return false;
    }

    if (state.status === 'running') {
      return true;
    }

    // Any other status means the task is still waiting; check again in a second.
    await sleep(1000);
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Injects `scrapeDomFields` into the target tab and extracts data according
 * to the step's `fields` configuration.
 *
 * @param tabId - Tab to run the scraper in.
 * @param step - Step whose `fields` are passed to the scraper as its only argument.
 * @returns The result of the first injection entry, or null when none is available.
 */
async function scrapeStepFields(tabId: number, step: PlatformStepConfig): Promise<DomScrapeResult | null> {
  const results = await chrome.scripting.executeScript({
    target: { tabId },
    func: scrapeDomFields,
    args: [step.fields],
  });

  return results[0]?.result ?? null;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@@ -257,7 +350,7 @@ function createCrawlWindow(url: string): Promise<chrome.windows.Window> {
|
|
|
|
|
chrome.windows.create(
|
|
|
|
|
{
|
|
|
|
|
url,
|
|
|
|
|
type: 'popup',
|
|
|
|
|
type: 'normal',
|
|
|
|
|
focused: true,
|
|
|
|
|
width: 1280,
|
|
|
|
|
height: 900,
|
|
|
|
|
@@ -302,3 +395,12 @@ function waitForTabLoaded(tabId: number): Promise<void> {
|
|
|
|
|
chrome.tabs.onUpdated.addListener(handleUpdated);
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Small delay helper.
 *
 * @param ms - Delay in milliseconds handed to `setTimeout`.
 * @returns A promise that resolves once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => {
    globalThis.setTimeout(resolve, ms);
  });
}
|
|
|
|
|
|