Compare commits
10 Commits
8b9985873a
...
186840ba23
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
186840ba23 | ||
|
|
30f9467cc8 | ||
|
|
6587b0c1d9 | ||
|
|
d6dc8a0db4 | ||
|
|
a22f1a42e4 | ||
|
|
7e2a83efd6 | ||
|
|
40df507300 | ||
|
|
d78d70bde0 | ||
|
|
53e4f0b2f4 | ||
|
|
350d4fc2e2 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -9,6 +9,7 @@ lerna-debug.log*
|
||||
|
||||
node_modules
|
||||
dist
|
||||
storeai-extension-v0.1.0
|
||||
dist-ssr
|
||||
*.local
|
||||
|
||||
|
||||
@@ -19,5 +19,3 @@
|
||||
7.在窗口中记得显示一个取消按钮,点击后关闭窗口,取消爬取
|
||||
|
||||
|
||||
# 具体代码实现流程
|
||||
请阅读./step.md文档,并严格按照步骤进行执行
|
||||
@@ -27,4 +27,9 @@ export default defineManifest({
|
||||
service_worker: 'src/background/index.ts',
|
||||
type: 'module',
|
||||
},
|
||||
externally_connectable: {
|
||||
matches: [
|
||||
"http://localhost:3000/*",
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1,35 +1,37 @@
|
||||
import type { PlatformFieldConfig } from '@/types';
|
||||
import type {PlatformFieldConfig} from '@/types';
|
||||
|
||||
/** DOM 抓取后的通用结果结构。 */
|
||||
export type DomScrapeResult = Record<string, unknown>;
|
||||
|
||||
|
||||
/**
|
||||
* 在目标网页上下文中执行 DOM 抓取。
|
||||
*
|
||||
* 注意:该方法会通过 chrome.scripting.executeScript 注入到页面中执行,
|
||||
* 所以依赖的辅助方法都写在函数内部,避免注入后丢失模块作用域。
|
||||
* 等待重试机制
|
||||
*/
|
||||
export async function scrapeDomFields(fields: PlatformFieldConfig[]): Promise<DomScrapeResult | null> {
|
||||
if (!document.body) {
|
||||
return null;
|
||||
async function waitForElement(rootDom: ParentNode, selector: string) {
|
||||
let retryCount = 5;
|
||||
for (let i = 0; i < retryCount; i++) {
|
||||
const element = rootDom.querySelector(selector);
|
||||
if (element) {
|
||||
return element;
|
||||
}
|
||||
if (i < retryCount) {
|
||||
await sleep(500);
|
||||
}
|
||||
}
|
||||
|
||||
return processFields(fields, document.body);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
/** 睡眠工具,给点击、翻页、异步渲染留出等待时间。 */
|
||||
// 睡眠工具,给点击、翻页、异步渲染留出等待时间。
|
||||
const sleep = (ms?: number) => new Promise((resolve) => window.setTimeout(resolve, ms ?? 1500));
|
||||
|
||||
/** 从元素中提取实际值,默认取文本,也支持 attr、图片 src、链接 href。 */
|
||||
/**
|
||||
* 从元素中提取实际值,默认取文本,也支持 attr、图片 src、链接 href。
|
||||
*/
|
||||
function extractValue(el: Element | null, config: PlatformFieldConfig): string | null {
|
||||
if (!el) {
|
||||
return null;
|
||||
if (el == null) {
|
||||
return "未找到"
|
||||
}
|
||||
|
||||
if (config.attr) {
|
||||
return (el.getAttribute(config.attr) || '').trim();
|
||||
return (el.getAttribute(config.attr) || "").trim();
|
||||
}
|
||||
|
||||
const tagName = el.tagName.toUpperCase();
|
||||
@@ -46,30 +48,33 @@ function extractValue(el: Element | null, config: PlatformFieldConfig): string |
|
||||
return (el.textContent || '').replace(/\n/g, '').trim();
|
||||
}
|
||||
|
||||
/** 根据字段 condition 配置在指定 DOM 范围内自动点击目标元素。 */
|
||||
async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Promise<void> {
|
||||
/**
|
||||
* 自动点击
|
||||
* 根据字段 condition 配置在指定 DOM 范围内自动点击目标元素。
|
||||
*/
|
||||
async function autoClick(config: PlatformFieldConfig, rootDom: Element): Promise<void> {
|
||||
if (!config.condition) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const selector of config.condition.list) {
|
||||
const targets = Array.from(rootDom.querySelectorAll<HTMLElement>(selector));
|
||||
|
||||
for (const condition of config.condition.list) {
|
||||
let targets: HTMLElement[] = Array.from(rootDom.querySelectorAll(condition))
|
||||
for (const target of targets) {
|
||||
target.click();
|
||||
await sleep(config.condition.time);
|
||||
await sleep(config?.condition.time);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** 递归处理字段配置,支持普通字段、嵌套 row、列表和表格。 */
|
||||
async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
|
||||
const result: DomScrapeResult = {};
|
||||
/**
|
||||
* 递归处理字段配置,支持普通字段、嵌套 row、列表和表格。
|
||||
*/
|
||||
export async function processFields(columns: PlatformFieldConfig[], rootDom: Element) {
|
||||
const result = {} as any;
|
||||
|
||||
for (const item of columns) {
|
||||
await autoClick(item, rootDom);
|
||||
|
||||
const element = rootDom.querySelector(item.className);
|
||||
const element = await waitForElement(rootDom, item.className)
|
||||
|
||||
if (!element) {
|
||||
result[item.label] = '没找到该元素';
|
||||
@@ -100,9 +105,13 @@ async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode
|
||||
return result;
|
||||
}
|
||||
|
||||
/** 按列表配置抓取所有列表项,并按分页配置继续翻页。 */
|
||||
async function processList(config: PlatformFieldConfig, rootDom: ParentNode): Promise<DomScrapeResult[]> {
|
||||
const allList: DomScrapeResult[] = [];
|
||||
/**
|
||||
* 提取列表的数据
|
||||
* @param config 配置
|
||||
* @param rootDom 父节点
|
||||
*/
|
||||
async function processList(config: PlatformFieldConfig, rootDom: ParentNode) {
|
||||
const allList = [];
|
||||
let pageCount = 0;
|
||||
|
||||
while (true) {
|
||||
@@ -148,55 +157,52 @@ async function processList(config: PlatformFieldConfig, rootDom: ParentNode): Pr
|
||||
return allList;
|
||||
}
|
||||
|
||||
/** 按表格配置抓取表格行数据,并按分页配置继续翻页。 */
|
||||
async function processTable(config: PlatformFieldConfig, rootDom: ParentNode): Promise<DomScrapeResult[]> {
|
||||
const allTableData: DomScrapeResult[] = [];
|
||||
/**
|
||||
* 按表格配置抓取表格行数据,并按分页配置继续翻页。
|
||||
*/
|
||||
async function processTable(config: PlatformFieldConfig, rootDom: ParentNode) {
|
||||
const allTableData: any[] = [];
|
||||
let pageCount = 0;
|
||||
|
||||
while (true) {
|
||||
pageCount += 1;
|
||||
|
||||
const partsNodes: Record<string, Element[]> = {};
|
||||
const partsNodes: any = {};
|
||||
|
||||
for (const part of config.tableParts ?? []) {
|
||||
const partKey = part.name ?? part.label;
|
||||
const partSelector = part.select ?? part.className;
|
||||
const rowSelector = part.rowSelector ?? `${partSelector} tr`;
|
||||
partsNodes[partKey] = Array.from(rootDom.querySelectorAll(rowSelector));
|
||||
}
|
||||
config.tableParts!.forEach(part => {
|
||||
partsNodes[part.name as any] = rootDom.querySelectorAll(`${part.select} tr`);
|
||||
});
|
||||
|
||||
const firstPart = config.tableParts?.[0];
|
||||
const firstPartKey = firstPart ? firstPart.name ?? firstPart.label : '';
|
||||
const rowCount = partsNodes[firstPartKey]?.length || 0;
|
||||
// //以第一个part的行数为准,进行横向扫描
|
||||
const rowCount = partsNodes[config.tableParts![0].name!]?.length || 0
|
||||
|
||||
for (let index = 0; index < rowCount; index += 1) {
|
||||
const rowData: DomScrapeResult = {};
|
||||
|
||||
for (const keyItem of config.keys ?? []) {
|
||||
const partKey = keyItem.part ?? firstPartKey;
|
||||
const targetRowNode = partsNodes[partKey]?.[index];
|
||||
for (let i = 0; i < rowCount; i++) {
|
||||
const rowData: any = {};
|
||||
|
||||
if (!targetRowNode) {
|
||||
continue;
|
||||
}
|
||||
//遍历keys,根据part映射,取对应的里面找
|
||||
for (const keyItem of config.keys!) {
|
||||
const targetRowNode = partsNodes[keyItem.part!][i];
|
||||
|
||||
if (keyItem.keys) {
|
||||
rowData[keyItem.label] = await processFields(keyItem.keys, targetRowNode);
|
||||
} else {
|
||||
rowData[keyItem.label] = extractValue(targetRowNode.querySelector(keyItem.className), keyItem);
|
||||
if (targetRowNode) {
|
||||
//提取值
|
||||
if (keyItem.keys) {
|
||||
rowData[keyItem.label] = await processFields(keyItem.keys, targetRowNode)
|
||||
} else {
|
||||
rowData[keyItem.label] = extractValue(targetRowNode.querySelector(keyItem.className), keyItem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
allTableData.push(rowData);
|
||||
}
|
||||
|
||||
if (!config.pagination) {
|
||||
console.log('未配置分页信息,抓取单页后结束。');
|
||||
console.log("未配置分页信息,抓取单页后结束。");
|
||||
break;
|
||||
}
|
||||
|
||||
if (config.pagination.maxPage && pageCount >= config.pagination.maxPage) {
|
||||
console.log('已达到配置的最大页数,停止。');
|
||||
console.log("已达到配置的最大页数,停止。");
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -221,4 +227,4 @@ async function processTable(config: PlatformFieldConfig, rootDom: ParentNode): P
|
||||
}
|
||||
|
||||
return allTableData;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,32 +2,45 @@ import { handleBackgroundCommand, handleInstalled, handleStartup, handleWindowRe
|
||||
import type { BackgroundCommand } from './types';
|
||||
|
||||
chrome.runtime.onInstalled.addListener(() => {
|
||||
void handleInstalled();
|
||||
void handleInstalled();
|
||||
});
|
||||
|
||||
chrome.runtime.onStartup.addListener(() => {
|
||||
void handleStartup();
|
||||
void handleStartup();
|
||||
});
|
||||
|
||||
chrome.runtime.onMessage.addListener((message: BackgroundCommand, _sender, sendResponse) => {
|
||||
void handleBackgroundMessage(message, sendResponse);
|
||||
return true;
|
||||
void handleBackgroundMessage(message, sendResponse);
|
||||
return true;
|
||||
});
|
||||
|
||||
chrome.windows.onRemoved.addListener((windowId) => {
|
||||
void handleWindowRemoved(windowId);
|
||||
void handleWindowRemoved(windowId);
|
||||
});
|
||||
|
||||
/** 统一包装后台消息处理,确保异步错误能回给调用方。 */
|
||||
chrome.runtime.onMessageExternal.addListener((message, _sender, sendResponse) => {
|
||||
if (message.type === 'STORE_AI_PING') {
|
||||
sendResponse({
|
||||
success: true,
|
||||
version: chrome.runtime.getManifest().version,
|
||||
});
|
||||
}
|
||||
|
||||
return true;
|
||||
});
|
||||
|
||||
/**
|
||||
* Wrap background command handling so async errors can still be returned to callers.
|
||||
*/
|
||||
async function handleBackgroundMessage(
|
||||
message: BackgroundCommand,
|
||||
sendResponse: (response?: unknown) => void,
|
||||
message: BackgroundCommand,
|
||||
sendResponse: (response?: unknown) => void,
|
||||
) {
|
||||
try {
|
||||
const result = await handleBackgroundCommand(message);
|
||||
sendResponse(result);
|
||||
} catch (error: unknown) {
|
||||
const messageText = error instanceof Error ? error.message : 'Unknown error';
|
||||
sendResponse({ ok: false, error: messageText });
|
||||
}
|
||||
try {
|
||||
const result = await handleBackgroundCommand(message);
|
||||
sendResponse(result);
|
||||
} catch (error: unknown) {
|
||||
const messageText = error instanceof Error ? error.message : 'Unknown error';
|
||||
sendResponse({ ok: false, data: null, error: messageText });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,410 +1 @@
|
||||
import { getPlatformById } from '@/config/platforms';
|
||||
import type { CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
|
||||
import { scrapeDomFields, type DomScrapeResult } from './domScraper';
|
||||
import type { BackgroundCommand, BackgroundResponse, CrawlStateResponse } from './types';
|
||||
|
||||
/** chrome.storage.local 中保存当前爬取任务状态的键名。 */
|
||||
const CRAWL_TASK_STORAGE_KEY = 'crawlTaskState';
|
||||
|
||||
/** 扩展安装完成时的初始化入口,当前仅保留日志方便调试生命周期。 */
|
||||
export async function handleInstalled(): Promise<void> {
|
||||
console.log('[background] installed');
|
||||
}
|
||||
|
||||
/** 浏览器启动并加载扩展时的初始化入口,当前仅保留日志方便调试生命周期。 */
|
||||
export async function handleStartup(): Promise<void> {
|
||||
console.log('[background] startup');
|
||||
}
|
||||
|
||||
/** 监听窗口关闭事件;如果关闭的是爬取窗口,就把当前任务标记为取消。 */
|
||||
export async function handleWindowRemoved(windowId: number): Promise<void> {
|
||||
console.log('[background] window removed', windowId);
|
||||
|
||||
/** 当前保存的爬取任务状态。 */
|
||||
const state = await getCrawlTaskState();
|
||||
|
||||
if (state?.windowId === windowId && state.status === 'running') {
|
||||
await setCrawlTaskState({
|
||||
...state,
|
||||
status: 'canceled',
|
||||
steps: state.steps.map((step, index) =>
|
||||
index === state.currentStepIndex ? { ...step, status: 'failed', message: '爬取窗口已关闭' } : step,
|
||||
),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/** 根据 popup/content 发来的 action 分发到对应的后台处理函数。 */
|
||||
export async function handleBackgroundCommand(
|
||||
message: BackgroundCommand,
|
||||
): Promise<BackgroundResponse | CrawlStateResponse> {
|
||||
switch (message.action) {
|
||||
case 'START_CRAWL':
|
||||
return startCrawl(message.payload.platformId);
|
||||
case 'GET_CRAWL_STATE':
|
||||
return { ok: true, data: await getCrawlTaskState() };
|
||||
case 'CANCEL_CRAWL':
|
||||
return cancelCrawl();
|
||||
default:
|
||||
return { ok: false, error: '未知的后台指令' };
|
||||
}
|
||||
}
|
||||
|
||||
/** 创建新的爬取任务,打开目标平台窗口,并把初始时间轴状态写入 storage。 */
|
||||
async function startCrawl(platformId: string): Promise<CrawlStateResponse> {
|
||||
/** 根据平台 ID 找到对应的平台爬取配置。 */
|
||||
const platform = getPlatformById(platformId);
|
||||
|
||||
if (!platform) {
|
||||
return { ok: false, error: '平台配置不存在' };
|
||||
}
|
||||
|
||||
/** 当前任务的开始时间戳,用于计算正计时。 */
|
||||
const startedAt = Date.now();
|
||||
/** 窗口创建前的初始任务状态,先写入 storage 让所有页面能立即感知爬取开始。 */
|
||||
const nextState: CrawlTaskState = {
|
||||
id: `${platform.id}-${startedAt}`,
|
||||
platformId: platform.id,
|
||||
platformName: platform.name,
|
||||
startedAt,
|
||||
status: 'running',
|
||||
currentStepIndex: 0,
|
||||
steps: platform.steps.map<CrawlProgressStep>((step, index) => ({
|
||||
name: step.name,
|
||||
uniqueKey: step.uniqueKey,
|
||||
status: index === 0 ? 'running' : 'pending',
|
||||
})),
|
||||
};
|
||||
|
||||
await setCrawlTaskState(nextState);
|
||||
|
||||
try {
|
||||
/** background 创建出来的目标平台窗口信息。 */
|
||||
const windowInfo = await createCrawlWindow(platform.baseUrl);
|
||||
/** 补充 windowId 后的任务状态,后续可用于取消或监听窗口关闭。 */
|
||||
const stateWithWindow = { ...nextState, windowId: windowInfo.id };
|
||||
await setCrawlTaskState(stateWithWindow);
|
||||
void runCrawlSteps(platform, stateWithWindow);
|
||||
return { ok: true, data: stateWithWindow };
|
||||
} catch (error: unknown) {
|
||||
/** 窗口创建失败时写入的失败状态,供 popup/content 显示错误进度。 */
|
||||
const failedState: CrawlTaskState = {
|
||||
...nextState,
|
||||
status: 'failed',
|
||||
steps: nextState.steps.map((step, index) =>
|
||||
index === 0 ? { ...step, status: 'failed', message: '打开平台窗口失败' } : step,
|
||||
),
|
||||
};
|
||||
await setCrawlTaskState(failedState);
|
||||
return { ok: false, data: failedState, error: error instanceof Error ? error.message : '打开平台窗口失败' };
|
||||
}
|
||||
}
|
||||
|
||||
/** 按平台 steps 顺序执行页面跳转、DOM 等待、字段抓取和进度更新。 */
|
||||
async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskState): Promise<void> {
|
||||
if (!initialState.windowId) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
/** 新窗口中的目标标签页 ID,后续所有跳转和脚本注入都依赖它。 */
|
||||
const tabId = await getWindowActiveTabId(initialState.windowId);
|
||||
|
||||
for (let stepIndex = 0; stepIndex < platform.steps.length; stepIndex += 1) {
|
||||
/** 当前正在执行的平台页面步骤配置。 */
|
||||
const step = platform.steps[stepIndex];
|
||||
|
||||
if (!(await isTaskRunning(initialState.id))) {
|
||||
return;
|
||||
}
|
||||
|
||||
await markStepRunning(initialState.id, stepIndex);
|
||||
await openStepPage(tabId, step.url);
|
||||
|
||||
/** 当前页面核心 DOM 是否已经出现。 */
|
||||
const isReady = await waitForStepReady(tabId, step);
|
||||
|
||||
if (!isReady) {
|
||||
await markStepFailed(initialState.id, stepIndex, '页面关键 DOM 未加载完成');
|
||||
await markTaskFailed(initialState.id);
|
||||
return;
|
||||
}
|
||||
|
||||
/** 注入页面执行后的字段抓取结果。 */
|
||||
const data = await scrapeStepFields(tabId, step);
|
||||
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, data);
|
||||
await markStepSuccess(initialState.id, stepIndex);
|
||||
}
|
||||
|
||||
await markTaskCompleted(initialState.id);
|
||||
} catch (error: unknown) {
|
||||
console.error('[crawl] 执行失败', error);
|
||||
await markTaskFailed(initialState.id, error instanceof Error ? error.message : '爬取执行失败');
|
||||
}
|
||||
}
|
||||
|
||||
/** 获取指定窗口中的活动 tab ID。 */
|
||||
async function getWindowActiveTabId(windowId: number): Promise<number> {
|
||||
/** 指定窗口中查询到的标签页列表。 */
|
||||
const tabs = await chrome.tabs.query({ windowId, active: true });
|
||||
/** 当前窗口里用于承载爬取页面的活动标签页。 */
|
||||
const tab = tabs[0];
|
||||
|
||||
if (!tab?.id) {
|
||||
throw new Error('未找到爬取窗口中的标签页');
|
||||
}
|
||||
|
||||
return tab.id;
|
||||
}
|
||||
|
||||
/** 打开某个 steps 页面,并等待浏览器报告 tab 加载完成。 */
|
||||
async function openStepPage(tabId: number, url: string): Promise<void> {
|
||||
await chrome.tabs.update(tabId, { url, active: true });
|
||||
await waitForTabLoaded(tabId);
|
||||
}
|
||||
|
||||
/** 等待 tab 完成页面加载。 */
|
||||
function waitForTabLoaded(tabId: number): Promise<void> {
|
||||
return new Promise((resolve) => {
|
||||
/** 页面加载兜底定时器,避免某些站点不触发 complete 时流程永久挂起。 */
|
||||
const timeout = globalThis.setTimeout(() => {
|
||||
chrome.tabs.onUpdated.removeListener(handleUpdated);
|
||||
resolve();
|
||||
}, 15000);
|
||||
|
||||
/** chrome.tabs.onUpdated 的监听器,用于捕获指定 tab 的 complete 状态。 */
|
||||
function handleUpdated(updatedTabId: number, changeInfo: { status?: string }) {
|
||||
if (updatedTabId === tabId && changeInfo.status === 'complete') {
|
||||
globalThis.clearTimeout(timeout);
|
||||
chrome.tabs.onUpdated.removeListener(handleUpdated);
|
||||
resolve();
|
||||
}
|
||||
}
|
||||
|
||||
chrome.tabs.onUpdated.addListener(handleUpdated);
|
||||
});
|
||||
}
|
||||
|
||||
/** 等待步骤配置中的 checkSelector 出现;第一次超时后刷新页面再重试一次。 */
|
||||
async function waitForStepReady(tabId: number, step: PlatformStepConfig): Promise<boolean> {
|
||||
if (await waitForSelector(tabId, step.checkSelector, 5000)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
await chrome.tabs.reload(tabId);
|
||||
await waitForTabLoaded(tabId);
|
||||
return waitForSelector(tabId, step.checkSelector, 5000);
|
||||
}
|
||||
|
||||
/** 在目标页面轮询检查指定 selector 是否存在。 */
|
||||
async function waitForSelector(tabId: number, selector: string, timeoutMs: number): Promise<boolean> {
|
||||
/** 轮询开始时间,用于控制最大等待时长。 */
|
||||
const startedAt = Date.now();
|
||||
|
||||
while (Date.now() - startedAt < timeoutMs) {
|
||||
/** 当前页面是否已经能查询到目标元素。 */
|
||||
const exists = await checkSelectorExists(tabId, selector);
|
||||
|
||||
if (exists) {
|
||||
return true;
|
||||
}
|
||||
|
||||
await sleep(500);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/** 注入轻量脚本检查页面里是否存在指定 selector。 */
|
||||
async function checkSelectorExists(tabId: number, selector: string): Promise<boolean> {
|
||||
/** chrome.scripting.executeScript 返回的注入执行结果。 */
|
||||
const results = await chrome.scripting.executeScript({
|
||||
target: { tabId },
|
||||
func: (targetSelector: string) => Boolean(document.querySelector(targetSelector)),
|
||||
args: [selector],
|
||||
});
|
||||
|
||||
return Boolean(results[0]?.result);
|
||||
}
|
||||
|
||||
/** 注入 domScraper 到目标页面,并根据当前 step.fields 提取页面数据。 */
|
||||
async function scrapeStepFields(tabId: number, step: PlatformStepConfig): Promise<DomScrapeResult | null> {
|
||||
/** 目标页面执行 DOM 抓取后返回的结果数组。 */
|
||||
const results = await chrome.scripting.executeScript({
|
||||
target: { tabId },
|
||||
func: scrapeDomFields,
|
||||
args: [step.fields],
|
||||
});
|
||||
|
||||
return results[0]?.result ?? null;
|
||||
}
|
||||
|
||||
/** 判断指定任务是否仍处于 running 状态。 */
|
||||
async function isTaskRunning(taskId: string): Promise<boolean> {
|
||||
/** 当前 storage 中的任务状态。 */
|
||||
const state = await getCrawlTaskState();
|
||||
return state?.id === taskId && state.status === 'running';
|
||||
}
|
||||
|
||||
/** 将指定步骤标记为运行中,同时把其它未完成步骤保持为等待。 */
|
||||
async function markStepRunning(taskId: string, stepIndex: number): Promise<void> {
|
||||
await updateCrawlTaskState(taskId, (state) => ({
|
||||
...state,
|
||||
currentStepIndex: stepIndex,
|
||||
status: 'running',
|
||||
steps: state.steps.map((step, index) => ({
|
||||
...step,
|
||||
status: index === stepIndex ? 'running' : step.status,
|
||||
message: index === stepIndex ? undefined : step.message,
|
||||
})),
|
||||
}));
|
||||
}
|
||||
|
||||
/** 将指定步骤标记为成功。 */
|
||||
async function markStepSuccess(taskId: string, stepIndex: number): Promise<void> {
|
||||
await updateCrawlTaskState(taskId, (state) => ({
|
||||
...state,
|
||||
steps: state.steps.map((step, index) =>
|
||||
index === stepIndex ? { ...step, status: 'success', message: undefined } : step,
|
||||
),
|
||||
}));
|
||||
}
|
||||
|
||||
/** 将指定步骤标记为失败,并记录失败原因。 */
|
||||
async function markStepFailed(taskId: string, stepIndex: number, message: string): Promise<void> {
|
||||
await updateCrawlTaskState(taskId, (state) => ({
|
||||
...state,
|
||||
currentStepIndex: stepIndex,
|
||||
steps: state.steps.map((step, index) =>
|
||||
index === stepIndex ? { ...step, status: 'failed', message } : step,
|
||||
),
|
||||
}));
|
||||
}
|
||||
|
||||
/** 将整个任务标记为完成。 */
|
||||
async function markTaskCompleted(taskId: string): Promise<void> {
|
||||
await updateCrawlTaskState(taskId, (state) => ({
|
||||
...state,
|
||||
status: 'completed',
|
||||
steps: state.steps.map((step) => (step.status === 'running' ? { ...step, status: 'success' } : step)),
|
||||
}));
|
||||
}
|
||||
|
||||
/** 将整个任务标记为失败。 */
|
||||
async function markTaskFailed(taskId: string, message = '爬取失败'): Promise<void> {
|
||||
await updateCrawlTaskState(taskId, (state) => ({
|
||||
...state,
|
||||
status: 'failed',
|
||||
steps: state.steps.map((step, index) =>
|
||||
index === state.currentStepIndex && step.status === 'running' ? { ...step, status: 'failed', message } : step,
|
||||
),
|
||||
}));
|
||||
}
|
||||
|
||||
/** 读取任务状态后执行不可变更新,避免覆盖已取消或已替换的任务。 */
|
||||
async function updateCrawlTaskState(
|
||||
taskId: string,
|
||||
updater: (state: CrawlTaskState) => CrawlTaskState,
|
||||
): Promise<void> {
|
||||
/** 当前 storage 中最新的任务状态。 */
|
||||
const state = await getCrawlTaskState();
|
||||
|
||||
if (!state || state.id !== taskId || state.status === 'canceled') {
|
||||
return;
|
||||
}
|
||||
|
||||
await setCrawlTaskState(updater(state));
|
||||
}
|
||||
|
||||
/** 睡眠工具,用于轮询 DOM 等待。 */
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => {
|
||||
globalThis.setTimeout(resolve, ms);
|
||||
});
|
||||
}
|
||||
|
||||
/** 取消当前爬取任务,并尝试关闭正在爬取的平台窗口。 */
|
||||
async function cancelCrawl(): Promise<CrawlStateResponse> {
|
||||
/** 当前保存的爬取任务状态。 */
|
||||
const state = await getCrawlTaskState();
|
||||
|
||||
if (!state) {
|
||||
return { ok: true, data: null };
|
||||
}
|
||||
|
||||
/** 用户取消后的任务状态,当前执行步骤会显示为失败并附带取消原因。 */
|
||||
const canceledState: CrawlTaskState = {
|
||||
...state,
|
||||
status: 'canceled',
|
||||
steps: state.steps.map((step, index) =>
|
||||
index === state.currentStepIndex ? { ...step, status: 'failed', message: '用户已取消' } : step,
|
||||
),
|
||||
};
|
||||
|
||||
await setCrawlTaskState(canceledState);
|
||||
|
||||
if (state.windowId) {
|
||||
await removeWindow(state.windowId);
|
||||
}
|
||||
|
||||
return { ok: true, data: canceledState };
|
||||
}
|
||||
|
||||
/** 从 chrome.storage.local 读取当前爬取任务状态。 */
|
||||
async function getCrawlTaskState(): Promise<CrawlTaskState | null> {
|
||||
/** chrome.storage.local 返回的原始键值对象。 */
|
||||
const result = await chrome.storage.local.get(CRAWL_TASK_STORAGE_KEY);
|
||||
/** 取出的任务状态候选值,需要经过结构校验后才能使用。 */
|
||||
const state = result[CRAWL_TASK_STORAGE_KEY];
|
||||
return isCrawlTaskState(state) ? state : null;
|
||||
}
|
||||
|
||||
/** 将最新爬取任务状态写入 chrome.storage.local,供 popup 和 content script 同步读取。 */
|
||||
async function setCrawlTaskState(state: CrawlTaskState): Promise<void> {
|
||||
await chrome.storage.local.set({ [CRAWL_TASK_STORAGE_KEY]: state });
|
||||
}
|
||||
|
||||
/** 打开一个普通浏览器窗口承载目标平台页面。 */
|
||||
function createCrawlWindow(url: string): Promise<chrome.windows.Window> {
|
||||
return new Promise((resolve, reject) => {
|
||||
chrome.windows.create(
|
||||
{
|
||||
url,
|
||||
type: 'normal',
|
||||
focused: true,
|
||||
width: 1280,
|
||||
height: 900,
|
||||
},
|
||||
(windowInfo) => {
|
||||
/** Chrome 扩展 API 回调中的运行时错误。 */
|
||||
const runtimeError = chrome.runtime.lastError;
|
||||
|
||||
if (runtimeError) {
|
||||
reject(new Error(runtimeError.message));
|
||||
return;
|
||||
}
|
||||
|
||||
if (!windowInfo?.id) {
|
||||
reject(new Error('窗口创建失败'));
|
||||
return;
|
||||
}
|
||||
|
||||
resolve(windowInfo);
|
||||
},
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
/** 根据窗口 ID 关闭爬取窗口;关闭失败时不阻塞取消状态写入。 */
|
||||
function removeWindow(windowId: number): Promise<void> {
|
||||
return new Promise((resolve) => {
|
||||
chrome.windows.remove(windowId, () => {
|
||||
resolve();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/** 粗略判断 storage 中读取到的值是否像一个爬取任务状态对象。 */
|
||||
function isCrawlTaskState(value: unknown): value is CrawlTaskState {
|
||||
return typeof value === 'object' && value !== null && 'id' in value && 'steps' in value;
|
||||
}
|
||||
export { handleBackgroundCommand, handleInstalled, handleStartup, handleWindowRemoved } from './service/lifecycle';
|
||||
|
||||
561
src/background/service/crawlTask.ts
Normal file
561
src/background/service/crawlTask.ts
Normal file
@@ -0,0 +1,561 @@
|
||||
import { getPlatformById } from '@/config/platforms';
|
||||
import type { CrawlPauseInfo, CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
|
||||
import type { DomScrapeResult } from '../domScraper';
|
||||
import type { CrawlStateResponse } from '../types';
|
||||
import { clearCrawlTaskState, getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState } from './taskState';
|
||||
|
||||
interface PageRunnerResponse {
|
||||
ok: boolean;
|
||||
data?: DomScrapeResult | null;
|
||||
interrupt?: CrawlPauseInfo;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
const activeCrawlControllers = new Map<string, AbortController>();
|
||||
|
||||
/**
|
||||
* 创建新的爬取任务,打开目标平台窗口,并把初始时间轴状态写入 storage。
|
||||
*/
|
||||
export async function startCrawl(platformId: string): Promise<CrawlStateResponse> {
|
||||
const platform = getPlatformById(platformId);
|
||||
|
||||
if (!platform) {
|
||||
return { ok: false, error: '平台配置不存在' };
|
||||
}
|
||||
|
||||
const firstStep = platform.steps[0];
|
||||
|
||||
if (!firstStep) {
|
||||
return { ok: false, error: '平台未配置爬取步骤' };
|
||||
}
|
||||
|
||||
const startedAt = Date.now();
|
||||
const nextState: CrawlTaskState = {
|
||||
id: `${platform.id}-${startedAt}`,
|
||||
platformId: platform.id,
|
||||
platformName: platform.name,
|
||||
startedAt,
|
||||
status: 'running',
|
||||
currentStepIndex: 0,
|
||||
steps: platform.steps.map<CrawlProgressStep>((step, index) => ({
|
||||
name: step.name,
|
||||
uniqueKey: step.uniqueKey,
|
||||
status: index === 0 ? 'running' : 'pending',
|
||||
})),
|
||||
};
|
||||
|
||||
await setCrawlTaskState(nextState);
|
||||
|
||||
try {
|
||||
const windowInfo = await createCrawlWindow(firstStep.url);
|
||||
const stateWithWindow = { ...nextState, windowId: windowInfo.id };
|
||||
const controller = new AbortController();
|
||||
|
||||
await setCrawlTaskState(stateWithWindow);
|
||||
activeCrawlControllers.set(stateWithWindow.id, controller);
|
||||
void runCrawlSteps(platform, stateWithWindow, controller.signal).finally(() => {
|
||||
activeCrawlControllers.delete(stateWithWindow.id);
|
||||
});
|
||||
|
||||
return { ok: true, data: stateWithWindow };
|
||||
} catch (error: unknown) {
|
||||
const failedState: CrawlTaskState = {
|
||||
...nextState,
|
||||
status: 'failed',
|
||||
steps: nextState.steps.map((step, index) =>
|
||||
index === 0 ? { ...step, status: 'failed', message: '打开平台窗口失败' } : step,
|
||||
),
|
||||
};
|
||||
|
||||
await setCrawlTaskState(failedState);
|
||||
return { ok: false, data: failedState, error: error instanceof Error ? error.message : '打开平台窗口失败' };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 取消当前爬取任务,并尝试关闭正在爬取的平台窗口。
|
||||
*/
|
||||
export async function cancelCrawl(): Promise<CrawlStateResponse> {
|
||||
const state = await getCrawlTaskState();
|
||||
|
||||
if (!state) {
|
||||
return { ok: true, data: null };
|
||||
}
|
||||
|
||||
abortActiveCrawl(state.id);
|
||||
|
||||
await clearCrawlTaskState();
|
||||
|
||||
if (state.windowId) {
|
||||
await chrome.windows.remove(state.windowId).catch(() => undefined);
|
||||
}
|
||||
|
||||
return { ok: true, data: null };
|
||||
}
|
||||
|
||||
/**
|
||||
* 用户处理完登录、验证码或风控后,恢复当前暂停中的爬取任务。
|
||||
*/
|
||||
export async function resumeCrawl(): Promise<CrawlStateResponse> {
|
||||
const state = await getCrawlTaskState();
|
||||
|
||||
if (!state || state.status !== 'paused') {
|
||||
return { ok: true, data: state };
|
||||
}
|
||||
|
||||
const resumedState: CrawlTaskState = {
|
||||
...state,
|
||||
status: 'running',
|
||||
pause: undefined,
|
||||
steps: state.steps.map((step, index) =>
|
||||
index === state.currentStepIndex ? { ...step, status: 'running', message: undefined } : step,
|
||||
),
|
||||
};
|
||||
|
||||
await setCrawlTaskState(resumedState);
|
||||
return { ok: true, data: resumedState };
|
||||
}
|
||||
|
||||
/**
|
||||
* 窗口关闭后,如果关闭的是爬取窗口,就把当前任务标记为取消。
|
||||
*/
|
||||
export async function cancelCrawlWhenWindowRemoved(windowId: number): Promise<void> {
|
||||
const state = await getCrawlTaskState();
|
||||
|
||||
if (state?.windowId !== windowId || !['running', 'paused'].includes(state.status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
abortActiveCrawl(state.id);
|
||||
|
||||
await setCrawlTaskState({
|
||||
...state,
|
||||
status: 'canceled',
|
||||
steps: state.steps.map((step, index) =>
|
||||
index === state.currentStepIndex ? { ...step, status: 'failed', message: '爬取窗口已关闭' } : step,
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
export async function cancelStaleCrawlWhenWindowMissing(): Promise<void> {
|
||||
const state = await getCrawlTaskState();
|
||||
|
||||
if (!state || !['running', 'paused'].includes(state.status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
const isWindowAlive = state.windowId ? await hasWindow(state.windowId) : false;
|
||||
|
||||
if (isWindowAlive) {
|
||||
return;
|
||||
}
|
||||
|
||||
abortActiveCrawl(state.id);
|
||||
|
||||
await setCrawlTaskState({
|
||||
...state,
|
||||
status: 'canceled',
|
||||
steps: state.steps.map((step, index) =>
|
||||
index === state.currentStepIndex ? { ...step, status: 'failed', message: '爬取窗口已关闭,任务已取消' } : step,
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
function abortActiveCrawl(taskId: string): void {
|
||||
activeCrawlControllers.get(taskId)?.abort();
|
||||
}
|
||||
|
||||
/**
 * Executes the platform's steps in order: navigates the crawl window's active tab to the
 * step URL, waits for the tab to load, asks the content script to scrape the step, and
 * records per-step progress in the stored task state.
 *
 * The run stops silently (without writing a final status) when the signal is aborted, when
 * the stored task is no longer the one identified by `initialState.id`, or when the stored
 * task is canceled. A step whose scrape reports an interrupt pauses the task and is retried
 * once the task is resumed. A failed scrape marks the step and the task as `failed`.
 *
 * @param platform Platform configuration whose `steps` are executed sequentially.
 * @param initialState Task snapshot taken at start; provides the task id and the window id.
 * @param signal Abort signal used to cancel waits and stop the run.
 */
async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskState, signal: AbortSignal): Promise<void> {
  const taskId = initialState.id;
  const windowId = initialState.windowId;

  // Without a crawl window there is no tab to drive.
  if (!windowId) {
    return;
  }

  try {
    for (let stepIndex = 0; stepIndex < platform.steps.length; stepIndex += 1) {
      const step = platform.steps[stepIndex];
      let shouldRetryStep = true;

      // The same step is re-run after every interrupt (login, captcha, page not ready).
      while (shouldRetryStep) {
        const currentState = await getCrawlTaskState();

        if (signal.aborted || currentState?.id !== taskId || currentState.status === 'canceled') {
          return;
        }

        if (currentState.status === 'paused') {
          const resumed = await waitUntilResumed(taskId, signal);

          if (!resumed) {
            return;
          }
        }

        // Mark the step as running and clear any previous pause info / step message.
        await updateCrawlTaskState(taskId, (state) => ({
          ...state,
          currentStepIndex: stepIndex,
          status: 'running',
          pause: undefined,
          steps: state.steps.map((item, index) => ({
            ...item,
            status: index === stepIndex ? 'running' : item.status,
            message: index === stepIndex ? undefined : item.message,
          })),
        }));

        const tabId = await getWindowActiveTabId(windowId);
        await chrome.tabs.update(tabId, { url: step.url, active: true });
        const tabLoaded = await waitForTabLoaded(tabId, signal);

        if (!tabLoaded || signal.aborted) {
          return;
        }

        const response = await scrapeStepInContent(tabId, step, signal);

        if (signal.aborted) {
          return;
        }

        if (response.interrupt) {
          await pauseForInterrupt(taskId, stepIndex, response.interrupt);
          const resumed = await waitUntilResumed(taskId, signal);

          if (!resumed) {
            return;
          }

          // Retry the same step from the navigation onwards.
          continue;
        }

        if (!response.ok) {
          const message = response.error ?? '页面抓取失败';

          await updateCrawlTaskState(taskId, (state) => ({
            ...state,
            status: 'failed',
            currentStepIndex: stepIndex,
            steps: state.steps.map((item, index) =>
              index === stepIndex ? { ...item, status: 'failed', message } : item,
            ),
          }));
          return;
        }

        console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, response.data);

        await updateCrawlTaskState(taskId, (state) => ({
          ...state,
          steps: state.steps.map((item, index) =>
            index === stepIndex
              ? { ...item, status: 'success', message: undefined, result: response.data }
              : item,
          ),
        }));

        shouldRetryStep = false;
      }
    }

    await updateCrawlTaskState(taskId, (state) => ({
      ...state,
      status: 'completed',
      steps: state.steps.map((step) => (step.status === 'running' ? { ...step, status: 'success' } : step)),
    }));
  } catch (error: unknown) {
    console.error('[crawl] 执行失败', error);

    // An error raised after the run was aborted (e.g. the tab/window is already gone) is a
    // consequence of the cancellation; writing `failed` here could overwrite a concurrent
    // `canceled` state, so leave the stored state to whoever aborted the run.
    if (signal.aborted) {
      return;
    }

    await updateCrawlTaskState(taskId, (state) => ({
      ...state,
      status: 'failed',
      steps: state.steps.map((step, index) =>
        index === state.currentStepIndex && step.status === 'running'
          ? { ...step, status: 'failed', message: error instanceof Error ? error.message : '爬取执行失败' }
          : step,
      ),
    }));
  }
}
|
||||
|
||||
/**
 * Looks up the id of the active tab in the given window.
 *
 * @param windowId Id of the window to query.
 * @returns The active tab's id.
 * @throws Error when the query yields no tab or the tab has no (truthy) id.
 */
async function getWindowActiveTabId(windowId: number): Promise<number> {
  const [activeTab] = await chrome.tabs.query({ windowId, active: true });
  const activeTabId = activeTab?.id;

  if (activeTabId) {
    return activeTabId;
  }

  throw new Error('未找到爬取窗口中的标签页');
}
|
||||
|
||||
/**
 * Asks the content script in the target tab to run its checks and scrape the step.
 *
 * The SCRAPE_STEP message is re-sent every 500 ms for up to 20 s as long as the failure
 * looks like "content script not ready yet" (see `isPageRunnerNotReadyError`). Any success,
 * interrupt, or other error is returned immediately.
 *
 * @param tabId Tab that hosts the content script.
 * @param step Step configuration; its `fields` and `checkSelector` are forwarded.
 * @param signal Abort signal; an abort yields `{ ok: false, error: 'canceled' }`.
 * @returns The content script's response, or a failure response on cancel / timeout.
 */
async function scrapeStepInContent(
  tabId: number,
  step: PlatformStepConfig,
  signal: AbortSignal,
): Promise<PageRunnerResponse> {
  const deadline = Date.now() + 20000;

  while (Date.now() < deadline) {
    if (signal.aborted) {
      return { ok: false, error: 'canceled' };
    }

    const request = {
      action: 'SCRAPE_STEP',
      payload: {
        fields: step.fields,
        checkSelector: step.checkSelector,
      },
    };
    const response = await sendPageRunnerMessage(tabId, request, signal);

    // Only a "receiver not ready" failure is worth retrying; everything else is final.
    const shouldRetry = !response.ok && !response.interrupt && isPageRunnerNotReadyError(response.error);

    if (!shouldRetry) {
      return response;
    }

    const waited = await sleep(500, signal);

    if (!waited) {
      return { ok: false, error: 'canceled' };
    }
  }

  return { ok: false, error: '页面脚本未响应,请刷新扩展后重试' };
}
|
||||
|
||||
/**
 * Sends one page-runner message to the content script of the given tab.
 *
 * @param tabId Target tab id.
 * @param message Message forwarded to the content script.
 * @param signal Abort signal; when aborted the result is `{ ok: false, error: 'canceled' }`.
 * @returns The content script's response, or the canceled response.
 */
async function sendPageRunnerMessage(tabId: number, message: unknown, signal: AbortSignal): Promise<PageRunnerResponse> {
  // Skip sending entirely when the task has already been aborted.
  if (!signal.aborted) {
    return raceWithAbort(sendPageRunnerMessageOnce(tabId, message), signal);
  }

  return { ok: false, error: 'canceled' };
}
|
||||
|
||||
/**
 * Performs a single `chrome.tabs.sendMessage` call and normalizes the outcome.
 *
 * Never rejects: a thrown error is converted into `{ ok: false, error }` and a missing or
 * non-object reply into a fixed "empty response" failure.
 *
 * @param tabId Target tab id.
 * @param message Message forwarded to the content script.
 */
async function sendPageRunnerMessageOnce(tabId: number, message: unknown): Promise<PageRunnerResponse> {
  let reply: unknown;

  try {
    reply = await chrome.tabs.sendMessage(tabId, message);
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    return { ok: false, error: reason };
  }

  const isObjectReply = typeof reply === 'object' && reply !== null;

  if (!isObjectReply) {
    return { ok: false, error: '页面脚本返回为空' };
  }

  return reply as PageRunnerResponse;
}
|
||||
|
||||
/**
 * Tells whether an error message matches one of the "no receiver for the message" patterns,
 * which this file treats as the content script not being injected yet.
 *
 * @param error Error message, if any.
 * @returns `true` only for a non-empty message matching one of the patterns.
 */
function isPageRunnerNotReadyError(error?: string): boolean {
  const notReadyPattern = /receiving end does not exist|could not establish connection|no receiving end/i;

  return typeof error === 'string' && error.length > 0 && notReadyPattern.test(error);
}
|
||||
|
||||
/**
 * Marks the task as paused because the page reported an interrupt (login, captcha, page
 * problem) and shows the interrupt message on the affected step.
 *
 * @param taskId Id of the task to update.
 * @param stepIndex Index of the step that was interrupted.
 * @param interrupt Pause details stored on the task; its `message` is copied to the step.
 */
async function pauseForInterrupt(taskId: string, stepIndex: number, interrupt: CrawlPauseInfo): Promise<void> {
  await updateCrawlTaskState(taskId, (state) => {
    return {
      ...state,
      status: 'paused',
      pause: interrupt,
      currentStepIndex: stepIndex,
      steps: state.steps.map((step, index) => {
        if (index !== stepIndex) {
          return step;
        }

        // The step stays "running" while paused; only its message changes.
        return { ...step, status: 'running', message: interrupt.message };
      }),
    };
  });
}
|
||||
|
||||
/**
 * Polls the stored task state once per second until the paused task is running again.
 *
 * @param taskId Id of the task being waited on.
 * @param signal Abort signal that ends the wait.
 * @returns `true` once the task status is `running`; `false` when the signal is aborted,
 *          the stored task is missing or a different one, or it is `canceled` / `failed`.
 */
async function waitUntilResumed(taskId: string, signal: AbortSignal): Promise<boolean> {
  while (!signal.aborted) {
    const state = await getCrawlTaskState();

    // Covers both "no stored task" and "a different task took over".
    if (state?.id !== taskId) {
      return false;
    }

    switch (state.status) {
      case 'canceled':
      case 'failed':
        return false;
      case 'running':
        return true;
      default:
        break;
    }

    const waited = await sleep(1000, signal);

    if (!waited) {
      return false;
    }
  }

  return false;
}
|
||||
|
||||
/**
 * Opens a focused, normal 1280x900 browser window showing the given URL.
 *
 * @param url Page to load in the new window.
 * @returns The created window.
 * @throws Error (as a rejection) carrying `chrome.runtime.lastError`'s message, or a fixed
 *         message when the created window has no id.
 */
function createCrawlWindow(url: string): Promise<chrome.windows.Window> {
  const createOptions: chrome.windows.CreateData = {
    url,
    type: 'normal',
    focused: true,
    width: 1280,
    height: 900,
  };

  return new Promise((resolve, reject) => {
    chrome.windows.create(createOptions, (createdWindow) => {
      const lastError = chrome.runtime.lastError;

      if (lastError) {
        reject(new Error(lastError.message));
      } else if (!createdWindow?.id) {
        reject(new Error('窗口创建失败'));
      } else {
        resolve(createdWindow);
      }
    });
  });
}
|
||||
|
||||
/**
 * Waits until the tab reports `status: 'complete'` through `chrome.tabs.onUpdated`.
 *
 * @param tabId Tab to watch.
 * @param signal Abort signal that cancels the wait.
 * @returns `true` when the tab completes loading or when 15 s pass without a completion
 *          event (the timeout is treated as "go ahead"); `false` when the signal aborts.
 */
function waitForTabLoaded(tabId: number, signal: AbortSignal): Promise<boolean> {
  return new Promise<boolean>((resolve) => {
    if (signal.aborted) {
      resolve(false);
      return;
    }

    // Single exit point: detach every listener/timer, then report the outcome.
    const settle = (loaded: boolean): void => {
      globalThis.clearTimeout(timer);
      chrome.tabs.onUpdated.removeListener(onTabUpdated);
      signal.removeEventListener('abort', onAbort);
      resolve(loaded);
    };

    const onAbort = (): void => settle(false);

    const onTabUpdated = (updatedTabId: number, changeInfo: { status?: string }): void => {
      if (updatedTabId === tabId && changeInfo.status === 'complete') {
        settle(true);
      }
    };

    const timer = globalThis.setTimeout(() => settle(true), 15000);

    chrome.tabs.onUpdated.addListener(onTabUpdated);
    signal.addEventListener('abort', onAbort, { once: true });
  });
}
|
||||
|
||||
/**
 * Checks whether a browser window with the given id can still be fetched.
 *
 * @param windowId Id of the window to look up.
 * @returns `true` when `chrome.windows.get` succeeds, `false` when it throws.
 */
async function hasWindow(windowId: number): Promise<boolean> {
  let found = true;

  try {
    await chrome.windows.get(windowId);
  } catch {
    found = false;
  }

  return found;
}
|
||||
|
||||
/**
 * Races a promise against an abort signal.
 *
 * When the signal is (or becomes) aborted first, the result resolves with
 * `{ ok: false, error: 'canceled' }` cast to `T`; callers are expected to use a `T` that
 * can hold that shape. Otherwise the result settles exactly like `promise`.
 * When the signal is already aborted on entry, `promise` is not observed at all.
 *
 * @param promise Operation to wait for.
 * @param signal Abort signal competing with the operation.
 */
function raceWithAbort<T>(promise: Promise<T>, signal: AbortSignal): Promise<T> {
  const canceledResult = (): T => ({ ok: false, error: 'canceled' } as T);

  return new Promise<T>((resolve, reject) => {
    if (signal.aborted) {
      resolve(canceledResult());
      return;
    }

    let pending = true;

    // Runs `deliver` for the first outcome only and detaches the abort listener.
    const finish = (deliver: () => void): void => {
      if (!pending) {
        return;
      }

      pending = false;
      signal.removeEventListener('abort', onAbort);
      deliver();
    };

    const onAbort = (): void => finish(() => resolve(canceledResult()));

    signal.addEventListener('abort', onAbort, { once: true });

    promise.then(
      (value) => finish(() => resolve(value)),
      (error) => finish(() => reject(error)),
    );
  });
}
|
||||
|
||||
/**
 * Abortable delay.
 *
 * @param ms Delay in milliseconds.
 * @param signal Optional abort signal that ends the delay early.
 * @returns `true` when the full delay elapsed, `false` when the signal was already aborted
 *          or aborted while waiting.
 */
function sleep(ms: number, signal?: AbortSignal): Promise<boolean> {
  return new Promise<boolean>((resolve) => {
    if (signal?.aborted) {
      resolve(false);
      return;
    }

    // Clears the timer and the abort listener before reporting the outcome.
    const finish = (completed: boolean): void => {
      globalThis.clearTimeout(timer);
      signal?.removeEventListener('abort', onAbort);
      resolve(completed);
    };

    const onAbort = (): void => finish(false);
    const timer = globalThis.setTimeout(() => finish(true), ms);

    signal?.addEventListener('abort', onAbort, { once: true });
  });
}
|
||||
47
src/background/service/lifecycle.ts
Normal file
47
src/background/service/lifecycle.ts
Normal file
@@ -0,0 +1,47 @@
|
||||
import type { BackgroundCommand, BackgroundResponse, CrawlStateResponse } from '../types';
|
||||
import { cancelCrawl, cancelCrawlWhenWindowRemoved, cancelStaleCrawlWhenWindowMissing, resumeCrawl, startCrawl } from './crawlTask';
|
||||
import { getCrawlTaskState } from './taskState';
|
||||
|
||||
/**
 * Entry point for the extension's "installed" lifecycle event.
 * Currently it only writes a log line, which helps when debugging the lifecycle.
 */
export async function handleInstalled(): Promise<void> {
  console.log('[background] installed');
}
|
||||
|
||||
/**
 * Entry point for the browser-startup lifecycle event.
 * Logs the event and then runs `cancelStaleCrawlWhenWindowMissing()` — presumably to clear
 * a task left over from a previous session whose window no longer exists (see crawlTask).
 */
export async function handleStartup(): Promise<void> {
  console.log('[background] startup');

  await cancelStaleCrawlWhenWindowMissing();
}
|
||||
|
||||
/**
 * Handles a window-closed event: logs the id and forwards it to
 * `cancelCrawlWhenWindowRemoved`, which decides whether the current task must be canceled.
 *
 * @param windowId Id of the window that was closed.
 */
export async function handleWindowRemoved(windowId: number): Promise<void> {
  console.log('[background] window removed', windowId);

  await cancelCrawlWhenWindowRemoved(windowId);
}
|
||||
|
||||
/**
 * Dispatches a command received by the background script to its handler, based on
 * `message.action`.
 *
 * @param message Command sent to the background script.
 * @returns The handler's response; an `{ ok: false }` response for an unknown action.
 */
export async function handleBackgroundCommand(
  message: BackgroundCommand,
): Promise<BackgroundResponse | CrawlStateResponse> {
  if (message.action === 'START_CRAWL') {
    return startCrawl(message.payload.platformId);
  }

  if (message.action === 'GET_CRAWL_STATE') {
    // Runs the stale-task check first so the returned snapshot reflects its outcome.
    await cancelStaleCrawlWhenWindowMissing();
    const data = await getCrawlTaskState();
    return { ok: true, data };
  }

  if (message.action === 'CANCEL_CRAWL') {
    return cancelCrawl();
  }

  if (message.action === 'RESUME_CRAWL') {
    return resumeCrawl();
  }

  return { ok: false, error: '未知的后台指令' };
}
|
||||
34
src/background/service/taskState.ts
Normal file
34
src/background/service/taskState.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import type { CrawlTaskState } from '@/types';
|
||||
|
||||
const CRAWL_TASK_STORAGE_KEY = 'crawlTaskState';
|
||||
|
||||
/**
 * Reads the crawl task snapshot from `chrome.storage.local`.
 *
 * @returns The stored task, or `null` when nothing is stored or the stored value does not
 *          pass `isCrawlTaskState`.
 */
export async function getCrawlTaskState(): Promise<CrawlTaskState | null> {
  const { [CRAWL_TASK_STORAGE_KEY]: stored } = await chrome.storage.local.get(CRAWL_TASK_STORAGE_KEY);

  if (!isCrawlTaskState(stored)) {
    return null;
  }

  return stored;
}
|
||||
|
||||
/**
 * Writes the given task snapshot to `chrome.storage.local`, replacing any stored one.
 *
 * @param state Task snapshot to persist.
 */
export async function setCrawlTaskState(state: CrawlTaskState): Promise<void> {
  const entry = { [CRAWL_TASK_STORAGE_KEY]: state };

  await chrome.storage.local.set(entry);
}
|
||||
|
||||
/**
 * Removes the stored task snapshot from `chrome.storage.local`.
 */
export async function clearCrawlTaskState(): Promise<void> {
  await chrome.storage.local.remove(CRAWL_TASK_STORAGE_KEY);
}
|
||||
|
||||
/**
 * Applies `updater` to the stored task and persists the result.
 *
 * The update is skipped when no task is stored, when the stored task has a different id,
 * or when the stored task is already `canceled`.
 *
 * NOTE(review): this is a read-then-write sequence with awaits in between, so two
 * overlapping updates can overwrite each other — confirm whether callers can overlap.
 *
 * @param taskId Id the stored task must have for the update to apply.
 * @param updater Function producing the next state from the current one.
 */
export async function updateCrawlTaskState(
  taskId: string,
  updater: (state: CrawlTaskState) => CrawlTaskState,
): Promise<void> {
  const current = await getCrawlTaskState();

  if (current && current.id === taskId && current.status !== 'canceled') {
    const next = updater(current);
    await setCrawlTaskState(next);
  }
}
|
||||
|
||||
/**
 * Type guard for values read back from storage.
 *
 * Besides requiring a non-null object, it verifies the two properties the rest of the code
 * relies on structurally: `id` must be a string (it is compared with string task ids) and
 * `steps` must be an array (it is iterated with `map`). Checking only key presence would
 * let a malformed stored value through and fail later at the first `steps.map` call.
 *
 * @param value Arbitrary value to test.
 * @returns `true` when the value has a string `id` and an array `steps`.
 */
function isCrawlTaskState(value: unknown): value is CrawlTaskState {
  if (typeof value !== 'object' || value === null) {
    return false;
  }

  const candidate = value as { id?: unknown; steps?: unknown };

  return typeof candidate.id === 'string' && Array.isArray(candidate.steps);
}
|
||||
@@ -1,40 +1,46 @@
|
||||
import type { CrawlTaskState } from '@/types';
|
||||
|
||||
/** 启动爬取任务的后台消息。 */
|
||||
// 启动爬取任务的后台消息。
|
||||
export interface StartCrawlCommand {
|
||||
/** 消息动作类型:请求 background 创建爬取窗口并初始化任务状态。 */
|
||||
// 消息动作类型:请求 background 创建爬取窗口并初始化任务状态。
|
||||
action: 'START_CRAWL';
|
||||
/** 启动爬取所需参数。 */
|
||||
// 启动爬取所需参数。
|
||||
payload: {
|
||||
/** 当前要爬取的平台 ID,对应 config/platforms.ts 中的平台配置。 */
|
||||
// 当前要爬取的平台 ID,对应 config/platforms.ts 中的平台配置。
|
||||
platformId: string;
|
||||
};
|
||||
}
|
||||
|
||||
/** 获取当前爬取任务状态的后台消息。 */
|
||||
// 获取当前爬取任务状态的后台消息。
|
||||
export interface GetCrawlStateCommand {
|
||||
/** 消息动作类型:请求 background 返回当前任务快照。 */
|
||||
// 消息动作类型:请求 background 返回当前任务快照。
|
||||
action: 'GET_CRAWL_STATE';
|
||||
}
|
||||
|
||||
/** 取消当前爬取任务的后台消息。 */
|
||||
// 取消当前爬取任务的后台消息。
|
||||
export interface CancelCrawlCommand {
|
||||
/** 消息动作类型:请求 background 标记任务取消并关闭爬取窗口。 */
|
||||
// 消息动作类型:请求 background 标记任务取消并关闭爬取窗口。
|
||||
action: 'CANCEL_CRAWL';
|
||||
}
|
||||
|
||||
/** popup/content script 能发送给 background 的全部消息类型。 */
|
||||
export type BackgroundCommand = StartCrawlCommand | GetCrawlStateCommand | CancelCrawlCommand;
|
||||
// 继续当前暂停中的爬取任务。
|
||||
export interface ResumeCrawlCommand {
|
||||
// 消息动作类型:用户已处理登录/验证码,允许 background 继续重试当前步骤。
|
||||
action: 'RESUME_CRAWL';
|
||||
}
|
||||
|
||||
/** background 统一响应结构。 */
|
||||
// popup/content script 能发送给 background 的全部消息类型。
|
||||
export type BackgroundCommand = StartCrawlCommand | GetCrawlStateCommand | CancelCrawlCommand | ResumeCrawlCommand;
|
||||
|
||||
// background 统一响应结构。
|
||||
export interface BackgroundResponse<T = unknown> {
|
||||
/** 当前请求是否处理成功。 */
|
||||
// 当前请求是否处理成功。
|
||||
ok: boolean;
|
||||
/** 成功或部分失败时返回的业务数据。 */
|
||||
// 成功或部分失败时返回的业务数据。
|
||||
data?: T;
|
||||
/** 请求失败时返回的错误文案。 */
|
||||
// 请求失败时返回的错误文案。
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/** 获取或变更爬取任务后返回的响应结构。 */
|
||||
// 获取或变更爬取任务后返回的响应结构。
|
||||
export type CrawlStateResponse = BackgroundResponse<CrawlTaskState | null>;
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import type {PlatformConfig} from '@/types';
|
||||
|
||||
import type { PlatformConfig } from '@/types';
|
||||
|
||||
export const PLATFORM_CONFIGS: PlatformConfig[] = [
|
||||
export const platformConfigs: PlatformConfig[] = [
|
||||
{
|
||||
id: 'Shopee',
|
||||
name: 'Shopee 后台',
|
||||
@@ -10,7 +9,7 @@ export const PLATFORM_CONFIGS: PlatformConfig[] = [
|
||||
name: '数据看板',
|
||||
uniqueKey: 'databoard',
|
||||
url: 'https://seller.shopee.com.my/',
|
||||
checkSelector: '.rate-manager-content',
|
||||
checkSelector: '.page-container',
|
||||
fields: [
|
||||
{
|
||||
label: "出货统计",
|
||||
@@ -36,46 +35,46 @@ export const PLATFORM_CONFIGS: PlatformConfig[] = [
|
||||
},
|
||||
{
|
||||
label: "商业分析",
|
||||
className: ".data-dashboard-async-data-wrapper .custom-row",
|
||||
className: ".data-dashboard .metrics",
|
||||
keys: [
|
||||
{
|
||||
label: "销售",
|
||||
className: ".custom-col-5:nth-child(1) ",
|
||||
className: ".metric:nth-child(1) ",
|
||||
keys: [
|
||||
{ label: "value", className: ".dashboard-item-value" },
|
||||
{ label: "change", className: ".dashboard-item-rate-number" }
|
||||
{label: "value", className: ".metric-value"},
|
||||
{label: "change", className: ".metric-rate"}
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "访客数",
|
||||
className: ".custom-col-5:nth-child(2) ",
|
||||
className: ".metric:nth-child(2) ",
|
||||
keys: [
|
||||
{ label: "value", className: ".dashboard-item-value" },
|
||||
{ label: "change", className: ".dashboard-item-rate-number" }
|
||||
{label: "value", className: ".metric-value"},
|
||||
{label: "change", className: ".metric-rate"}
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "Product Clicks",
|
||||
className: ".custom-col-5:nth-child(3)",
|
||||
className: ".metric:nth-child(3)",
|
||||
keys: [
|
||||
{ label: "value", className: ".dashboard-item-value" },
|
||||
{ label: "change", className: ".dashboard-item-rate-number" }
|
||||
{label: "value", className: ".metric-value"},
|
||||
{label: "change", className: ".metric-rate"}
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "订单",
|
||||
className: ".custom-col-5:nth-child(4)",
|
||||
className: ".metric:nth-child(4)",
|
||||
keys: [
|
||||
{ label: "value", className: ".dashboard-item-value" },
|
||||
{ label: "change", className: ".dashboard-item-rate-number" }
|
||||
{label: "value", className: ".metric-value"},
|
||||
{label: "change", className: ".metric-rate"}
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "Order Conversion Rate",
|
||||
className: ".custom-col-5:nth-child(5)",
|
||||
className: ".metric:nth-child(5)",
|
||||
keys: [
|
||||
{ label: "value", className: ".dashboard-item-value" },
|
||||
{ label: "change", className: ".dashboard-item-rate-number" }
|
||||
{label: "value", className: ".metric-value"},
|
||||
{label: "change", className: ".metric-rate"}
|
||||
]
|
||||
},
|
||||
]
|
||||
@@ -88,42 +87,224 @@ export const PLATFORM_CONFIGS: PlatformConfig[] = [
|
||||
label: "广告余额",
|
||||
className: ".ads-data-cell:nth-of-type(1) ",
|
||||
keys: [
|
||||
{ label: "value", className: ".ads-data-report-number" },
|
||||
{label: "value", className: ".ads-data-report-number"},
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "销售额",
|
||||
className: ".ads-data-cell:nth-child(3) ",
|
||||
keys: [
|
||||
{ label: "value", className: ".ads-data-report-number" },
|
||||
{ label: "change", className: ".ratio " }
|
||||
{label: "value", className: ".ads-data-report-number"},
|
||||
{label: "change", className: ".ratio "}
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "花费",
|
||||
className: ".ads-data-cell:nth-child(4)",
|
||||
keys: [
|
||||
{ label: "value", className: ".ads-data-report-number" },
|
||||
{ label: "change", className: ".ratio " }
|
||||
{label: "value", className: ".ads-data-report-number"},
|
||||
{label: "change", className: ".ratio "}
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "广告支出回报率",
|
||||
className: ".ads-data-cell:nth-child(5)",
|
||||
keys: [
|
||||
{ label: "value", className: ".ads-data-report-number" },
|
||||
{ label: "change", className: ".ratio " }
|
||||
{label: "value", className: ".ads-data-report-number"},
|
||||
{label: "change", className: ".ratio "}
|
||||
]
|
||||
},
|
||||
]
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
name: "广告中心",
|
||||
uniqueKey: "adscenter",
|
||||
url: "https://seller.shopee.com.my/portal/marketing/pas/index",
|
||||
checkSelector: '.page-container',
|
||||
fields: [
|
||||
{
|
||||
label: "我的账户",
|
||||
className: ".my-account-wrap",
|
||||
keys: [
|
||||
{
|
||||
label: "广告余额",
|
||||
className: ".credit-expense-label-wrapper:nth-child(1) .ellipsis-content"
|
||||
},
|
||||
{
|
||||
label: "今日广告花费",
|
||||
className: ".credit-expense-label-wrapper:nth-child(2) .ellipsis-content"
|
||||
},
|
||||
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "进行中广告列表",
|
||||
className: ".eds-table__body-container",
|
||||
type: 2,
|
||||
condition: {
|
||||
list: [".eds-radio-group label:nth-child(3)"],
|
||||
time: 400
|
||||
},
|
||||
tableParts: [
|
||||
{name: "fixed", select: ".eds-table__fix-body"},
|
||||
{name: "main", select: ".eds-table__main-body"}
|
||||
],
|
||||
keys: [
|
||||
{
|
||||
label: "广告信息",
|
||||
className: ".info-containter",
|
||||
part: "fixed",
|
||||
keys: [
|
||||
{
|
||||
label: "广告名称",
|
||||
className: ".campaign-name-container"
|
||||
},
|
||||
{
|
||||
label: "广告类型",
|
||||
className: ".gmv-max-noti"
|
||||
},
|
||||
{
|
||||
label: "结束时间",
|
||||
className: ".time-edit-wrapper"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "每日预算",
|
||||
part: "main",
|
||||
className: "td:nth-child(1)"
|
||||
},
|
||||
{
|
||||
label: "目标ROAS",
|
||||
part: "main",
|
||||
className: "td:nth-child(2)"
|
||||
},
|
||||
{
|
||||
label: "花费",
|
||||
part: "main",
|
||||
className: "td:nth-child(4)"
|
||||
},
|
||||
{
|
||||
label: "销售额",
|
||||
part: "main",
|
||||
className: "td:nth-child(5)"
|
||||
},
|
||||
{
|
||||
label: "广告支出回报率",
|
||||
part: "main",
|
||||
className: "td:nth-child(6)"
|
||||
}
|
||||
],
|
||||
pagination: {
|
||||
nextBtn: ".eds-pager__button-next", // 下一页按钮
|
||||
disabledClass: ".eds-button--disabled", // 按钮禁用时的class(用来判断结束)
|
||||
maxPage: 1, // 最大爬取页数
|
||||
delay: 2000 // 翻页后的等待加载时间
|
||||
},
|
||||
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
name: "评论管理",
|
||||
uniqueKey: "message",
|
||||
url: "https://seller.shopee.com.my/portal/settings/shop/rating?pageNumber=1&fromPageNumber=1&cursor=0&pageSize=20&replied=TO_REPLY&ratingStar=2&ratingStar=1",
|
||||
checkSelector: '.page-container',
|
||||
fields: [
|
||||
{
|
||||
label: "低星评论",
|
||||
className: ".border-solid.rounded",
|
||||
type: 1,
|
||||
keys: [
|
||||
{
|
||||
label: "用户",
|
||||
className: ".flex.items-center.justify-start .ml-2"
|
||||
},
|
||||
{
|
||||
label: "订单编号",
|
||||
className: ".underline.px-1"
|
||||
},
|
||||
{
|
||||
label: "商品名称",
|
||||
className: ".min-w-0.font-medium.break-all"
|
||||
},
|
||||
{
|
||||
label: "规格",
|
||||
className: ".min-w-0.font-medium.break-all + div"
|
||||
},
|
||||
{
|
||||
label: "评价内容",
|
||||
className: ".min-w-0.overflow-hidden",
|
||||
condition: {
|
||||
list: [
|
||||
"span.cursor-pointer"
|
||||
],
|
||||
time: 200,
|
||||
},
|
||||
|
||||
},
|
||||
],
|
||||
pagination: {
|
||||
nextBtn: ".eds-react-pagination-pager__button-next",
|
||||
maxPage: 2, // 最大爬取页数
|
||||
delay: 2000 // 翻页后的等待加载时间
|
||||
},
|
||||
|
||||
},
|
||||
]
|
||||
},
|
||||
{
|
||||
name: "账户健康状态",
|
||||
uniqueKey: "accounthealth",
|
||||
url: "https://seller.shopee.com.my/portal/accounthealth/home",
|
||||
checkSelector: '.page-container',
|
||||
fields: [
|
||||
{
|
||||
label: "健康状态",
|
||||
className: ".metric-content",
|
||||
type: 1,
|
||||
keys: [
|
||||
{
|
||||
label: "模块名",
|
||||
className: ".metric-type"
|
||||
},
|
||||
{
|
||||
label: "值",
|
||||
className: ".metric-item",
|
||||
type: 1,
|
||||
keys: [
|
||||
{
|
||||
label: "指标",
|
||||
className: "p.metric-text"
|
||||
},
|
||||
{
|
||||
label: "值",
|
||||
className: ".metric-my"
|
||||
},
|
||||
{
|
||||
label: "目标",
|
||||
className: ".metric-target"
|
||||
},
|
||||
{
|
||||
label: "使用类型",
|
||||
className: ".metric-applied-to"
|
||||
},
|
||||
]
|
||||
},
|
||||
],
|
||||
|
||||
},
|
||||
]
|
||||
}
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
/** 根据平台 ID 返回对应的平台抓取配置。 */
|
||||
/**
|
||||
* 根据平台 ID 返回对应的平台抓取配置。
|
||||
*/
|
||||
export function getPlatformById(platformId: string) {
|
||||
return PLATFORM_CONFIGS.find((item) => item.id === platformId) ?? null;
|
||||
return platformConfigs.find((item) => item.id === platformId) ?? null;
|
||||
}
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
<script setup lang="ts">
|
||||
<script setup lang="ts">
|
||||
import {computed, onMounted, onUnmounted, ref} from 'vue';
|
||||
import type {CrawlTaskState} from '@/types';
|
||||
|
||||
/** 当前后台保存的爬取任务快照,用于决定是否展示右下角浮窗。 */
|
||||
// 当前后台保存的爬取任务快照,用于决定是否展示右下角浮窗。
|
||||
const crawlState = ref<CrawlTaskState | null>(null);
|
||||
/** 当前爬取任务已经运行的秒数,页面上会格式化为 mm:ss。 */
|
||||
// 当前爬取任务已经运行的秒数,页面上会格式化为 mm:ss。
|
||||
const elapsedSeconds = ref(0);
|
||||
/** 控制右下角时间轴面板是否展开。 */
|
||||
// 控制右下角时间轴面板是否展开。
|
||||
const isPanelOpen = ref(false);
|
||||
/** 轮询后台爬取状态和刷新计时器的定时器 ID。 */
|
||||
// 轮询后台爬取状态和刷新计时器的定时器 ID。
|
||||
let timer: number | undefined;
|
||||
|
||||
/** 只有任务处于运行中时,才在网页右下角展示计时按钮。 */
|
||||
const isVisible = computed(() => crawlState.value?.status === 'running');
|
||||
// 只有任务处于运行中时,才在网页右下角展示计时按钮。
|
||||
const isVisible = computed(() => crawlState.value ? ['running', 'paused'].includes(crawlState.value.status) : false);
|
||||
|
||||
/** 内容脚本挂载后立即同步一次状态,并开始每秒刷新计时和任务进度。 */
|
||||
// 内容脚本挂载后立即同步一次状态,并开始每秒刷新计时和任务进度。
|
||||
onMounted(() => {
|
||||
void refreshCrawlState();
|
||||
timer = window.setInterval(() => {
|
||||
@@ -23,16 +23,18 @@ onMounted(() => {
|
||||
}, 1000);
|
||||
});
|
||||
|
||||
/** 内容脚本卸载时清理定时器,避免页面残留轮询。 */
|
||||
// 内容脚本卸载时清理定时器,避免页面残留轮询。
|
||||
onUnmounted(() => {
|
||||
if (timer) {
|
||||
window.clearInterval(timer);
|
||||
}
|
||||
});
|
||||
|
||||
/** 从 background 获取最新爬取任务状态,并在任务结束时自动收起面板。 */
|
||||
/**
|
||||
* 从 background 获取最新爬取任务状态,并在任务结束时自动收起面板。
|
||||
*/
|
||||
async function refreshCrawlState() {
|
||||
/** background 返回的当前爬取任务状态响应。 */
|
||||
// background 返回的当前爬取任务状态响应。
|
||||
const response = await sendBackgroundMessage<CrawlTaskState | null>({action: 'GET_CRAWL_STATE'});
|
||||
|
||||
if (response.ok) {
|
||||
@@ -45,7 +47,9 @@ async function refreshCrawlState() {
|
||||
}
|
||||
}
|
||||
|
||||
/** 根据任务开始时间实时计算已经运行的秒数。 */
|
||||
/**
|
||||
* 根据任务开始时间实时计算已经运行的秒数。
|
||||
*/
|
||||
function updateElapsedSeconds() {
|
||||
if (!crawlState.value) {
|
||||
elapsedSeconds.value = 0;
|
||||
@@ -55,18 +59,22 @@ function updateElapsedSeconds() {
|
||||
elapsedSeconds.value = Math.max(0, Math.floor((Date.now() - crawlState.value.startedAt) / 1000));
|
||||
}
|
||||
|
||||
/** 将秒数格式化为 mm:ss,展示在圆形计时按钮和面板标题里。 */
|
||||
/**
|
||||
* 将秒数格式化为 mm:ss,展示在圆形计时按钮和面板标题里。
|
||||
*/
|
||||
function formatElapsed(totalSeconds: number): string {
|
||||
/** 运行时长中的分钟部分。 */
|
||||
// 运行时长中的分钟部分。
|
||||
const minutes = Math.floor(totalSeconds / 60).toString().padStart(2, '0');
|
||||
/** 运行时长中的秒数部分。 */
|
||||
// 运行时长中的秒数部分。
|
||||
const seconds = (totalSeconds % 60).toString().padStart(2, '0');
|
||||
return `${minutes}:${seconds}`;
|
||||
}
|
||||
|
||||
/** 将步骤状态枚举转换成中文展示文案。 */
|
||||
/**
|
||||
* 将步骤状态枚举转换成中文展示文案。
|
||||
*/
|
||||
function getStepText(status: string): string {
|
||||
/** 步骤状态到展示文案的映射表。 */
|
||||
// 步骤状态到展示文案的映射表。
|
||||
const textMap: Record<string, string> = {
|
||||
pending: '等待中',
|
||||
running: '爬取中',
|
||||
@@ -77,7 +85,17 @@ function getStepText(status: string): string {
|
||||
return textMap[status] ?? status;
|
||||
}
|
||||
|
||||
/** 发送消息到 background;非扩展环境下返回空成功响应,方便本地页面不报错。 */
|
||||
/**
|
||||
* 请求 background 继续暂停中的爬取任务。
|
||||
*/
|
||||
async function handleResumeCrawl() {
|
||||
await sendBackgroundMessage({ action: 'RESUME_CRAWL' });
|
||||
await refreshCrawlState();
|
||||
}
|
||||
|
||||
/**
|
||||
* 发送消息到 background;非扩展环境下返回空成功响应,方便本地页面不报错。
|
||||
*/
|
||||
function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data?: T; error?: string }> {
|
||||
if (typeof chrome === 'undefined' || !chrome.runtime?.sendMessage) {
|
||||
return Promise.resolve({ok: true, data: null as T});
|
||||
@@ -104,15 +122,20 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
|
||||
<ol class="dianshan-crawl-timeline">
|
||||
<li v-for="(step, index) in crawlState.steps" :key="step.uniqueKey" :class="`is-${step.status}`">
|
||||
<span class="dianshan-crawl-dot"></span>
|
||||
<div class="dianshan-crawl-step">
|
||||
<strong>{{ index + 1 }}. {{ step.name }}</strong>
|
||||
<em>{{ getStepText(step.status) }}</em>
|
||||
<small v-if="step.message">{{ step.message }}</small>
|
||||
</div>
|
||||
</li>
|
||||
</ol>
|
||||
</section>
|
||||
</div>
|
||||
<div class="dianshan-crawl-step">
|
||||
<strong>{{ index + 1 }}. {{ step.name }}</strong>
|
||||
<em>{{ getStepText(step.status) }}</em>
|
||||
<small v-if="step.message">{{ step.message }}</small>
|
||||
</div>
|
||||
</li>
|
||||
</ol>
|
||||
|
||||
<div v-if="crawlState.status === 'paused' && crawlState.pause" class="dianshan-crawl-pause">
|
||||
<p>{{ crawlState.pause.message }}</p>
|
||||
<button type="button" @click="handleResumeCrawl">我已处理,继续</button>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<style scoped>
|
||||
@@ -213,6 +236,35 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
|
||||
color: #b91c1c;
|
||||
}
|
||||
|
||||
.dianshan-crawl-pause {
|
||||
display: grid;
|
||||
gap: 8px;
|
||||
margin-top: 12px;
|
||||
padding: 10px;
|
||||
border: 1px solid #f59e0b;
|
||||
border-radius: 8px;
|
||||
background: #fffbeb;
|
||||
}
|
||||
|
||||
.dianshan-crawl-pause p {
|
||||
margin: 0;
|
||||
color: #92400e;
|
||||
font-size: 12px;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.dianshan-crawl-pause button {
|
||||
width: 100%;
|
||||
border: 0;
|
||||
border-radius: 6px;
|
||||
padding: 8px 10px;
|
||||
color: #ffffff;
|
||||
background: #059669;
|
||||
cursor: pointer;
|
||||
font-size: 12px;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.is-running .dianshan-crawl-dot,
|
||||
.is-success .dianshan-crawl-dot {
|
||||
background: #10b981;
|
||||
|
||||
@@ -1,22 +1,24 @@
|
||||
import { createApp } from 'vue';
|
||||
import App from './App.vue';
|
||||
import { setupPageRunner } from './pageRunner';
|
||||
|
||||
/** 将内容脚本应用挂载到页面中。 */
|
||||
/**
|
||||
* 将内容脚本应用挂载到页面中。
|
||||
*/
|
||||
function mountApp() {
|
||||
if (document.getElementById('dianshan-crx-root')) {
|
||||
return;
|
||||
}
|
||||
|
||||
/** 内容脚本在宿主页面中的根容器,用于避免污染业务页面结构。 */
|
||||
const container = document.createElement('div');
|
||||
container.id = 'dianshan-crx-root';
|
||||
/** Vue 应用实际挂载的节点。 */
|
||||
const appRoot = document.createElement('div');
|
||||
|
||||
container.appendChild(appRoot);
|
||||
document.body.appendChild(container);
|
||||
|
||||
createApp(App).mount(appRoot);
|
||||
setupPageRunner();
|
||||
}
|
||||
|
||||
if (document.readyState === 'loading') {
|
||||
|
||||
207
src/content/pageRunner.ts
Normal file
207
src/content/pageRunner.ts
Normal file
@@ -0,0 +1,207 @@
|
||||
import { processFields, type DomScrapeResult } from '@/background/domScraper';
|
||||
import type { CrawlPauseInfo, PlatformFieldConfig } from '@/types';
|
||||
|
||||
/** Data carried by a SCRAPE_STEP request. */
interface ScrapeStepPayload {
  /** Field configurations handed to `processFields`. */
  fields: PlatformFieldConfig[];
  /** Selector that must be present (and stable) before scraping starts. */
  checkSelector: string;
}

/** Request to check the page and scrape one step. */
interface ScrapeStepMessage {
  action: 'SCRAPE_STEP';
  payload: ScrapeStepPayload;
}

/** Request to only run the interrupt detection on the current page. */
interface CheckInterruptMessage {
  action: 'CHECK_INTERRUPT';
}

/** Every message the page runner understands. */
type PageRunnerMessage = ScrapeStepMessage | CheckInterruptMessage;

/** Reply sent back for a page-runner message. */
interface PageRunnerResponse {
  /** Whether the request was handled successfully. */
  ok: boolean;
  /** Scraped data, present on a successful SCRAPE_STEP. */
  data?: DomScrapeResult | null;
  /** Set when the page needs manual handling before scraping can continue. */
  interrupt?: CrawlPauseInfo;
  /** Error text for a failed request. */
  error?: string;
}
|
||||
|
||||
/**
 * Registers the page runner: a `chrome.runtime.onMessage` listener that lets the background
 * script trigger interrupt detection and DOM scraping inside the target page.
 *
 * The listener returns `true` to keep the message channel open for the asynchronous reply.
 * A handler failure (for example an exception while scraping) is converted into an
 * `{ ok: false, error }` response, so the sender always receives a reply instead of waiting
 * on a channel that is never answered.
 */
export function setupPageRunner(): void {
  chrome.runtime.onMessage.addListener((message: PageRunnerMessage, _sender, sendResponse) => {
    void handlePageRunnerMessage(message)
      .catch(
        (error: unknown): PageRunnerResponse => ({
          ok: false,
          error: error instanceof Error ? error.message : String(error),
        }),
      )
      .then(sendResponse);

    // Keep the channel open until `sendResponse` is called asynchronously.
    return true;
  });
}
|
||||
|
||||
/**
 * Executes one page-runner message sent by the background script.
 *
 * - `CHECK_INTERRUPT`: reports the result of the interrupt detection.
 * - `SCRAPE_STEP`: returns an interrupt when the page needs manual handling or when the
 *   check selector does not become ready within 18 s; otherwise scrapes the configured
 *   fields from `document.body`.
 *
 * @param message Message to execute.
 * @returns The response for the sender; unknown actions yield `{ ok: false, error }`.
 */
async function handlePageRunnerMessage(message: PageRunnerMessage): Promise<PageRunnerResponse> {
  switch (message.action) {
    case 'CHECK_INTERRUPT':
      return { ok: true, interrupt: detectPageInterrupt() };

    case 'SCRAPE_STEP': {
      const interrupt = detectPageInterrupt();

      if (interrupt) {
        return { ok: false, interrupt };
      }

      const readyElement = await waitForStableSelector(message.payload.checkSelector, 18000);

      if (readyElement) {
        const data = await processFields(message.payload.fields, document.body);
        return { ok: true, data };
      }

      // The key content never showed up: ask the user to look at the page.
      return {
        ok: false,
        interrupt: {
          reason: 'page_not_ready',
          message: '页面关键内容暂未加载,请确认页面是否正常显示后继续',
        },
      };
    }

    default:
      return { ok: false, error: '未知页面执行指令' };
  }
}
|
||||
|
||||
/**
 * Detects whether the current page needs manual handling by the user.
 *
 * The checks run in a fixed order — verification/captcha page, login page, not-found page —
 * and the first match wins.
 *
 * @returns Pause details for the first matching condition, or `undefined` when none match.
 */
function detectPageInterrupt(): CrawlPauseInfo | undefined {
  const detectors: ReadonlyArray<readonly [() => boolean, CrawlPauseInfo]> = [
    [
      isShieldPage,
      {
        reason: 'shield',
        message: '检测到验证码或风控验证,请在打开的商家后台窗口处理完成后继续',
      },
    ],
    [
      isLoginPage,
      {
        reason: 'reauth',
        message: '检测到需要重新登录,请在打开的商家后台窗口登录完成后继续',
      },
    ],
    [
      isNotFoundPage,
      {
        reason: 'not_found',
        message: '当前页面不存在或已失效,请确认平台配置里的页面地址是否正确',
      },
    ],
  ];

  // Later detectors are not evaluated once an earlier one matches.
  for (const [matches, pauseInfo] of detectors) {
    if (matches()) {
      return pauseInfo;
    }
  }

  return undefined;
}
|
||||
|
||||
/**
 * Tells whether the page is a captcha / traffic-verification page.
 *
 * A path starting with `/verify/captcha` or `/verify/traffic` matches immediately.
 * Otherwise every element matching the verification/captcha selectors is inspected and the
 * page matches when at least one of them is visible. All matches are checked (not just the
 * first one in document order) so that a hidden captcha container placed earlier in the
 * DOM cannot mask a visible one that appears later.
 *
 * @returns `true` when a verification page or a visible captcha element is detected.
 */
function isShieldPage(): boolean {
  const path = location.pathname.toLowerCase();

  if (path.startsWith('/verify/captcha') || path.startsWith('/verify/traffic')) {
    return true;
  }

  const shieldElements = document.querySelectorAll(
    '[data-name="verification"], .ant-captcha, #captchaContainer, [class*="captcha" i], [id*="captcha" i]',
  );

  return Array.from(shieldElements).some((element) => isVisibleElement(element));
}
|
||||
|
||||
/**
 * Reports whether the page requires the user to log in or re-enter a password.
 *
 * Detection order: a login-like URL path, then a visible password input,
 * then login prompts in the first 3000 characters of the body text.
 *
 * @returns `true` when any of the login signals is present.
 */
function isLoginPage(): boolean {
  const path = location.pathname.toLowerCase();

  if (
    /^\/(?:buyer\/)?login\b/i.test(path) ||
    /^\/account\/(?:signin|login)\b/i.test(path) ||
    /^\/portal\/login\b/i.test(path)
  ) {
    return true;
  }

  const visiblePasswordInput = Array.from(document.querySelectorAll('input[type="password"]')).some(isVisibleElement);

  if (visiblePasswordInput) {
    return true;
  }

  // document.body can be missing while the document is still being built;
  // treat that as "no text" instead of throwing.
  const bodyText = document.body?.innerText.slice(0, 3000) ?? '';
  const loginTextPatterns = [
    /enter\s+(your\s+)?password\s+to\s+continue/i,
    /sign\s+in\s+(again\s+)?to\s+continue/i,
    /please\s+(re-?)?enter\s+(your\s+)?password/i,
    /请(再次|重新)?输入(您的)?密码/,
    /请登录|重新登录|登录后继续/,
  ];

  return loginTextPatterns.some((pattern) => pattern.test(bodyText));
}
|
||||
|
||||
/**
 * Reports whether the page is a "not found", delisted or error page.
 *
 * The document title and the first 8000 characters of the body text are
 * matched against English and Chinese not-found phrases.
 *
 * @returns `true` when the title or the body text matches a known phrase.
 */
function isNotFoundPage(): boolean {
  // document.body can be missing while the document is still being built;
  // fall back to an empty string so the title is still checked.
  const text = document.body?.innerText.slice(0, 8000) ?? '';
  const title = document.title;
  const notFoundPatterns = [
    /page\s+not\s+found/i,
    /the\s+page\s+you\s+are\s+looking\s+for/i,
    /this\s+page\s+(has\s+been\s+)?removed/i,
    /product\s+(is\s+)?unavailable/i,
    /页面不存在|找不到(此|该)?页面|抱歉.*不存在|(商品|产品)已下架/,
  ];

  return notFoundPatterns.some((pattern) => pattern.test(title) || pattern.test(text));
}
|
||||
|
||||
/**
 * Waits until an element matching `selector` is visible and still visible
 * after a short settle delay.
 *
 * @param selector CSS selector of the element that marks the page as ready.
 * @param timeoutMs Maximum time to keep polling, in milliseconds.
 * @returns The stable element, or `null` when the deadline passes first.
 */
async function waitForStableSelector(selector: string, timeoutMs: number): Promise<Element | null> {
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    const element = document.querySelector(selector);

    if (element && isVisibleElement(element)) {
      // Give the page a moment to finish re-rendering, then look again.
      await sleep(600);
      const stableElement = document.querySelector(selector);

      if (stableElement && isVisibleElement(stableElement)) {
        return stableElement;
      }

      // The element flickered away during the settle delay. Keep polling
      // until the deadline instead of giving up on the first re-render.
      continue;
    }

    await sleep(500);
  }

  return null;
}
|
||||
|
||||
/**
 * Reports whether an element is actually visible: attached to the document,
 * not hidden through `display`, `visibility` or near-zero `opacity`, and
 * occupying a non-empty box.
 */
function isVisibleElement(element: Element): boolean {
  if (!element.isConnected) {
    return false;
  }

  const view = element.ownerDocument.defaultView;
  const computed = view?.getComputedStyle(element);
  if (!computed) {
    return false;
  }

  const hiddenByCss =
    computed.display === 'none' || computed.visibility === 'hidden' || Number(computed.opacity) < 0.05;
  if (hiddenByCss) {
    return false;
  }

  const box = element.getBoundingClientRect();
  return box.width > 0 && box.height > 0;
}
|
||||
|
||||
/**
 * Resolves after `ms` milliseconds; a minimal delay helper built on
 * `window.setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    window.setTimeout(wake, ms);
  });
}
|
||||
@@ -1,111 +1,39 @@
|
||||
<script setup lang="ts">
|
||||
import { computed, onMounted, onUnmounted, ref } from 'vue';
|
||||
import { PLATFORM_CONFIGS } from '@/config/platforms';
|
||||
import { getToken, logout, mockLogin } from '@/shared/auth';
|
||||
import type { CrawlTaskState } from '@/types';
|
||||
import {useLogin} from "./hook/use-login";
|
||||
import {platformConfigs} from "@/config/platforms";
|
||||
import {useScan} from "./hook/use-scan";
|
||||
import {computed} from "vue";
|
||||
import {formatSeconds} from "@/shared/time_format";
|
||||
|
||||
const token = ref<string | null>(null);
|
||||
const selectedPlatformId = ref(PLATFORM_CONFIGS[0]?.id ?? '');
|
||||
const isLoading = ref(true);
|
||||
const isScanning = ref(false);
|
||||
const errorMessage = ref('');
|
||||
const crawlState = ref<CrawlTaskState | null>(null);
|
||||
const elapsedSeconds = ref(0);
|
||||
let timer: number | undefined;
|
||||
/**
|
||||
* 登录逻辑
|
||||
*/
|
||||
const {isLoggedIn, handleLogin, handleLogout} = useLogin()
|
||||
|
||||
const selectedPlatform = computed(() =>
|
||||
PLATFORM_CONFIGS.find((platform) => platform.id === selectedPlatformId.value) ?? null,
|
||||
/**
|
||||
* 爬取逻辑的数据
|
||||
*/
|
||||
const {
|
||||
selectedPlatformId,
|
||||
isScanning,
|
||||
crawlState,
|
||||
handleScan,
|
||||
handleCancelCrawl,
|
||||
elapsedSeconds
|
||||
} = useScan()
|
||||
|
||||
|
||||
/**
|
||||
* 显示进度条
|
||||
*/
|
||||
const shouldShowCrawlProgress = computed<boolean>(() =>
|
||||
crawlState.value != null
|
||||
);
|
||||
|
||||
const isLoggedIn = computed(() => token.value !== null);
|
||||
const isCrawling = computed(() => crawlState.value?.status === 'running');
|
||||
|
||||
onMounted(async () => {
|
||||
token.value = await getToken();
|
||||
await refreshCrawlState();
|
||||
timer = window.setInterval(() => {
|
||||
updateElapsedSeconds();
|
||||
void refreshCrawlState();
|
||||
}, 1000);
|
||||
isLoading.value = false;
|
||||
});
|
||||
|
||||
onUnmounted(() => {
|
||||
if (timer) {
|
||||
window.clearInterval(timer);
|
||||
}
|
||||
});
|
||||
|
||||
async function handleLogin() {
|
||||
errorMessage.value = '';
|
||||
token.value = await mockLogin();
|
||||
}
|
||||
|
||||
async function handleLogout() {
|
||||
errorMessage.value = '';
|
||||
await logout();
|
||||
token.value = null;
|
||||
}
|
||||
|
||||
async function handleScan() {
|
||||
errorMessage.value = '';
|
||||
|
||||
if (!selectedPlatform.value) {
|
||||
errorMessage.value = '请选择要爬取的平台';
|
||||
return;
|
||||
}
|
||||
|
||||
isScanning.value = true;
|
||||
|
||||
try {
|
||||
const response = await sendBackgroundMessage<CrawlTaskState>({
|
||||
action: 'START_CRAWL',
|
||||
payload: { platformId: selectedPlatform.value.id },
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
errorMessage.value = response.error ?? '打开平台窗口失败';
|
||||
return;
|
||||
}
|
||||
|
||||
crawlState.value = response.data ?? null;
|
||||
updateElapsedSeconds();
|
||||
} catch (error: unknown) {
|
||||
errorMessage.value = error instanceof Error ? error.message : '打开平台窗口失败';
|
||||
} finally {
|
||||
isScanning.value = false;
|
||||
}
|
||||
}
|
||||
|
||||
async function handleCancelCrawl() {
|
||||
const response = await sendBackgroundMessage<CrawlTaskState>({ action: 'CANCEL_CRAWL' });
|
||||
crawlState.value = response.data ?? null;
|
||||
}
|
||||
|
||||
async function refreshCrawlState() {
|
||||
const response = await sendBackgroundMessage<CrawlTaskState | null>({ action: 'GET_CRAWL_STATE' });
|
||||
|
||||
if (response.ok) {
|
||||
crawlState.value = response.data ?? null;
|
||||
updateElapsedSeconds();
|
||||
}
|
||||
}
|
||||
|
||||
function updateElapsedSeconds() {
|
||||
if (!crawlState.value) {
|
||||
elapsedSeconds.value = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
elapsedSeconds.value = Math.max(0, Math.floor((Date.now() - crawlState.value.startedAt) / 1000));
|
||||
}
|
||||
|
||||
function formatElapsed(totalSeconds: number): string {
|
||||
const minutes = Math.floor(totalSeconds / 60).toString().padStart(2, '0');
|
||||
const seconds = (totalSeconds % 60).toString().padStart(2, '0');
|
||||
return `${minutes}:${seconds}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取进度样式
|
||||
*/
|
||||
function getStepClass(status: string): string {
|
||||
if (status === 'running') {
|
||||
return 'border-emerald-500 bg-emerald-50 text-emerald-700';
|
||||
@@ -133,13 +61,6 @@ function getStepText(status: string): string {
|
||||
return textMap[status] ?? status;
|
||||
}
|
||||
|
||||
function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data?: T; error?: string }> {
|
||||
if (typeof chrome === 'undefined' || !chrome.runtime?.sendMessage) {
|
||||
return Promise.resolve({ ok: true, data: null as T });
|
||||
}
|
||||
|
||||
return chrome.runtime.sendMessage(message);
|
||||
}
|
||||
</script>
|
||||
|
||||
<template>
|
||||
@@ -150,43 +71,60 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
|
||||
<p class="text-sm leading-5 text-slate-600">自动打开商家后台,按平台配置顺序采集页面数据</p>
|
||||
</header>
|
||||
|
||||
<div v-if="isLoading" class="rounded-md border border-slate-200 bg-white px-3 py-4 text-sm text-slate-500">
|
||||
正在读取登录状态...
|
||||
</div>
|
||||
|
||||
<template v-else-if="!isLoggedIn">
|
||||
<template v-if="!isLoggedIn">
|
||||
<button type="button"
|
||||
class="rounded-md bg-slate-900 px-4 py-2.5 text-sm font-medium text-white transition hover:bg-slate-700"
|
||||
@click="handleLogin">
|
||||
class="rounded-md bg-slate-900 px-4 py-2.5 text-sm font-medium text-white transition hover:bg-slate-700"
|
||||
@click="handleLogin">
|
||||
请登录
|
||||
</button>
|
||||
</template>
|
||||
|
||||
<template v-else-if="isCrawling && crawlState">
|
||||
<template v-else-if="shouldShowCrawlProgress && crawlState">
|
||||
<section class="space-y-4">
|
||||
<div class="flex items-center justify-between rounded-md bg-white px-3 py-2 shadow-sm">
|
||||
<div>
|
||||
<p class="text-sm font-medium text-slate-800">{{ crawlState.platformName }}</p>
|
||||
<p class="text-xs text-slate-500">已运行 {{ formatElapsed(elapsedSeconds) }}</p>
|
||||
<p class="text-xs text-slate-500">
|
||||
{{
|
||||
crawlState.status === 'paused' ? '已暂停' : '已运行 ' + formatSeconds(elapsedSeconds)
|
||||
}}
|
||||
</p>
|
||||
</div>
|
||||
<button type="button" class="text-xs text-red-600 transition hover:text-red-700"
|
||||
@click="handleCancelCrawl">
|
||||
取消
|
||||
</button>
|
||||
<div class="flex items-center gap-2">
|
||||
<!-- <button v-if="crawlState.status === 'paused'" type="button"-->
|
||||
<!-- class="text-xs text-emerald-600 transition hover:text-emerald-700"-->
|
||||
<!-- @click="handleResumeCrawl">-->
|
||||
<!-- 继续-->
|
||||
<!-- </button>-->
|
||||
<button type="button" class="text-xs text-red-600 transition hover:text-red-700"
|
||||
@click="handleCancelCrawl">
|
||||
取消
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div v-if="crawlState.status === 'paused' && crawlState.pause"
|
||||
class="rounded-md border border-amber-200 bg-amber-50 px-3 py-2 text-sm text-amber-800">
|
||||
{{ crawlState.pause.message }}
|
||||
</div>
|
||||
|
||||
<ol class="space-y-3">
|
||||
<li v-for="(step, index) in crawlState.steps" :key="step.uniqueKey"
|
||||
class="relative border-l-2 border-slate-200 pl-4">
|
||||
<span
|
||||
class="absolute -left-[7px] top-1 h-3 w-3 rounded-full border-2 border-white bg-slate-300"
|
||||
:class="{ 'bg-emerald-500': step.status === 'running' || step.status === 'success', 'bg-red-500': step.status === 'failed' }"></span>
|
||||
<span
|
||||
class="absolute -left-[7px] top-1 h-3 w-3 rounded-full border-2 border-white bg-slate-300"
|
||||
:class="{ 'bg-emerald-500': step.status === 'running' || step.status === 'success', 'bg-red-500': step.status === 'failed' }"></span>
|
||||
<div class="rounded-md border px-3 py-2 text-sm" :class="getStepClass(step.status)">
|
||||
<div class="flex items-center justify-between gap-3">
|
||||
<span class="font-medium">{{ index + 1 }}. {{ step.name }}</span>
|
||||
<span class="text-xs">{{ getStepText(step.status) }}</span>
|
||||
</div>
|
||||
<p v-if="step.message" class="mt-1 text-xs">{{ step.message }}</p>
|
||||
<pre v-if="step.result !== undefined"
|
||||
class="mt-2 max-h-32 overflow-auto rounded bg-slate-950 p-2 text-[11px] leading-4 text-slate-100">{{
|
||||
JSON.stringify(step.result, null, 2)
|
||||
}}</pre>
|
||||
</div>
|
||||
</li>
|
||||
</ol>
|
||||
@@ -197,28 +135,27 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
|
||||
<label class="space-y-2">
|
||||
<span class="text-sm font-medium text-slate-700">平台选择</span>
|
||||
<select v-model="selectedPlatformId"
|
||||
class="w-full rounded-md border border-slate-300 bg-white px-3 py-2 text-sm outline-none transition focus:border-slate-800 focus:ring-2 focus:ring-slate-200">
|
||||
<option v-for="platform in PLATFORM_CONFIGS" :key="platform.id" :value="platform.id">
|
||||
class="w-full rounded-md border border-slate-300 bg-white px-3 py-2 text-sm outline-none transition focus:border-slate-800 focus:ring-2 focus:ring-slate-200">
|
||||
<option v-for="platform in platformConfigs"
|
||||
:key="platform.id"
|
||||
:value="platform.id">
|
||||
{{ platform.name }}
|
||||
</option>
|
||||
</select>
|
||||
</label>
|
||||
|
||||
<button type="button"
|
||||
class="rounded-md bg-emerald-600 px-4 py-2.5 text-sm font-medium text-white transition hover:bg-emerald-500 disabled:cursor-not-allowed disabled:bg-slate-300"
|
||||
:disabled="isScanning" @click="handleScan">
|
||||
class="rounded-md bg-emerald-600 px-4 py-2.5 text-sm font-medium text-white transition hover:bg-emerald-500 disabled:cursor-not-allowed disabled:bg-slate-300"
|
||||
:disabled="isScanning" @click="handleScan">
|
||||
{{ isScanning ? '正在打开...' : '立即爬取' }}
|
||||
</button>
|
||||
</template>
|
||||
|
||||
<p v-if="errorMessage" class="rounded-md bg-red-50 px-3 py-2 text-sm text-red-700">
|
||||
{{ errorMessage }}
|
||||
</p>
|
||||
|
||||
<footer
|
||||
class="mt-auto flex items-center justify-between border-t border-slate-200 pt-4 text-xs text-slate-500">
|
||||
<button v-if="isLoggedIn" type="button" class="text-slate-600 transition hover:text-slate-900"
|
||||
@click="handleLogout">
|
||||
@click="handleLogout">
|
||||
退出
|
||||
</button>
|
||||
<span v-else></span>
|
||||
|
||||
35
src/popup/hook/use-login.ts
Normal file
35
src/popup/hook/use-login.ts
Normal file
@@ -0,0 +1,35 @@
|
||||
import {computed, onMounted, ref} from "vue";
|
||||
import {getToken, logout, setToken} from "@/shared/auth";
|
||||
|
||||
/**
 * Popup composable that tracks the stored auth token.
 *
 * Exposes a reactive `isLoggedIn` flag plus login/logout handlers. The token
 * is read from storage once the component is mounted.
 */
export const useLogin = () => {
  const token = ref<string | null>(null);
  const isLoggedIn = computed(() => token.value !== null);

  // Load the persisted token as soon as the popup is mounted.
  onMounted(async () => {
    token.value = await getToken();
  });

  /** Logs in by persisting a placeholder token and mirroring it locally. */
  async function handleLogin() {
    const value = "xxx";
    await setToken(value);
    token.value = value;
  }

  /** Logs out by clearing the persisted token and the local copy. */
  async function handleLogout() {
    await logout();
    token.value = null;
  }

  return { isLoggedIn, handleLogin, handleLogout };
};
|
||||
144
src/popup/hook/use-scan.ts
Normal file
144
src/popup/hook/use-scan.ts
Normal file
@@ -0,0 +1,144 @@
|
||||
import { onMounted, onUnmounted, ref } from 'vue';
|
||||
import { platformConfigs } from '@/config/platforms';
|
||||
import type { CrawlTaskState } from '@/types';
|
||||
import { sendBackgroundMessage } from '@/shared/message';
|
||||
|
||||
const CRAWL_TASK_STORAGE_KEY = 'crawlTaskState';
|
||||
const ACTIVE_STATUSES = new Set(['running', 'paused']);
|
||||
|
||||
/**
 * Popup composable that drives the crawl UI.
 *
 * It starts and cancels crawl tasks through the background service worker,
 * mirrors the task snapshot written to `chrome.storage.local`, and keeps an
 * elapsed-seconds counter ticking while a task is running or paused.
 *
 * @returns Reactive state (`selectedPlatformId`, `isScanning`, `crawlState`,
 *   `elapsedSeconds`) and the `handleScan` / `handleCancelCrawl` actions.
 */
export const useScan = () => {
  const selectedPlatformId = ref(platformConfigs[0]?.id ?? '');
  const isScanning = ref<boolean>(false);
  const crawlState = ref<CrawlTaskState | null>(null);
  const elapsedSeconds = ref<number>(0);

  let timer: number | undefined;
  // Set on unmount so work that finishes after an await does not re-attach listeners.
  let disposed = false;

  /** Asks the background worker to start crawling the selected platform. */
  const handleScan = async () => {
    if (isScanning.value) {
      return;
    }

    isScanning.value = true;

    try {
      // Start ticking right away so the counter is live as soon as state arrives.
      ensureElapsedTimer();

      const response = await sendBackgroundMessage<CrawlTaskState>({
        action: 'START_CRAWL',
        payload: { platformId: selectedPlatformId.value },
      });

      if (response.ok) {
        syncCrawlState(response.data ?? null);
      } else {
        console.error('[crawl] start failed', response.error);
      }
    } catch (error: unknown) {
      // A rejected sendMessage must not surface as an unhandled rejection from a click handler.
      console.error('[crawl] start failed', error);
    } finally {
      isScanning.value = false;
      // If the start failed there is no active task; do not leave the interval running.
      stopTimerIfIdle();
    }
  };

  /** Asks the background worker to cancel the current crawl task. */
  const handleCancelCrawl = async () => {
    try {
      const response = await sendBackgroundMessage<CrawlTaskState | null>({ action: 'CANCEL_CRAWL' });

      if (response.ok) {
        syncCrawlState(response.data ?? null);
        return;
      }

      console.error('[crawl] cancel failed', response.error);
      // The cancel did not go through; re-read the real task state.
      await refreshCrawlState();
    } catch (error: unknown) {
      console.error('[crawl] cancel failed', error);
    }
  };

  /** Stores the new snapshot and starts/stops the timer to match its status. */
  function syncCrawlState(state: CrawlTaskState | null) {
    crawlState.value = state;
    updateSeconds();

    if (state && ACTIVE_STATUSES.has(state.status)) {
      ensureElapsedTimer();
      return;
    }

    clearElapsedTimer();
  }

  /** Starts the one-second elapsed timer unless it is already running. */
  function ensureElapsedTimer() {
    if (timer !== undefined) {
      return;
    }

    timer = window.setInterval(() => {
      updateSeconds();
    }, 1000);
  }

  /** Stops the elapsed timer if it is running. */
  function clearElapsedTimer() {
    if (timer === undefined) {
      return;
    }

    window.clearInterval(timer);
    timer = undefined;
  }

  /** Stops the timer when there is no running or paused task to time. */
  function stopTimerIfIdle() {
    const current = crawlState.value;

    if (!current || !ACTIVE_STATUSES.has(current.status)) {
      clearElapsedTimer();
    }
  }

  /** Recomputes `elapsedSeconds` from the task's `startedAt` timestamp. */
  function updateSeconds() {
    if (!crawlState.value) {
      elapsedSeconds.value = 0;
      return;
    }

    elapsedSeconds.value = Math.max(0, Math.floor((Date.now() - crawlState.value.startedAt) / 1000));
  }

  /** Fetches the current task snapshot from the background worker. */
  async function refreshCrawlState() {
    const response = await sendBackgroundMessage<CrawlTaskState | null>({ action: 'GET_CRAWL_STATE' });

    if (response.ok) {
      syncCrawlState(response.data ?? null);
    }
  }

  /** Mirrors task snapshots written to `chrome.storage.local` by other contexts. */
  function handleStorageChanged(changes: Record<string, chrome.storage.StorageChange>, areaName: string) {
    if (areaName !== 'local') {
      return;
    }

    const change = changes[CRAWL_TASK_STORAGE_KEY];

    if (!change) {
      return;
    }

    syncCrawlState(isCrawlTaskState(change.newValue) ? change.newValue : null);
  }

  onMounted(async () => {
    try {
      await refreshCrawlState();
    } catch (error: unknown) {
      // A failed initial read must not prevent live updates from being wired up.
      console.error('[crawl] refresh failed', error);
    }

    // The component may have been unmounted while the refresh was in flight.
    if (disposed) {
      return;
    }

    if (typeof chrome !== 'undefined' && chrome.storage?.onChanged) {
      chrome.storage.onChanged.addListener(handleStorageChanged);
    }
  });

  onUnmounted(() => {
    disposed = true;
    clearElapsedTimer();

    if (typeof chrome !== 'undefined' && chrome.storage?.onChanged) {
      chrome.storage.onChanged.removeListener(handleStorageChanged);
    }
  });

  return {
    selectedPlatformId,
    isScanning,
    crawlState,
    handleScan,
    handleCancelCrawl,
    elapsedSeconds,
  };
};
|
||||
|
||||
/**
 * Runtime guard for values read back from extension storage.
 *
 * Checks the shape of the two fields the popup relies on (`id` is a string,
 * `steps` is an array), not just that the keys exist, so a malformed value
 * is rejected rather than rendered.
 */
function isCrawlTaskState(value: unknown): value is CrawlTaskState {
  if (typeof value !== 'object' || value === null) {
    return false;
  }

  // Safe after the object check above; fields are still validated individually.
  const candidate = value as Record<string, unknown>;

  return typeof candidate.id === 'string' && Array.isArray(candidate.steps);
}
|
||||
@@ -1,7 +1,8 @@
|
||||
const AUTH_TOKEN_KEY = 'token';
|
||||
const MOCK_TOKEN = 'mock-extension-token';
|
||||
const AUTH_TOKEN_KEY = 'token';
|
||||
|
||||
/** 获取当前登录 token。 */
|
||||
/**
|
||||
* 获取当前登录 token。
|
||||
*/
|
||||
export async function getToken(): Promise<string | null> {
|
||||
const storage = getChromeStorage();
|
||||
|
||||
@@ -14,13 +15,10 @@ export async function getToken(): Promise<string | null> {
|
||||
return window.localStorage.getItem(AUTH_TOKEN_KEY);
|
||||
}
|
||||
|
||||
/** 模拟登录,写入一个临时 token,方便后续替换真实登录逻辑。 */
|
||||
export async function mockLogin(): Promise<string> {
|
||||
await setToken(MOCK_TOKEN);
|
||||
return MOCK_TOKEN;
|
||||
}
|
||||
|
||||
/** 清除当前登录 token。 */
|
||||
/**
|
||||
* 清除当前登录 token。
|
||||
*/
|
||||
export async function logout(): Promise<void> {
|
||||
const storage = getChromeStorage();
|
||||
|
||||
@@ -28,11 +26,12 @@ export async function logout(): Promise<void> {
|
||||
await storage.remove(AUTH_TOKEN_KEY);
|
||||
return;
|
||||
}
|
||||
console.log("溢出")
|
||||
|
||||
window.localStorage.removeItem(AUTH_TOKEN_KEY);
|
||||
}
|
||||
|
||||
async function setToken(token: string): Promise<void> {
|
||||
export async function setToken(token: string): Promise<void> {
|
||||
const storage = getChromeStorage();
|
||||
|
||||
if (storage) {
|
||||
@@ -44,7 +43,7 @@ async function setToken(token: string): Promise<void> {
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取谷歌缓存
|
||||
* * 获取谷歌缓存
|
||||
*/
|
||||
function getChromeStorage(): chrome.storage.StorageArea | null {
|
||||
if (typeof chrome === 'undefined' || !chrome.storage?.local) {
|
||||
|
||||
27
src/shared/message.ts
Normal file
27
src/shared/message.ts
Normal file
@@ -0,0 +1,27 @@
|
||||
/** Commands understood by the background service worker. */
export type MessageAction =
  | 'GET_CRAWL_STATE'
  | 'START_CRAWL'
  | 'CANCEL_CRAWL'
  | 'RESUME_CRAWL';

/** Message envelope sent to the background service worker. */
interface BackgroundMessage<T = unknown> {
  action: MessageAction;
  payload?: T;
}

/** Reply envelope returned by the background service worker. */
interface BackgroundResponse<T = unknown> {
  ok: boolean;
  data: T | null;
  error?: string;
}

/**
 * Sends a command to the background service worker.
 *
 * Outside an extension context (no `chrome.runtime.sendMessage`) it resolves
 * to an empty successful response. Transport failures and missing or malformed
 * replies are reported as `{ ok: false }` instead of rejecting, so callers
 * only need to branch on `ok`.
 *
 * @param data The command and optional payload to send.
 * @returns The background reply, or a synthesized failure response.
 */
export async function sendBackgroundMessage<T>(data: BackgroundMessage): Promise<BackgroundResponse<T>> {
  if (typeof chrome === 'undefined' || !chrome.runtime?.sendMessage) {
    return { ok: true, data: null };
  }

  try {
    const response: unknown = await chrome.runtime.sendMessage(data);

    // sendMessage resolves to undefined when no listener replies; callers
    // would otherwise crash reading `.ok` from it.
    if (typeof response !== 'object' || response === null || typeof (response as { ok?: unknown }).ok !== 'boolean') {
      return { ok: false, data: null, error: 'No valid response from background service worker' };
    }

    return response as BackgroundResponse<T>;
  } catch (error: unknown) {
    // For example "Could not establish connection" while the worker is unavailable.
    return {
      ok: false,
      data: null,
      error: error instanceof Error ? error.message : String(error),
    };
  }
}
|
||||
9
src/shared/time_format.ts
Normal file
9
src/shared/time_format.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
/**
 * Formats a duration in seconds as `MM:SS`.
 *
 * Minutes are not capped at 59 (3600 seconds renders as `60:00`). Fractional
 * input is floored; negative or non-finite input renders as `00:00`.
 *
 * @param totalSeconds Duration in seconds.
 * @returns The zero-padded `MM:SS` string.
 */
export function formatSeconds(totalSeconds: number): string {
  // Normalize first so padStart never sees "-5", "1.5" or "NaN".
  const wholeSeconds = Number.isFinite(totalSeconds) ? Math.max(0, Math.floor(totalSeconds)) : 0;
  const minutes = Math.floor(wholeSeconds / 60).toString().padStart(2, '0');
  const seconds = (wholeSeconds % 60).toString().padStart(2, '0');
  return `${minutes}:${seconds}`;
}
|
||||
@@ -1,45 +1,51 @@
|
||||
/**
|
||||
* 单个爬取步骤的执行状态。
|
||||
*/
|
||||
// 单个爬取步骤的执行状态。
|
||||
export type CrawlStepStatus = 'pending' | 'running' | 'success' | 'failed';
|
||||
|
||||
/**
|
||||
* 整体爬取任务状态。
|
||||
*/
|
||||
export type CrawlTaskStatus = 'running' | 'completed' | 'failed' | 'canceled';
|
||||
// 整体爬取任务状态。
|
||||
export type CrawlTaskStatus = 'running' | 'paused' | 'completed' | 'failed' | 'canceled';
|
||||
|
||||
/**
|
||||
* 时间轴中的单个爬取步骤进度。
|
||||
*/
|
||||
// 时间轴中的单个爬取步骤进度。
|
||||
export interface CrawlProgressStep {
|
||||
/** 步骤名称,用于展示给用户。 */
|
||||
// 步骤名称,用于展示给用户。
|
||||
name: string;
|
||||
/** 步骤唯一标识,对应平台配置 steps 中的 uniqueKey。 */
|
||||
// 步骤唯一标识,对应平台配置 steps 中的 uniqueKey。
|
||||
uniqueKey: string;
|
||||
/** 当前步骤执行状态。 */
|
||||
// 当前步骤执行状态。
|
||||
status: CrawlStepStatus;
|
||||
/** 状态补充说明,如失败原因。 */
|
||||
// 状态补充说明,如失败原因。
|
||||
message?: string;
|
||||
// 当前步骤抓取到的数据结果。
|
||||
result?: unknown;
|
||||
}
|
||||
|
||||
// 爬取暂停原因,通常由登录、验证码或页面不存在触发。
|
||||
export interface CrawlPauseInfo {
|
||||
// 暂停原因编码。
|
||||
reason: 'reauth' | 'shield' | 'not_found' | 'page_not_ready';
|
||||
// 展示给用户看的处理提示。
|
||||
message: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* 当前正在执行的爬取任务快照,供 popup 和 content script 同步展示。
|
||||
*/
|
||||
export interface CrawlTaskState {
|
||||
/** 任务唯一标识。 */
|
||||
// 任务唯一标识。
|
||||
id: string;
|
||||
/** 当前爬取平台 ID。 */
|
||||
// 当前爬取平台 ID。
|
||||
platformId: string;
|
||||
/** 当前爬取平台名称。 */
|
||||
// 当前爬取平台名称。
|
||||
platformName: string;
|
||||
/** 爬取窗口 ID,由 background 创建窗口后写入。 */
|
||||
// 爬取窗口 ID,由 background 创建窗口后写入。
|
||||
windowId?: number;
|
||||
/** 任务开始时间戳。 */
|
||||
// 任务开始时间戳。
|
||||
startedAt: number;
|
||||
/** 当前任务状态。 */
|
||||
// 当前任务状态。
|
||||
status: CrawlTaskStatus;
|
||||
/** 当前执行到的步骤下标。 */
|
||||
// 暂停信息;仅 status 为 paused 时存在。
|
||||
pause?: CrawlPauseInfo;
|
||||
// 当前执行到的步骤下标。
|
||||
currentStepIndex: number;
|
||||
/** 平台 steps 映射出的时间轴进度。 */
|
||||
// 平台 steps 映射出的时间轴进度。
|
||||
steps: CrawlProgressStep[];
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
export type {
|
||||
CrawlPauseInfo,
|
||||
CrawlProgressStep,
|
||||
CrawlStepStatus,
|
||||
CrawlTaskState,
|
||||
|
||||
@@ -1,100 +1,76 @@
|
||||
/**
|
||||
* 字段采集类型:0 普通元素(默认),1 列表,2 表格(带分页)。
|
||||
*/
|
||||
// 字段采集类型:0 普通元素(默认),1 列表,2 表格(带分页)。
|
||||
export type PlatformFieldType = 0 | 1 | 2;
|
||||
|
||||
/**
|
||||
* 条件点击配置,用于进入某个页面或采集某个字段前按顺序点击页面元素。
|
||||
*/
|
||||
// 条件点击配置,用于进入某个页面或采集某个字段前按顺序点击页面元素。
|
||||
export interface PlatformClickCondition {
|
||||
/** 需要点击的元素选择器列表,会按数组顺序依次执行。 */
|
||||
// 需要点击的元素选择器列表,会按数组顺序依次执行。
|
||||
list: string[];
|
||||
/** 点击后的等待时间,单位毫秒。 */
|
||||
// 点击后的等待时间,单位毫秒。
|
||||
time: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* 分页配置,用于列表或表格字段存在翻页时控制下一页采集。
|
||||
*/
|
||||
// 分页配置,用于列表或表格字段存在翻页时控制下一页采集。
|
||||
export interface PlatformPaginationConfig {
|
||||
/** 下一页按钮的 CSS 选择器。 */
|
||||
// 下一页按钮的 CSS 选择器。
|
||||
nextBtn: string;
|
||||
/** 最多采集页数,避免无限翻页。 */
|
||||
// 最多采集页数,避免无限翻页。
|
||||
maxPage?: number;
|
||||
/** 每次翻页后的等待时间,单位毫秒。 */
|
||||
// 每次翻页后的等待时间,单位毫秒。
|
||||
delay?: number;
|
||||
/** 下一页按钮不可用时的 class 名称。 */
|
||||
// 下一页按钮不可用时的 class 名称。
|
||||
disabledClass?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* 表格分段配置,用于兼容一个数据块由多个 table 或多个 table 片段组成的情况。
|
||||
*/
|
||||
// 表格分段配置,用于兼容一个数据块由多个 table 或多个 table 片段组成的情况。
|
||||
export interface PlatformTablePartConfig {
|
||||
/** 当前 table 或表格片段的名称。 */
|
||||
label: string;
|
||||
/** 当前 table 或表格片段的兼容名称,兼容 message.js 中的 name 写法。 */
|
||||
name?: string;
|
||||
/** 当前 table 或表格片段的 CSS 选择器。 */
|
||||
className: string;
|
||||
/** 当前 table 或表格片段的兼容选择器,兼容 message.js 中的 select 写法。 */
|
||||
select?: string;
|
||||
/** 行元素选择器,不填时由采集逻辑使用默认行选择器。 */
|
||||
rowSelector?: string;
|
||||
/** 当前 table 或表格片段下需要采集的字段。 */
|
||||
keys?: PlatformFieldConfig[];
|
||||
}
|
||||
|
||||
/**
|
||||
* 页面字段配置,描述一个普通元素、列表元素或表格元素如何从 DOM 中提取数据。
|
||||
*/
|
||||
// 页面字段配置,描述一个普通元素、列表元素或表格元素如何从 DOM 中提取数据。
|
||||
export interface PlatformFieldConfig {
|
||||
/** 字段显示名,也是最终打印数据中的键名。 */
|
||||
// 字段显示名,也是最终打印数据中的键名。
|
||||
label: string;
|
||||
/** 字段对应的 CSS 选择器。 */
|
||||
// 字段对应的 CSS 选择器。
|
||||
className: string;
|
||||
/** 需要提取的属性名;不填时默认提取文本,图片和链接会自动取 src/href。 */
|
||||
// 需要提取的属性名;不填时默认提取文本,图片和链接会自动取 src/href。
|
||||
attr?: string;
|
||||
/** 表格字段所属的表格分段名称,用于横向拼接多 table 行数据。 */
|
||||
// 表格字段所属的表格分段名称,用于横向拼接多 table 行数据。
|
||||
part?: string;
|
||||
/** 字段类型:0 普通元素(默认),1 列表,2 表格。 */
|
||||
// 字段类型:0 普通元素(默认),1 列表,2 表格。
|
||||
type?: PlatformFieldType;
|
||||
/** 进入该字段采集前需要执行的点击条件。 */
|
||||
// 进入该字段采集前需要执行的点击条件。
|
||||
condition?: PlatformClickCondition;
|
||||
/** 子元素字段;普通元素下表示嵌套键值,列表或表格下表示每项/每行的字段。 */
|
||||
// 子元素字段;普通元素下表示嵌套键值,列表或表格下表示每项/每行的字段。
|
||||
keys?: PlatformFieldConfig[];
|
||||
/** 表格专用配置,用于多个 table 或分段 table 的组合采集。 */
|
||||
// 表格专用配置,用于多个 table 或分段 table 的组合采集。
|
||||
tableParts?: PlatformTablePartConfig[];
|
||||
/** 分页配置,常用于列表和表格字段。 */
|
||||
// 分页配置,常用于列表和表格字段。
|
||||
pagination?: PlatformPaginationConfig;
|
||||
}
|
||||
|
||||
/**
|
||||
* 单个抓取页面步骤配置,描述页面地址、可用性检查和需要采集的字段。
|
||||
*/
|
||||
// 单个抓取页面步骤配置,描述页面地址、可用性检查和需要采集的字段。
|
||||
export interface PlatformStepConfig {
|
||||
/** 步骤显示名,用于进度展示。 */
|
||||
// 步骤显示名,用于进度展示。
|
||||
name: string;
|
||||
/** 步骤唯一标识,用于状态记录和结果归类。 */
|
||||
// 步骤唯一标识,用于状态记录和结果归类。
|
||||
uniqueKey: string;
|
||||
/** 当前步骤需要打开或跳转到的页面地址。 */
|
||||
// 当前步骤需要打开或跳转到的页面地址。
|
||||
url: string;
|
||||
/** 判断页面 DOM 是否加载完成的 CSS 选择器。 */
|
||||
// 判断页面 DOM 是否加载完成的 CSS 选择器。
|
||||
checkSelector: string;
|
||||
/** 当前页面需要采集的字段列表。 */
|
||||
// 当前页面需要采集的字段列表。
|
||||
fields: PlatformFieldConfig[];
|
||||
/** 进入该步骤前需要执行的点击条件。 */
|
||||
// 进入该步骤前需要执行的点击条件。
|
||||
condition?: PlatformClickCondition;
|
||||
}
|
||||
|
||||
/**
|
||||
* 平台抓取配置,描述一个商家后台平台的入口地址和页面抓取顺序。
|
||||
*/
|
||||
// 平台抓取配置,描述一个商家后台平台的入口地址和页面抓取顺序。
|
||||
export interface PlatformConfig {
|
||||
/** 平台唯一标识,用于 popup 选择和后台任务定位。 */
|
||||
// 平台唯一标识,用于 popup 选择和后台任务定位。
|
||||
id: string;
|
||||
/** 平台显示名称。 */
|
||||
// 平台显示名称。
|
||||
name: string;
|
||||
/** 当前平台的页面抓取顺序。 */
|
||||
// 当前平台的页面抓取顺序。
|
||||
steps: PlatformStepConfig[];
|
||||
}
|
||||
|
||||
52
step.md
52
step.md
@@ -1,52 +0,0 @@
|
||||
# 项目结构
|
||||
```angular2html
|
||||
src:.
|
||||
├─assets # 静态资源目录
|
||||
│ vite.svg # 这里的资源通常用于图标、Logo 或扩展程序内部引用的图片
|
||||
│
|
||||
├─background # 后台脚本 (Background Script / Service Worker)
|
||||
│ index.ts # 扩展的“大脑”,常驻后台运行,处理事件监听、报文转发、存储管理等
|
||||
│
|
||||
├─config # 配置目录
|
||||
│ platforms.ts # 自定义配置,各种平台(如不同网站、不同浏览器)的适配配置
|
||||
│
|
||||
├─content # 内容脚本 (Content Script)
|
||||
│ │ App.vue # 注入到网页中的 UI 组件(通常用于在目标页面侧边栏或浮窗显示界面)
|
||||
│ │ main.ts # 内容脚本的入口文件,负责将 Vue 组件挂载到宿主页面的 DOM 中
|
||||
│ │
|
||||
│ └─views # 内容脚本相关的子视图或组件
|
||||
│
|
||||
├─options # 选项页 (Options Page)
|
||||
│ App.vue # 扩展设置页面的 UI(右键扩展图标点击“选项”打开的页面)
|
||||
│ index.html # 选项页的 HTML 宿主文件
|
||||
│ main.ts # 选项页的 Vue 入口文件
|
||||
│
|
||||
├─popup # 弹窗页 (Popup Page)
|
||||
│ App.vue # 点击扩展图标时显示的弹出框 UI
|
||||
│ index.html # 弹窗页的 HTML 宿主文件
|
||||
│ main.ts # 弹窗页的 Vue 入口文件
|
||||
│
|
||||
├─shared # 共享代码库 (Shared)
|
||||
│ # 存放被 background、content、popup 等多个模块共同引用的工具函数、常量、API封装等
|
||||
│
|
||||
└─types # 类型定义目录
|
||||
index.ts # 存放全局的 TypeScript 接口(Interface)和类型(Type)定义
|
||||
```
|
||||
|
||||
# 开发步骤
|
||||
1.在popup模块中的App.vue中用tailwindcss编写,点击扩展图标时出现的弹窗,逻辑如下
|
||||
- 在未登录情况下,即storage中token字段是否存在,如果不存在,弹窗内容只用显示扩展名字、描述、请登录按钮,底部扩展版本
|
||||
- 当点击登录按钮后,先模拟登录,写死token,之后ui如下
|
||||
- 显示扩展名字、描述、一个平台选择框(通过读取config/platforms.ts)的内容for循环显示平台、扫描按钮、最底部Row(退出按钮,扩展版本号)
|
||||
- 注意:token的存储和获取逻辑放到/shared/auth.ts中去,如果涉及到接口和枚举的定义,请判断是否是全局类型
|
||||
- 如果是,该类型写到一个新文件中,并放到types/下,如果不是,放到当前模块的types/目录下(如果没用,新建)
|
||||
|
||||
2.前提:当1完成后,点击popup的立即爬取已经可以打开一个新的窗口了
|
||||
- 在所有网页(包括新打开的窗口和所有网页)的右下角都放一个圆形正计时(表示正在爬取中)
|
||||
- 点击圆形正计时时,出现一个popup,内容如下
|
||||
- 以时间轴的形式,表示当前爬取进度,即:根据platforms.ts中的steps
|
||||
- 同时点击扩展的popup里的内容,也变得和上面的时间轴内容一致,显示爬取进度,隐藏立即爬取等按钮,
|
||||
|
||||
3.前提:1和2都已完成,ui和交互操作上ok
|
||||
- 开始爬取网页中的数据,查看message.js内容,吧里面的爬取方法都提取出来放到background/domScraper.ts中去,
|
||||
- 基于2,每次根据steps打开一个新网页后,根据它的fields数组字段,调用domScraper中的方法,来提取数据,并打印到控制台即可
|
||||
@@ -15,7 +15,7 @@
|
||||
"@crxjs/vite-plugin/client",
|
||||
"chrome"
|
||||
],
|
||||
"allowImportingTsExtensions": true,
|
||||
"allowImportingTsExtensions": false,
|
||||
"allowJs": true,
|
||||
"strict": true,
|
||||
"strictNullChecks": true,
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"root":["./manifest.config.ts","./message.js","./vite.config.ts","./src/background/domscraper.ts","./src/background/index.ts","./src/background/service.ts","./src/background/types.ts","./src/background/service/crawl.ts","./src/background/service/lifecycle.ts","./src/background/service/state.ts","./src/background/service/tab.ts","./src/config/platforms.ts","./src/content/app.vue","./src/content/main.ts","./src/options/app.vue","./src/options/main.ts","./src/popup/app.vue","./src/popup/main.ts","./src/shared/auth.ts","./src/types/crawl.ts","./src/types/index.ts","./src/types/platform.ts"],"version":"5.9.3"}
|
||||
{"root":["./manifest.config.ts","./message.js","./vite.config.ts","./src/background/domscraper.ts","./src/background/index.ts","./src/background/service.ts","./src/background/types.ts","./src/background/service/crawltask.ts","./src/background/service/lifecycle.ts","./src/background/service/taskstate.ts","./src/config/platforms.ts","./src/content/app.vue","./src/content/main.ts","./src/content/pagerunner.ts","./src/options/app.vue","./src/options/main.ts","./src/popup/app.vue","./src/popup/main.ts","./src/popup/hook/use-login.ts","./src/popup/hook/use-scan.ts","./src/shared/auth.ts","./src/shared/message.ts","./src/shared/time_format.ts","./src/types/crawl.ts","./src/types/index.ts","./src/types/platform.ts"],"version":"5.9.3"}
|
||||
@@ -3,7 +3,7 @@ import {crx} from '@crxjs/vite-plugin'
|
||||
import tailwindcss from '@tailwindcss/vite'
|
||||
import vue from '@vitejs/plugin-vue'
|
||||
import {defineConfig} from 'vite'
|
||||
import manifest from './manifest.config.ts'
|
||||
import manifest from './manifest.config'
|
||||
|
||||
export default defineConfig({
|
||||
resolve: {
|
||||
|
||||
661
we.md
Normal file
661
we.md
Normal file
@@ -0,0 +1,661 @@
|
||||
# 店闪扩展执行链路说明
|
||||
|
||||
这份文档按当前项目代码整理,目的是方便顺着代码阅读整个爬取流程。
|
||||
|
||||
## 1. 扩展入口关系
|
||||
|
||||
扩展入口由 `manifest.config.ts` 配置:
|
||||
|
||||
- `action.default_popup` 指向 `src/popup/index.html`,点击浏览器插件图标后打开 popup。
|
||||
- `background.service_worker` 指向 `src/background/index.ts`,负责接收消息、创建爬取窗口、执行爬取任务。
|
||||
- `content_scripts` 指向 `src/content/main.ts`,会注入到所有 `http/https` 页面,用于右下角进度浮窗和页面 DOM 抓取。
|
||||
- `permissions` 中的 `storage`、`tabs`、`windows` 支撑状态保存、页面跳转和窗口管理。
|
||||
|
||||
## 2. 点击 popup 后发生什么
|
||||
|
||||
### 2.1 popup 初始化
|
||||
|
||||
文件:`src/popup/App.vue`
|
||||
|
||||
触发方法:
|
||||
|
||||
- `onMounted`
|
||||
- `getToken`
|
||||
- `refreshCrawlState`
|
||||
- `updateElapsedSeconds`
|
||||
|
||||
执行过程:
|
||||
|
||||
1. 用户点击扩展图标,Chrome 打开 `src/popup/index.html`。
|
||||
2. Vue 加载 `src/popup/App.vue`。
|
||||
3. `onMounted` 执行:
|
||||
- 调用 `getToken()` 读取登录状态。
|
||||
- 调用 `refreshCrawlState()` 获取当前爬取任务。
|
||||
- 启动 `setInterval`,每秒刷新爬取状态和计时。
|
||||
4. `getToken()` 来自 `src/shared/auth.ts`:
|
||||
- 优先从 `chrome.storage.local` 读取 `token`。
|
||||
- 非扩展环境下从 `localStorage` 读取。
|
||||
|
||||
### 2.2 点击“立即爬取”
|
||||
|
||||
文件:`src/popup/App.vue`
|
||||
|
||||
触发方法:
|
||||
|
||||
- `handleScan`
|
||||
- `sendBackgroundMessage`
|
||||
|
||||
执行过程:
|
||||
|
||||
1. 用户点击“立即爬取”按钮。
|
||||
2. `handleScan()` 检查当前是否选择了平台。
|
||||
3. 通过 `sendBackgroundMessage()` 发送消息给 background:
|
||||
|
||||
```ts
|
||||
{
|
||||
action: 'START_CRAWL',
|
||||
payload: { platformId: selectedPlatform.value.id },
|
||||
}
|
||||
```
|
||||
|
||||
4. `sendBackgroundMessage()` 实际调用:
|
||||
|
||||
```ts
|
||||
chrome.runtime.sendMessage(message)
|
||||
```
|
||||
|
||||
5. popup 收到 background 返回的 `CrawlTaskState` 后,更新本地的 `crawlState`,页面开始显示进度。
|
||||
|
||||
## 3. background 如何接住 popup 消息
|
||||
|
||||
### 3.1 background 消息入口
|
||||
|
||||
文件:`src/background/index.ts`
|
||||
|
||||
触发方法:
|
||||
|
||||
- `chrome.runtime.onMessage.addListener`
|
||||
- `handleBackgroundMessage`
|
||||
|
||||
执行过程:
|
||||
|
||||
1. background service worker 启动后,注册 `chrome.runtime.onMessage`。
|
||||
2. popup 发来的 `START_CRAWL` 会进入 `handleBackgroundMessage()`。
|
||||
3. `handleBackgroundMessage()` 调用:
|
||||
|
||||
```ts
|
||||
handleBackgroundCommand(message)
|
||||
```
|
||||
|
||||
4. 如果执行成功,调用 `sendResponse(result)` 把结果回给 popup。
|
||||
5. 如果执行失败,统一返回:
|
||||
|
||||
```ts
|
||||
{ ok: false, error: messageText }
|
||||
```
|
||||
|
||||
### 3.2 background 指令分发
|
||||
|
||||
文件:`src/background/service.ts`
|
||||
|
||||
作用:
|
||||
|
||||
```ts
|
||||
export {
|
||||
handleBackgroundCommand,
|
||||
handleInstalled,
|
||||
handleStartup,
|
||||
handleWindowRemoved,
|
||||
} from './service/lifecycle';
|
||||
```
|
||||
|
||||
这个文件现在只是 re-export,真正逻辑在 `src/background/service/lifecycle.ts`。
|
||||
|
||||
文件:`src/background/service/lifecycle.ts`
|
||||
|
||||
触发方法:
|
||||
|
||||
- `handleBackgroundCommand`
|
||||
|
||||
分发关系:
|
||||
|
||||
- `START_CRAWL` -> `startCrawl`
|
||||
- `GET_CRAWL_STATE` -> `getCrawlTaskState`
|
||||
- `CANCEL_CRAWL` -> `cancelCrawl`
|
||||
- `RESUME_CRAWL` -> `resumeCrawl`
|
||||
|
||||
这些消息类型定义在 `src/background/types.ts`。
|
||||
|
||||
## 4. 创建爬取任务和新窗口
|
||||
|
||||
文件:`src/background/service/crawlTask.ts`
|
||||
|
||||
触发方法:
|
||||
|
||||
- `startCrawl`
|
||||
- `createCrawlWindow`
|
||||
- `runCrawlSteps`
|
||||
|
||||
执行过程:
|
||||
|
||||
1. `startCrawl(platformId)` 先调用 `getPlatformById(platformId)`。
|
||||
2. `getPlatformById` 来自 `src/config/platforms.ts`,用于找到当前平台配置。
|
||||
3. 读取平台配置中的第一个 step:
|
||||
|
||||
```ts
|
||||
const firstStep = platform.steps[0];
|
||||
```
|
||||
|
||||
4. 创建初始任务状态 `CrawlTaskState`:
|
||||
- `id`
|
||||
- `platformId`
|
||||
- `platformName`
|
||||
- `startedAt`
|
||||
- `status: 'running'`
|
||||
- `currentStepIndex: 0`
|
||||
- `steps`
|
||||
5. 调用 `setCrawlTaskState(nextState)` 写入 `chrome.storage.local`。
|
||||
6. 调用 `createCrawlWindow(firstStep.url)` 打开新的普通浏览器窗口。
|
||||
7. 窗口创建成功后,把 `windowId` 写回任务状态。
|
||||
8. 调用:
|
||||
|
||||
```ts
|
||||
void runCrawlSteps(platform, stateWithWindow);
|
||||
```
|
||||
|
||||
这里的 `void` 表示有意不 `await` 该 Promise:`runCrawlSteps` 在后台异步继续执行,而 `startCrawl` 会先把初始状态返回给 popup。
|
||||
|
||||
## 5. 爬取状态保存在哪里
|
||||
|
||||
文件:`src/background/service/taskState.ts`
|
||||
|
||||
核心方法:
|
||||
|
||||
- `getCrawlTaskState`
|
||||
- `setCrawlTaskState`
|
||||
- `updateCrawlTaskState`
|
||||
|
||||
保存位置:
|
||||
|
||||
```ts
|
||||
chrome.storage.local
|
||||
```
|
||||
|
||||
保存 key:
|
||||
|
||||
```ts
|
||||
crawlTaskState
|
||||
```
|
||||
|
||||
当前项目没有把爬取结果单独保存到数据库、文件或独立 result key。每一步的结果直接保存在:
|
||||
|
||||
```ts
|
||||
CrawlTaskState.steps[index].result
|
||||
```
|
||||
|
||||
也就是说,popup 和 content 看到的进度、暂停信息、最终结果,都来自 `chrome.storage.local` 中的 `crawlTaskState`。
|
||||
|
||||
## 6. background 如何逐步爬取页面
|
||||
|
||||
文件:`src/background/service/crawlTask.ts`
|
||||
|
||||
核心方法:
|
||||
|
||||
- `runCrawlSteps`
|
||||
- `getWindowActiveTabId`
|
||||
- `waitForTabLoaded`
|
||||
- `scrapeStepInContent`
|
||||
- `sendPageRunnerMessage`
|
||||
- `pauseForInterrupt`
|
||||
- `waitUntilResumed`
|
||||
|
||||
执行过程:
|
||||
|
||||
1. `runCrawlSteps(platform, initialState)` 按 `platform.steps` 顺序循环。
|
||||
2. 每进入一个 step,会调用 `updateCrawlTaskState`:
|
||||
- 更新 `currentStepIndex`。
|
||||
- 把当前 step 标记为 `running`。
|
||||
- 清空当前 step 的旧 message。
|
||||
3. 调用 `getWindowActiveTabId(windowId)` 找到爬取窗口里的当前 tab。
|
||||
4. 调用:
|
||||
|
||||
```ts
|
||||
chrome.tabs.update(tabId, { url: step.url, active: true })
|
||||
```
|
||||
|
||||
跳转到当前 step 配置的页面地址。
|
||||
|
||||
5. 调用 `waitForTabLoaded(tabId)` 等待 Chrome 的 tab 状态变成 `complete`。
|
||||
6. 调用 `scrapeStepInContent(tabId, step)`,让目标页面里的 content script 开始检查页面和抓取 DOM。
|
||||
|
||||
## 7. background 怎么通知目标网页抓取
|
||||
|
||||
文件:`src/background/service/crawlTask.ts`
|
||||
|
||||
触发方法:
|
||||
|
||||
- `scrapeStepInContent`
|
||||
- `sendPageRunnerMessage`
|
||||
|
||||
发送消息:
|
||||
|
||||
```ts
|
||||
{
|
||||
action: 'SCRAPE_STEP',
|
||||
payload: {
|
||||
fields: step.fields,
|
||||
checkSelector: step.checkSelector,
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
实际调用:
|
||||
|
||||
```ts
|
||||
chrome.tabs.sendMessage(tabId, message)
|
||||
```
|
||||
|
||||
这里不是 popup 发给 background,而是 background 发给目标网页 tab 里的 content script。
|
||||
|
||||
`scrapeStepInContent` 还有一个容错逻辑:如果 content script 还没注入完成,出现类似 `Could not establish connection. Receiving end does not exist.` 的错误,会在 20 秒内每 500ms 重试一次。
|
||||
|
||||
## 8. content script 如何接住抓取消息
|
||||
|
||||
### 8.1 content script 挂载
|
||||
|
||||
文件:`src/content/main.ts`
|
||||
|
||||
触发方法:
|
||||
|
||||
- `mountApp`
|
||||
- `setupPageRunner`
|
||||
|
||||
执行过程:
|
||||
|
||||
1. 目标页面加载时,Chrome 根据 `manifest.config.ts` 注入 `src/content/main.ts`。
|
||||
2. `main.ts` 等待 DOM 可用后执行 `mountApp()`。
|
||||
3. `mountApp()` 创建 `#dianshan-crx-root` 容器。
|
||||
4. 挂载 `src/content/App.vue`,用于显示右下角爬取计时按钮和进度面板。
|
||||
5. 调用 `setupPageRunner()` 注册页面消息监听器。
|
||||
|
||||
### 8.2 页面执行器接收 SCRAPE_STEP
|
||||
|
||||
文件:`src/content/pageRunner.ts`
|
||||
|
||||
触发方法:
|
||||
|
||||
- `setupPageRunner`
|
||||
- `handlePageRunnerMessage`
|
||||
- `detectPageInterrupt`
|
||||
- `waitForStableSelector`
|
||||
- `processFields`
|
||||
|
||||
执行过程:
|
||||
|
||||
1. `setupPageRunner()` 注册:
|
||||
|
||||
```ts
|
||||
chrome.runtime.onMessage.addListener(...)
|
||||
```
|
||||
|
||||
2. background 发送 `SCRAPE_STEP` 后,进入 `handlePageRunnerMessage()`。
|
||||
3. 先调用 `detectPageInterrupt()` 判断是否遇到:
|
||||
- 登录页:`reauth`
|
||||
- 验证码或风控:`shield`
|
||||
- 页面不存在:`not_found`
|
||||
4. 如果检测到中断,返回:
|
||||
|
||||
```ts
|
||||
{ ok: false, interrupt }
|
||||
```
|
||||
|
||||
5. 如果没有中断,调用 `waitForStableSelector(checkSelector, 18000)`。
|
||||
6. `checkSelector` 来自 `src/config/platforms.ts` 的 step 配置,用于判断页面关键 DOM 是否出现并可见。
|
||||
7. 如果 18 秒内关键 DOM 仍未稳定出现,返回 `page_not_ready` 中断。
|
||||
8. 如果页面可用,调用:
|
||||
|
||||
```ts
|
||||
processFields(message.payload.fields, document.body)
|
||||
```
|
||||
|
||||
开始真正的 DOM 字段采集。
|
||||
|
||||
## 9. DOM 数据怎么提取
|
||||
|
||||
文件:`src/background/domScraper.ts`
|
||||
|
||||
虽然文件在 `background` 目录下,但它被 `src/content/pageRunner.ts` import 后,实际是在目标网页的 content script 环境里执行 DOM 查询。
|
||||
|
||||
核心方法:
|
||||
|
||||
- `processFields`
|
||||
- `processList`
|
||||
- `processTable`
|
||||
- `autoClick`
|
||||
- `extractValue`
|
||||
- `waitForElement`
|
||||
|
||||
执行逻辑:
|
||||
|
||||
1. `processFields(columns, rootDom)` 遍历当前 step 的 `fields`。
|
||||
2. 每个字段先执行 `autoClick(item, rootDom)`:
|
||||
- 如果字段配置了 `condition.list`,就按顺序点击对应选择器。
|
||||
- 每次点击后等待 `condition.time`。
|
||||
3. 再根据 `item.className` 查询元素。
|
||||
4. 普通字段:
|
||||
- 没有 `keys` 时,调用 `extractValue(element, item)`。
|
||||
- 有 `keys` 时,递归调用 `processFields(item.keys, element)`。
|
||||
5. 列表字段:
|
||||
- `type === 1` 时进入 `processList`。
|
||||
- 按 `config.className` 找到列表项。
|
||||
- 对每个列表项递归执行 `processFields(config.keys, element)`。
|
||||
- 如果配置了 `pagination`,会点击下一页继续采集。
|
||||
6. 表格字段:
|
||||
- `type === 2` 时进入 `processTable`。
|
||||
- 按 `tableParts` 找到不同 table 片段。
|
||||
- 以第一个 part 的行数为准,按行拼接不同 part 的字段。
|
||||
- 如果配置了 `pagination`,会点击下一页继续采集。
|
||||
7. `extractValue` 的取值规则:
|
||||
- 配置了 `attr` 就取指定属性。
|
||||
- `IMG` 默认取 `src`。
|
||||
- `A` 默认取 `href`,相对路径会拼上当前 origin。
|
||||
- 其他元素默认取 `textContent`。
|
||||
|
||||
## 10. 采集结果怎么回传和保存
|
||||
|
||||
数据流向:
|
||||
|
||||
1. `processFields` 返回当前 step 的 DOM 采集结果。
|
||||
2. `src/content/pageRunner.ts` 返回:
|
||||
|
||||
```ts
|
||||
{ ok: true, data }
|
||||
```
|
||||
|
||||
3. `src/background/service/crawlTask.ts` 的 `scrapeStepInContent` 接到 response。
|
||||
4. `runCrawlSteps` 调用 `updateCrawlTaskState`:
|
||||
|
||||
```ts
|
||||
steps[index].result = response.data
|
||||
steps[index].status = 'success'
|
||||
```
|
||||
|
||||
5. 最新任务状态写回 `chrome.storage.local` 的 `crawlTaskState`。
|
||||
6. popup 和 content 浮窗每秒发送 `GET_CRAWL_STATE`,读取到最新结果并展示。
|
||||
|
||||
所以当前项目的数据传递是:
|
||||
|
||||
```text
|
||||
目标网页 DOM
|
||||
-> content/pageRunner.ts
|
||||
-> background/service/crawlTask.ts
|
||||
-> chrome.storage.local:crawlTaskState
|
||||
-> popup/App.vue 和 content/App.vue 轮询展示
|
||||
```
|
||||
|
||||
## 11. popup 进度如何刷新
|
||||
|
||||
文件:`src/popup/App.vue`
|
||||
|
||||
触发方法:
|
||||
|
||||
- `refreshCrawlState`
|
||||
- `sendBackgroundMessage`
|
||||
- `updateElapsedSeconds`
|
||||
|
||||
执行过程:
|
||||
|
||||
1. popup 打开后每秒调用一次 `refreshCrawlState()`。
|
||||
2. `refreshCrawlState()` 发送:
|
||||
|
||||
```ts
|
||||
{ action: 'GET_CRAWL_STATE' }
|
||||
```
|
||||
|
||||
3. background 的 `handleBackgroundCommand` 收到后调用 `getCrawlTaskState()`。
|
||||
4. popup 拿到 `CrawlTaskState` 后:
|
||||
- 展示平台名。
|
||||
- 展示运行时间。
|
||||
- 展示每个 step 的状态。
|
||||
- 如果 step 有 `result`,用 `JSON.stringify(step.result, null, 2)` 打印出来。
|
||||
|
||||
## 12. 目标网页右下角按钮如何刷新
|
||||
|
||||
文件:`src/content/App.vue`
|
||||
|
||||
触发方法:
|
||||
|
||||
- `onMounted`
|
||||
- `refreshCrawlState`
|
||||
- `updateElapsedSeconds`
|
||||
- `handleResumeCrawl`
|
||||
|
||||
执行过程:
|
||||
|
||||
1. content script 挂载后,`src/content/App.vue` 每秒调用 `refreshCrawlState()`。
|
||||
2. 它同样发送:
|
||||
|
||||
```ts
|
||||
{ action: 'GET_CRAWL_STATE' }
|
||||
```
|
||||
|
||||
3. 如果任务状态是 `running` 或 `paused`,显示右下角计时按钮。
|
||||
4. 点击按钮会展开进度面板。
|
||||
5. 如果任务是 `paused`,面板里显示暂停原因,并提供“我已处理,继续”按钮。
|
||||
6. 点击继续时,调用 `handleResumeCrawl()`,发送:
|
||||
|
||||
```ts
|
||||
{ action: 'RESUME_CRAWL' }
|
||||
```
|
||||
|
||||
## 13. 暂停和继续流程
|
||||
|
||||
触发原因主要来自 `src/content/pageRunner.ts`:
|
||||
|
||||
- `isLoginPage()` 检测到登录页。
|
||||
- `isShieldPage()` 检测到验证码或风控。
|
||||
- `isNotFoundPage()` 检测到页面不存在。
|
||||
- `waitForStableSelector()` 超时,认为页面关键内容没准备好。
|
||||
|
||||
流程:
|
||||
|
||||
1. content 返回:
|
||||
|
||||
```ts
|
||||
{ ok: false, interrupt }
|
||||
```
|
||||
|
||||
2. background 的 `runCrawlSteps` 检测到 `response.interrupt`。
|
||||
3. 调用 `pauseForInterrupt(taskId, stepIndex, interrupt)`。
|
||||
4. `pauseForInterrupt` 把状态写成:
|
||||
- `status: 'paused'`
|
||||
- `pause: interrupt`
|
||||
- 当前 step 保持 `running`
|
||||
- 当前 step 的 `message` 设置为中断提示
|
||||
5. popup 和 content 每秒轮询到 `paused` 状态后展示提示。
|
||||
6. 用户处理完登录或验证码后,点击继续。
|
||||
7. popup 或 content 发送 `RESUME_CRAWL`。
|
||||
8. background 调用 `resumeCrawl()`:
|
||||
- `status` 改回 `running`
|
||||
- 清空 `pause`
|
||||
- 当前 step 的 `message` 清空
|
||||
9. `runCrawlSteps` 中的 `waitUntilResumed()` 发现状态恢复为 `running`,重新执行当前 step。
|
||||
|
||||
## 14. 取消和关闭窗口流程
|
||||
|
||||
### 14.1 用户点击取消
|
||||
|
||||
文件:
|
||||
|
||||
- `src/popup/App.vue`
|
||||
- `src/background/service/lifecycle.ts`
|
||||
- `src/background/service/crawlTask.ts`
|
||||
|
||||
方法:
|
||||
|
||||
- `handleCancelCrawl`
|
||||
- `handleBackgroundCommand`
|
||||
- `cancelCrawl`
|
||||
|
||||
流程:
|
||||
|
||||
1. popup 发送:
|
||||
|
||||
```ts
|
||||
{ action: 'CANCEL_CRAWL' }
|
||||
```
|
||||
|
||||
2. background 分发到 `cancelCrawl()`。
|
||||
3. `cancelCrawl()` 把当前任务状态改成 `canceled`。
|
||||
4. 当前 step 被标记为 `failed`,message 为 `用户已取消`。
|
||||
5. 如果存在 `windowId`,调用 `chrome.windows.remove(windowId)` 关闭爬取窗口。
|
||||
|
||||
### 14.2 用户直接关闭爬取窗口
|
||||
|
||||
文件:
|
||||
|
||||
- `src/background/index.ts`
|
||||
- `src/background/service/lifecycle.ts`
|
||||
- `src/background/service/crawlTask.ts`
|
||||
|
||||
方法:
|
||||
|
||||
- `chrome.windows.onRemoved.addListener`
|
||||
- `handleWindowRemoved`
|
||||
- `cancelCrawlWhenWindowRemoved`
|
||||
|
||||
流程:
|
||||
|
||||
1. Chrome 触发 `windows.onRemoved`。
|
||||
2. `handleWindowRemoved(windowId)` 被调用。
|
||||
3. `cancelCrawlWhenWindowRemoved(windowId)` 检查关闭的是否是当前爬取窗口。
|
||||
4. 如果匹配且任务仍处于 `running` 状态,就把任务改成 `canceled`。
|
||||
5. 当前 step 标记为 `failed`,message 为 `爬取窗口已关闭`。
|
||||
|
||||
## 15. 平台配置如何驱动爬取
|
||||
|
||||
文件:`src/config/platforms.ts`
|
||||
|
||||
核心导出:
|
||||
|
||||
- `PLATFORM_CONFIGS`
|
||||
- `getPlatformById`
|
||||
|
||||
平台配置结构来自 `src/types/platform.ts`:
|
||||
|
||||
- `PlatformConfig`
|
||||
- `PlatformStepConfig`
|
||||
- `PlatformFieldConfig`
|
||||
- `PlatformPaginationConfig`
|
||||
- `PlatformTablePartConfig`
|
||||
- `PlatformClickCondition`
|
||||
|
||||
关键字段:
|
||||
|
||||
- `steps`:平台要按顺序爬取的页面。
|
||||
- `step.url`:当前步骤要打开的页面地址。
|
||||
- `step.checkSelector`:判断页面是否可抓取的关键元素。
|
||||
- `step.fields`:当前页面要采集的字段。
|
||||
- `field.className`:字段选择器。
|
||||
- `field.keys`:子字段,支持递归。
|
||||
- `field.type`:字段类型,默认普通字段,`1` 是列表,`2` 是表格。
|
||||
- `field.condition`:采集字段前要自动点击的元素。
|
||||
- `field.pagination`:列表或表格翻页配置。
|
||||
- `field.tableParts`:表格分段配置,用来拼接多个 table 片段。
|
||||
|
||||
## 16. 类型结构在哪里看
|
||||
|
||||
主要类型文件:
|
||||
|
||||
- `src/background/types.ts`:background 接收的消息类型和统一响应类型。
|
||||
- `src/types/crawl.ts`:爬取任务状态、步骤状态、暂停原因。
|
||||
- `src/types/platform.ts`:平台配置、字段配置、分页配置、表格配置。
|
||||
- `src/types/index.ts`:统一导出类型。
|
||||
|
||||
最关键的运行时状态是 `CrawlTaskState`:
|
||||
|
||||
```ts
|
||||
interface CrawlTaskState {
|
||||
id: string;
|
||||
platformId: string;
|
||||
platformName: string;
|
||||
windowId?: number;
|
||||
startedAt: number;
|
||||
status: 'running' | 'paused' | 'completed' | 'failed' | 'canceled';
|
||||
pause?: CrawlPauseInfo;
|
||||
currentStepIndex: number;
|
||||
steps: CrawlProgressStep[];
|
||||
}
|
||||
```
|
||||
|
||||
每个 step 的结果保存在:
|
||||
|
||||
```ts
|
||||
interface CrawlProgressStep {
|
||||
name: string;
|
||||
uniqueKey: string;
|
||||
status: 'pending' | 'running' | 'success' | 'failed';
|
||||
message?: string;
|
||||
result?: unknown;
|
||||
}
|
||||
```
|
||||
|
||||
## 17. 整体数据流图
|
||||
|
||||
```text
|
||||
用户点击插件图标
|
||||
-> src/popup/App.vue:onMounted
|
||||
-> 读取 token 和 crawlTaskState
|
||||
|
||||
用户点击立即爬取
|
||||
-> src/popup/App.vue:handleScan
|
||||
-> chrome.runtime.sendMessage({ action: 'START_CRAWL' })
|
||||
|
||||
background 接收消息
|
||||
-> src/background/index.ts:handleBackgroundMessage
|
||||
-> src/background/service/lifecycle.ts:handleBackgroundCommand
|
||||
-> src/background/service/crawlTask.ts:startCrawl
|
||||
|
||||
创建任务和窗口
|
||||
-> src/background/service/taskState.ts:setCrawlTaskState
|
||||
-> src/background/service/crawlTask.ts:createCrawlWindow
|
||||
-> src/background/service/crawlTask.ts:runCrawlSteps
|
||||
|
||||
逐个页面跳转
|
||||
-> chrome.tabs.update(tabId, { url: step.url })
|
||||
-> src/background/service/crawlTask.ts:waitForTabLoaded
|
||||
-> src/background/service/crawlTask.ts:scrapeStepInContent
|
||||
|
||||
目标页面执行抓取
|
||||
-> chrome.tabs.sendMessage(tabId, { action: 'SCRAPE_STEP' })
|
||||
-> src/content/pageRunner.ts:handlePageRunnerMessage
|
||||
-> src/content/pageRunner.ts:waitForStableSelector
|
||||
-> src/background/domScraper.ts:processFields
|
||||
|
||||
结果回到 background
|
||||
-> src/background/service/crawlTask.ts:runCrawlSteps
|
||||
-> src/background/service/taskState.ts:updateCrawlTaskState
|
||||
-> 写入 chrome.storage.local:crawlTaskState
|
||||
|
||||
多端展示
|
||||
-> src/popup/App.vue 每秒 GET_CRAWL_STATE
|
||||
-> src/content/App.vue 每秒 GET_CRAWL_STATE
|
||||
-> popup 展示完整进度和结果
|
||||
-> 目标网页右下角展示计时按钮和进度面板
|
||||
```
|
||||
|
||||
## 18. 推荐阅读代码顺序
|
||||
|
||||
如果要按最顺的方式读代码,可以这样看:
|
||||
|
||||
1. `manifest.config.ts`:先看扩展入口。
|
||||
2. `src/popup/App.vue`:看用户点按钮时发什么消息。
|
||||
3. `src/background/index.ts`:看 background 怎么接消息。
|
||||
4. `src/background/service/lifecycle.ts`:看消息怎么分发。
|
||||
5. `src/background/service/crawlTask.ts`:看任务创建、窗口打开、页面跳转、暂停恢复。
|
||||
6. `src/background/service/taskState.ts`:看状态怎么保存。
|
||||
7. `src/content/main.ts`:看 content script 怎么挂载。
|
||||
8. `src/content/pageRunner.ts`:看目标页面怎么接收 background 抓取指令。
|
||||
9. `src/background/domScraper.ts`:看 DOM 字段怎么递归提取。
|
||||
10. `src/config/platforms.ts`:结合抓取逻辑看配置如何控制页面和字段。
|
||||
11. `src/types/crawl.ts`、`src/types/platform.ts`:最后看数据结构。
|
||||
Reference in New Issue
Block a user