diff --git a/.gitignore b/.gitignore index a547bf3..65d9b64 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ lerna-debug.log* node_modules dist +storeai-extension-v0.1.0 dist-ssr *.local diff --git a/README.md b/README.md index 2d0b0e1..801669a 100644 --- a/README.md +++ b/README.md @@ -19,5 +19,3 @@ 7.在窗口中记得显示一个取消按钮,点击后关闭窗口,取消爬取 -# 具体代码实现流程 -请阅读./step.md文档,并严格按照步骤进行执行 \ No newline at end of file diff --git a/src/background/domScraper.ts b/src/background/domScraper.ts index e558c87..182423e 100644 --- a/src/background/domScraper.ts +++ b/src/background/domScraper.ts @@ -69,7 +69,7 @@ async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Prom /** * 递归处理字段配置,支持普通字段、嵌套 row、列表和表格。 */ -async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise { +export async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise { const result: DomScrapeResult = {}; for (const item of columns) { diff --git a/src/background/service/crawlTask.ts b/src/background/service/crawlTask.ts index 5b0f1f0..cf6ed73 100644 --- a/src/background/service/crawlTask.ts +++ b/src/background/service/crawlTask.ts @@ -1,9 +1,16 @@ import { getPlatformById } from '@/config/platforms'; -import type { CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types'; -import { scrapeDomFields, type DomScrapeResult } from '../domScraper'; +import type { CrawlPauseInfo, CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types'; +import type { DomScrapeResult } from '../domScraper'; import type { CrawlStateResponse } from '../types'; import { getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState } from './taskState'; +interface PageRunnerResponse { + ok: boolean; + data?: DomScrapeResult | null; + interrupt?: CrawlPauseInfo; + error?: string; +} + /** * 创建新的爬取任务,打开目标平台窗口,并把初始时间轴状态写入 storage。 */ @@ -86,6 +93,29 @@ export async function cancelCrawl(): Promise { return { ok: true, data: canceledState }; } +/** + * 用户处理完登录、验证码或风控后,恢复当前暂停中的爬取任务。 + */ +export async function resumeCrawl(): Promise { + const state = await getCrawlTaskState(); + + if (!state || state.status !== 'paused') { + return { ok: true, data: state }; + } + + const resumedState: CrawlTaskState = { + ...state, + status: 'running', + pause: undefined, + steps: state.steps.map((step, index) => + index === state.currentStepIndex ? { ...step, status: 'running', message: undefined } : step, + ), + }; + + await setCrawlTaskState(resumedState); + return { ok: true, data: resumedState }; +} + /** * 窗口关闭后,如果关闭的是爬取窗口,就把当前任务标记为取消。 */ @@ -114,53 +144,81 @@ async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskSt } try { - const tabId = await getWindowActiveTabId(initialState.windowId); - for (let stepIndex = 0; stepIndex < platform.steps.length; stepIndex += 1) { const step = platform.steps[stepIndex]; - const currentState = await getCrawlTaskState(); + let shouldRetryStep = true; - if (currentState?.id !== initialState.id || currentState.status !== 'running') { - return; - } + while (shouldRetryStep) { + const currentState = await getCrawlTaskState(); - await updateCrawlTaskState(initialState.id, (state) => ({ - ...state, - currentStepIndex: stepIndex, - status: 'running', - steps: state.steps.map((item, index) => ({ - ...item, - status: index === stepIndex ? 'running' : item.status, - message: index === stepIndex ? undefined : item.message, - })), - })); + if (currentState?.id !== initialState.id || currentState.status === 'canceled') { + return; + } - await chrome.tabs.update(tabId, { url: step.url, active: true }); - await waitForTabLoaded(tabId); + if (currentState.status === 'paused') { + const resumed = await waitUntilResumed(initialState.id); - const isReady = await waitForStepReady(tabId, step); + if (!resumed) { + return; + } + } - if (!isReady) { await updateCrawlTaskState(initialState.id, (state) => ({ ...state, - status: 'failed', currentStepIndex: stepIndex, + status: 'running', + pause: undefined, + steps: state.steps.map((item, index) => ({ + ...item, + status: index === stepIndex ? 'running' : item.status, + message: index === stepIndex ? undefined : item.message, + })), + })); + + const tabId = await getWindowActiveTabId(initialState.windowId); + await chrome.tabs.update(tabId, { url: step.url, active: true }); + await waitForTabLoaded(tabId); + + const response = await scrapeStepInContent(tabId, step); + + if (response.interrupt) { + await pauseForInterrupt(initialState.id, stepIndex, response.interrupt); + const resumed = await waitUntilResumed(initialState.id); + + if (!resumed) { + return; + } + + continue; + } + + if (!response.ok) { + const message = response.error ?? '页面抓取失败'; + + await updateCrawlTaskState(initialState.id, (state) => ({ + ...state, + status: 'failed', + currentStepIndex: stepIndex, + steps: state.steps.map((item, index) => + index === stepIndex ? { ...item, status: 'failed', message } : item, + ), + })); + return; + } + + console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, response.data); + + await updateCrawlTaskState(initialState.id, (state) => ({ + ...state, steps: state.steps.map((item, index) => - index === stepIndex ? { ...item, status: 'failed', message: '页面关键 DOM 未加载完成' } : item, + index === stepIndex + ? { ...item, status: 'success', message: undefined, result: response.data } + : item, ), })); - return; + + shouldRetryStep = false; } - - const data = await scrapeStepFields(tabId, step); - console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, data); - - await updateCrawlTaskState(initialState.id, (state) => ({ - ...state, - steps: state.steps.map((item, index) => - index === stepIndex ? { ...item, status: 'success', message: undefined } : item, - ), - })); } await updateCrawlTaskState(initialState.id, (state) => ({ @@ -198,55 +256,90 @@ async function getWindowActiveTabId(windowId: number): Promise { } /** - * 等待步骤配置中的 checkSelector 出现;第一次超时后刷新页面再重试一次。 + * 让 content script 直接在目标页面执行检查和抓取。 */ -async function waitForStepReady(tabId: number, step: PlatformStepConfig): Promise { - if (await waitForSelector(tabId, step.checkSelector, 5000)) { - return true; +async function scrapeStepInContent(tabId: number, step: PlatformStepConfig): Promise { + const startedAt = Date.now(); + + while (Date.now() - startedAt < 20000) { + const response = await sendPageRunnerMessage(tabId, { + action: 'SCRAPE_STEP', + payload: { + fields: step.fields, + checkSelector: step.checkSelector, + }, + }); + + if (response.ok || response.interrupt || !isPageRunnerNotReadyError(response.error)) { + return response; + } + + await sleep(500); } - await chrome.tabs.reload(tabId); - await waitForTabLoaded(tabId); - - return waitForSelector(tabId, step.checkSelector, 5000); + return { ok: false, error: '页面脚本未响应,请刷新扩展后重试' }; } /** - * 在目标页面轮询检查指定 selector 是否存在。 + * 给目标页的 content script 发送页面执行消息。 */ -async function waitForSelector(tabId: number, selector: string, timeoutMs: number): Promise { - const startedAt = Date.now(); +async function sendPageRunnerMessage(tabId: number, message: unknown): Promise { + try { + const response = await chrome.tabs.sendMessage(tabId, message); - while (Date.now() - startedAt < timeoutMs) { - const results = await chrome.scripting.executeScript({ - target: { tabId }, - func: (targetSelector: string) => Boolean(document.querySelector(targetSelector)), - args: [selector], - }); + if (response && typeof response === 'object') { + return response as PageRunnerResponse; + } - if (Boolean(results[0]?.result)) { + return { ok: false, error: '页面脚本返回为空' }; + } catch (error: unknown) { + return { ok: false, error: error instanceof Error ? error.message : String(error) }; + } +} + +/** + * 判断错误是否只是 content script 尚未注入完成。 + */ +function isPageRunnerNotReadyError(error?: string): boolean { + if (!error) { + return false; + } + + return /receiving end does not exist|could not establish connection|no receiving end/i.test(error); +} + +/** + * 因登录、验证码或页面异常暂停当前任务。 + */ +async function pauseForInterrupt(taskId: string, stepIndex: number, interrupt: CrawlPauseInfo): Promise { + await updateCrawlTaskState(taskId, (state) => ({ + ...state, + status: 'paused', + pause: interrupt, + currentStepIndex: stepIndex, + steps: state.steps.map((step, index) => + index === stepIndex ? { ...step, status: 'running', message: interrupt.message } : step, + ), + })); +} + +/** + * 暂停后等待用户点继续或取消。 + */ +async function waitUntilResumed(taskId: string): Promise { + while (true) { + const state = await getCrawlTaskState(); + + if (!state || state.id !== taskId || state.status === 'canceled' || state.status === 'failed') { + return false; + } + + if (state.status === 'running') { return true; } - await new Promise((resolve) => { - globalThis.setTimeout(resolve, 500); - }); + await sleep(1000); } - - return false; -} - -/** - * 注入 domScraper 到目标页面,并根据当前 step.fields 提取页面数据。 - */ -async function scrapeStepFields(tabId: number, step: PlatformStepConfig): Promise { - const results = await chrome.scripting.executeScript({ - target: { tabId }, - func: scrapeDomFields, - args: [step.fields], - }); - - return results[0]?.result ?? null; } /** @@ -257,7 +350,7 @@ function createCrawlWindow(url: string): Promise { chrome.windows.create( { url, - type: 'popup', + type: 'normal', focused: true, width: 1280, height: 900, @@ -302,3 +395,12 @@ function waitForTabLoaded(tabId: number): Promise { chrome.tabs.onUpdated.addListener(handleUpdated); }); } + +/** + * 简单等待工具。 + */ +function sleep(ms: number): Promise { + return new Promise((resolve) => { + globalThis.setTimeout(resolve, ms); + }); +} diff --git a/src/background/service/lifecycle.ts b/src/background/service/lifecycle.ts index 55bda5f..e62b373 100644 --- a/src/background/service/lifecycle.ts +++ b/src/background/service/lifecycle.ts @@ -1,5 +1,5 @@ import type { BackgroundCommand, BackgroundResponse, CrawlStateResponse } from '../types'; -import { cancelCrawl, cancelCrawlWhenWindowRemoved, startCrawl } from './crawlTask'; +import { cancelCrawl, cancelCrawlWhenWindowRemoved, resumeCrawl, startCrawl } from './crawlTask'; import { getCrawlTaskState } from './taskState'; /** @@ -37,6 +37,8 @@ export async function handleBackgroundCommand( return { ok: true, data: await getCrawlTaskState() }; case 'CANCEL_CRAWL': return cancelCrawl(); + case 'RESUME_CRAWL': + return resumeCrawl(); default: return { ok: false, error: '未知的后台指令' }; } diff --git a/src/background/types.ts b/src/background/types.ts index 0452880..097308c 100644 --- a/src/background/types.ts +++ b/src/background/types.ts @@ -23,8 +23,14 @@ export interface CancelCrawlCommand { action: 'CANCEL_CRAWL'; } +// 继续当前暂停中的爬取任务。 +export interface ResumeCrawlCommand { + // 消息动作类型:用户已处理登录/验证码,允许 background 继续重试当前步骤。 + action: 'RESUME_CRAWL'; +} + // popup/content script 能发送给 background 的全部消息类型。 -export type BackgroundCommand = StartCrawlCommand | GetCrawlStateCommand | CancelCrawlCommand; +export type BackgroundCommand = StartCrawlCommand | GetCrawlStateCommand | CancelCrawlCommand | ResumeCrawlCommand; // background 统一响应结构。 export interface BackgroundResponse { diff --git a/src/config/platforms.ts b/src/config/platforms.ts index b221b4f..f60bc0f 100644 --- a/src/config/platforms.ts +++ b/src/config/platforms.ts @@ -10,7 +10,7 @@ export const PLATFORM_CONFIGS: PlatformConfig[] = [ name: '数据看板', uniqueKey: 'databoard', url: 'https://seller.shopee.com.my/', - checkSelector: '.rate-manager-content', + checkSelector: '.page-container', fields: [ { label: "出货统计", @@ -119,6 +119,191 @@ export const PLATFORM_CONFIGS: PlatformConfig[] = [ }, ], }, + { + name: "广告中心", + uniqueKey: "adscenter", + url: "https://seller.shopee.com.my/portal/marketing/pas/index", + checkSelector: '.page-container', + fields: [ + { + label: "我的账户", + className: ".my-account-wrap", + keys: [ + { + label: "广告余额", + className: ".credit-expense-label-wrapper:nth-child(1) .ellipsis-content" + }, + { + label: "今日广告花费", + className: ".credit-expense-label-wrapper:nth-child(2) .ellipsis-content" + }, + + ] + }, + { + label: "进行中广告列表", + className: ".eds-table__body-container", + type: 2, + tableParts: [ + { name: "fixed", select: ".eds-table__fix-body" }, + { name: "main", select: ".eds-table__main-body" } + ], + keys: [ + { + label: "广告信息", + className: ".info-containter", + part: "fixed", + keys: [ + { + label: "广告名称", + className: ".campaign-name-container" + }, + { + label: "广告类型", + className: ".gmv-max-noti" + }, + { + label: "结束时间", + className: ".time-edit-wrapper" + } + ] + }, + { + label: "每日预算", + part: "main", + className: "td:nth-child(1)" + }, + { + label: "目标ROAS", + part: "main", + className: "td:nth-child(2)" + }, + { + label: "花费", + part: "main", + className: "td:nth-child(4)" + }, + { + label: "销售额", + part: "main", + className: "td:nth-child(5)" + }, + { + label: "广告支出回报率", + part: "main", + className: "td:nth-child(6)" + } + ], + pagination: { + nextBtn: ".eds-pager__button-next", // 下一页按钮 + disabledClass: ".eds-button--disabled", // 按钮禁用时的class(用来判断结束) + maxPage: 1, // 最大爬取页数 + delay: 2000 // 翻页后的等待加载时间 + }, + + } + ] + }, + { + name: "评论管理", + uniqueKey: "message", + url: "https://seller.shopee.com.my/portal/settings/shop/rating", + checkSelector: '.page-container', + fields: [ + { + label: "低星评论", + className: ".border-solid.rounded", + condition: { + list: [ + ".flex.items-center.mt-6 div:nth-child(3)", + ".eds-react-checkbox-group label:nth-child(2)", + ".eds-react-checkbox-group label:nth-child(3)", + ".eds-react-checkbox-group label:nth-child(4)" + ], + time: 200, + }, + type: 1, + keys: [ + { + label: "用户", + className: ".flex.items-center.justify-start .ml-2" + }, + { + label: "订单编号", + className: ".underline.px-1" + }, + { + label: "商品名称", + className: ".min-w-0.font-medium.break-all" + }, + { + label: "规格", + className: ".min-w-0.font-medium.break-all + div" + }, + { + label: "评价内容", + className: ".min-w-0.overflow-hidden", + condition: { + list: [ + "span.cursor-pointer" + ], + time: 200, + }, + + }, + ], + pagination: { + nextBtn: ".eds-react-pagination-pager__button-next", + maxPage: 2, // 最大爬取页数 + delay: 2000 // 翻页后的等待加载时间 + }, + + }, + ] + }, + { + name: "账户健康状态", + uniqueKey: "accounthealth", + url: "https://seller.shopee.com.my/portal/accounthealth/home", + checkSelector: '.page-container', + fields: [ + { + label: "健康状态", + className: ".metric-content", + type: 1, + keys: [ + { + label: "模块名", + className: ".metric-type" + }, + { + label: "值", + className: ".metric-item", + type: 1, + keys: [ + { + label: "指标", + className: "p.metric-text" + }, + { + label: "值", + className: ".metric-my" + }, + { + label: "目标", + className: ".metric-target" + }, + { + label: "使用类型", + className: ".metric-applied-to" + }, + ] + }, + ], + + }, + ] + } ], }, ] diff --git a/src/content/App.vue b/src/content/App.vue index d3c797f..6611a8e 100644 --- a/src/content/App.vue +++ b/src/content/App.vue @@ -12,7 +12,7 @@ const isPanelOpen = ref(false); let timer: number | undefined; // 只有任务处于运行中时,才在网页右下角展示计时按钮。 -const isVisible = computed(() => crawlState.value?.status === 'running'); +const isVisible = computed(() => crawlState.value ? ['running', 'paused'].includes(crawlState.value.status) : false); // 内容脚本挂载后立即同步一次状态,并开始每秒刷新计时和任务进度。 onMounted(() => { @@ -85,6 +85,14 @@ function getStepText(status: string): string { return textMap[status] ?? status; } +/** + * 请求 background 继续暂停中的爬取任务。 + */ +async function handleResumeCrawl() { + await sendBackgroundMessage({ action: 'RESUME_CRAWL' }); + await refreshCrawlState(); +} + /** * 发送消息到 background;非扩展环境下返回空成功响应,方便本地页面不报错。 */ @@ -114,15 +122,20 @@ function sendBackgroundMessage(message: unknown): Promise<{ ok: boolean; data
  1. -
    - {{ index + 1 }}. {{ step.name }} - {{ getStepText(step.status) }} - {{ step.message }} -
    -
  2. -
- - +
+ {{ index + 1 }}. {{ step.name }} + {{ getStepText(step.status) }} + {{ step.message }} +
+ + + +
+

{{ crawlState.pause.message }}

+ +
+ +