This commit is contained in:
zhu
2026-05-06 14:04:05 +08:00
parent d78d70bde0
commit 40df507300
17 changed files with 691 additions and 163 deletions

1
.gitignore vendored
View File

@@ -9,6 +9,7 @@ lerna-debug.log*
node_modules
dist
storeai-extension-v0.1.0
dist-ssr
*.local

View File

@@ -19,5 +19,3 @@
7.在窗口中记得显示一个取消按钮,点击后关闭窗口,取消爬取
# 具体代码实现流程
请阅读./step.md文档并严格按照步骤进行执行

View File

@@ -69,7 +69,7 @@ async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Prom
/**
* 递归处理字段配置,支持普通字段、嵌套 row、列表和表格。
*/
async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
export async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
const result: DomScrapeResult = {};
for (const item of columns) {

View File

@@ -1,9 +1,16 @@
import { getPlatformById } from '@/config/platforms';
import type { CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
import { scrapeDomFields, type DomScrapeResult } from '../domScraper';
import type { CrawlPauseInfo, CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
import type { DomScrapeResult } from '../domScraper';
import type { CrawlStateResponse } from '../types';
import { getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState } from './taskState';
// Response returned by the content-script page runner to a message sent from the background.
interface PageRunnerResponse {
// Whether the page runner handled the request successfully.
ok: boolean;
// Data scraped from the page; stored as the step result when the step succeeds.
data?: DomScrapeResult | null;
// Set when the page needs manual handling; the crawl is paused with this info.
interrupt?: CrawlPauseInfo;
// Failure description; used as the step's failure message when the request did not succeed.
error?: string;
}
/**
* 创建新的爬取任务,打开目标平台窗口,并把初始时间轴状态写入 storage。
*/
@@ -86,6 +93,29 @@ export async function cancelCrawl(): Promise<CrawlStateResponse> {
return { ok: true, data: canceledState };
}
/**
 * Resumes the crawl task that is currently paused, once the user has dealt
 * with the login prompt, captcha or risk-control check.
 *
 * When there is no stored task, or the stored task is not paused, nothing is
 * written and the stored state is returned as-is.
 *
 * @returns A successful response carrying the task state after the call.
 */
export async function resumeCrawl(): Promise<CrawlStateResponse> {
  const current = await getCrawlTaskState();
  if (current?.status !== 'paused') {
    return { ok: true, data: current };
  }

  // Only the step that was interrupted goes back to "running"; its pause hint is cleared.
  const activeIndex = current.currentStepIndex;
  const steps: CrawlTaskState['steps'] = current.steps.map((entry, position) => {
    if (position !== activeIndex) {
      return entry;
    }
    return { ...entry, status: 'running', message: undefined };
  });

  const resumedState: CrawlTaskState = {
    ...current,
    status: 'running',
    pause: undefined,
    steps,
  };

  await setCrawlTaskState(resumedState);
  return { ok: true, data: resumedState };
}
/**
* 窗口关闭后,如果关闭的是爬取窗口,就把当前任务标记为取消。
*/
@@ -114,20 +144,30 @@ async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskSt
}
try {
const tabId = await getWindowActiveTabId(initialState.windowId);
for (let stepIndex = 0; stepIndex < platform.steps.length; stepIndex += 1) {
const step = platform.steps[stepIndex];
let shouldRetryStep = true;
while (shouldRetryStep) {
const currentState = await getCrawlTaskState();
if (currentState?.id !== initialState.id || currentState.status !== 'running') {
if (currentState?.id !== initialState.id || currentState.status === 'canceled') {
return;
}
if (currentState.status === 'paused') {
const resumed = await waitUntilResumed(initialState.id);
if (!resumed) {
return;
}
}
await updateCrawlTaskState(initialState.id, (state) => ({
...state,
currentStepIndex: stepIndex,
status: 'running',
pause: undefined,
steps: state.steps.map((item, index) => ({
...item,
status: index === stepIndex ? 'running' : item.status,
@@ -135,32 +175,50 @@ async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskSt
})),
}));
const tabId = await getWindowActiveTabId(initialState.windowId);
await chrome.tabs.update(tabId, { url: step.url, active: true });
await waitForTabLoaded(tabId);
const isReady = await waitForStepReady(tabId, step);
const response = await scrapeStepInContent(tabId, step);
if (response.interrupt) {
await pauseForInterrupt(initialState.id, stepIndex, response.interrupt);
const resumed = await waitUntilResumed(initialState.id);
if (!resumed) {
return;
}
continue;
}
if (!response.ok) {
const message = response.error ?? '页面抓取失败';
if (!isReady) {
await updateCrawlTaskState(initialState.id, (state) => ({
...state,
status: 'failed',
currentStepIndex: stepIndex,
steps: state.steps.map((item, index) =>
index === stepIndex ? { ...item, status: 'failed', message: '页面关键 DOM 未加载完成' } : item,
index === stepIndex ? { ...item, status: 'failed', message } : item,
),
}));
return;
}
const data = await scrapeStepFields(tabId, step);
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, data);
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, response.data);
await updateCrawlTaskState(initialState.id, (state) => ({
...state,
steps: state.steps.map((item, index) =>
index === stepIndex ? { ...item, status: 'success', message: undefined } : item,
index === stepIndex
? { ...item, status: 'success', message: undefined, result: response.data }
: item,
),
}));
shouldRetryStep = false;
}
}
await updateCrawlTaskState(initialState.id, (state) => ({
@@ -198,55 +256,90 @@ async function getWindowActiveTabId(windowId: number): Promise<number> {
}
/**
* 等待步骤配置中的 checkSelector 出现;第一次超时后刷新页面再重试一次
* 让 content script 直接在目标页面执行检查和抓取
*/
async function waitForStepReady(tabId: number, step: PlatformStepConfig): Promise<boolean> {
if (await waitForSelector(tabId, step.checkSelector, 5000)) {
return true;
}
await chrome.tabs.reload(tabId);
await waitForTabLoaded(tabId);
return waitForSelector(tabId, step.checkSelector, 5000);
}
/**
* 在目标页面轮询检查指定 selector 是否存在。
*/
async function waitForSelector(tabId: number, selector: string, timeoutMs: number): Promise<boolean> {
async function scrapeStepInContent(tabId: number, step: PlatformStepConfig): Promise<PageRunnerResponse> {
const startedAt = Date.now();
while (Date.now() - startedAt < timeoutMs) {
const results = await chrome.scripting.executeScript({
target: { tabId },
func: (targetSelector: string) => Boolean(document.querySelector(targetSelector)),
args: [selector],
while (Date.now() - startedAt < 20000) {
const response = await sendPageRunnerMessage(tabId, {
action: 'SCRAPE_STEP',
payload: {
fields: step.fields,
checkSelector: step.checkSelector,
},
});
if (Boolean(results[0]?.result)) {
return true;
if (response.ok || response.interrupt || !isPageRunnerNotReadyError(response.error)) {
return response;
}
await new Promise((resolve) => {
globalThis.setTimeout(resolve, 500);
});
await sleep(500);
}
return false;
return { ok: false, error: '页面脚本未响应,请刷新扩展后重试' };
}
/**
* 注入 domScraper 到目标页面,并根据当前 step.fields 提取页面数据
* 给目标页的 content script 发送页面执行消息
*/
async function scrapeStepFields(tabId: number, step: PlatformStepConfig): Promise<DomScrapeResult | null> {
const results = await chrome.scripting.executeScript({
target: { tabId },
func: scrapeDomFields,
args: [step.fields],
});
async function sendPageRunnerMessage(tabId: number, message: unknown): Promise<PageRunnerResponse> {
try {
const response = await chrome.tabs.sendMessage(tabId, message);
return results[0]?.result ?? null;
if (response && typeof response === 'object') {
return response as PageRunnerResponse;
}
return { ok: false, error: '页面脚本返回为空' };
} catch (error: unknown) {
return { ok: false, error: error instanceof Error ? error.message : String(error) };
}
}
/**
 * Tells whether an error message only means that the content script has not
 * finished being injected yet.
 *
 * @param error Error text reported by the message send; may be absent.
 * @returns `true` when the text matches one of the known "no receiver" messages.
 */
function isPageRunnerNotReadyError(error?: string): boolean {
  const notReadyPattern = /receiving end does not exist|could not establish connection|no receiving end/i;
  return error ? notReadyPattern.test(error) : false;
}
/**
 * Marks the task as paused because the page needs manual handling
 * (login, captcha, or an abnormal page).
 *
 * The interrupted step keeps the "running" status and shows the interrupt
 * message as its hint.
 *
 * @param taskId Id of the task to update.
 * @param stepIndex Index of the step that was interrupted.
 * @param interrupt Pause reason and the hint shown to the user.
 */
async function pauseForInterrupt(taskId: string, stepIndex: number, interrupt: CrawlPauseInfo): Promise<void> {
  await updateCrawlTaskState(taskId, (state) => {
    return {
      ...state,
      status: 'paused',
      pause: interrupt,
      currentStepIndex: stepIndex,
      steps: state.steps.map((entry, position) => {
        if (position !== stepIndex) {
          return entry;
        }
        return { ...entry, status: 'running', message: interrupt.message };
      }),
    };
  });
}
/**
 * After a pause, polls the stored task state once per second until the user
 * resumes or the task can no longer continue.
 *
 * @param taskId Id of the task that is waiting.
 * @returns `true` once the task is back in the "running" status; `false` when
 *   the stored task is missing, belongs to another task id, or has reached any
 *   status other than "paused" or "running".
 */
async function waitUntilResumed(taskId: string): Promise<boolean> {
  while (true) {
    const state = await getCrawlTaskState();
    if (!state || state.id !== taskId) {
      return false;
    }

    if (state.status === 'running') {
      return true;
    }

    // Anything that is neither "running" nor "paused" is terminal for this wait.
    // Checking only for canceled/failed would keep polling forever on any other
    // status (for example "completed").
    if (state.status !== 'paused') {
      return false;
    }

    await sleep(1000);
  }
}
/**
@@ -257,7 +350,7 @@ function createCrawlWindow(url: string): Promise<chrome.windows.Window> {
chrome.windows.create(
{
url,
type: 'popup',
type: 'normal',
focused: true,
width: 1280,
height: 900,
@@ -302,3 +395,12 @@ function waitForTabLoaded(tabId: number): Promise<void> {
chrome.tabs.onUpdated.addListener(handleUpdated);
});
}
/**
 * Small delay helper.
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves (with no value) after the delay.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    globalThis.setTimeout(wake, ms);
  });
}

View File

@@ -1,5 +1,5 @@
import type { BackgroundCommand, BackgroundResponse, CrawlStateResponse } from '../types';
import { cancelCrawl, cancelCrawlWhenWindowRemoved, startCrawl } from './crawlTask';
import { cancelCrawl, cancelCrawlWhenWindowRemoved, resumeCrawl, startCrawl } from './crawlTask';
import { getCrawlTaskState } from './taskState';
/**
@@ -37,6 +37,8 @@ export async function handleBackgroundCommand(
return { ok: true, data: await getCrawlTaskState() };
case 'CANCEL_CRAWL':
return cancelCrawl();
case 'RESUME_CRAWL':
return resumeCrawl();
default:
return { ok: false, error: '未知的后台指令' };
}

View File

@@ -23,8 +23,14 @@ export interface CancelCrawlCommand {
action: 'CANCEL_CRAWL';
}
// Resumes the crawl task that is currently paused.
export interface ResumeCrawlCommand {
// Message action type: the user has handled login/captcha, so the background may retry the current step.
action: 'RESUME_CRAWL';
}
// popup/content script 能发送给 background 的全部消息类型。
export type BackgroundCommand = StartCrawlCommand | GetCrawlStateCommand | CancelCrawlCommand;
export type BackgroundCommand = StartCrawlCommand | GetCrawlStateCommand | CancelCrawlCommand | ResumeCrawlCommand;
// background 统一响应结构。
export interface BackgroundResponse<T = unknown> {

View File

@@ -10,7 +10,7 @@ export const PLATFORM_CONFIGS: PlatformConfig[] = [
name: '数据看板',
uniqueKey: 'databoard',
url: 'https://seller.shopee.com.my/',
checkSelector: '.rate-manager-content',
checkSelector: '.page-container',
fields: [
{
label: "出货统计",
@@ -119,6 +119,191 @@ export const PLATFORM_CONFIGS: PlatformConfig[] = [
},
],
},
{
name: "广告中心",
uniqueKey: "adscenter",
url: "https://seller.shopee.com.my/portal/marketing/pas/index",
checkSelector: '.page-container',
fields: [
{
label: "我的账户",
className: ".my-account-wrap",
keys: [
{
label: "广告余额",
className: ".credit-expense-label-wrapper:nth-child(1) .ellipsis-content"
},
{
label: "今日广告花费",
className: ".credit-expense-label-wrapper:nth-child(2) .ellipsis-content"
},
]
},
{
label: "进行中广告列表",
className: ".eds-table__body-container",
type: 2,
tableParts: [
{ name: "fixed", select: ".eds-table__fix-body" },
{ name: "main", select: ".eds-table__main-body" }
],
keys: [
{
label: "广告信息",
className: ".info-containter",
part: "fixed",
keys: [
{
label: "广告名称",
className: ".campaign-name-container"
},
{
label: "广告类型",
className: ".gmv-max-noti"
},
{
label: "结束时间",
className: ".time-edit-wrapper"
}
]
},
{
label: "每日预算",
part: "main",
className: "td:nth-child(1)"
},
{
label: "目标ROAS",
part: "main",
className: "td:nth-child(2)"
},
{
label: "花费",
part: "main",
className: "td:nth-child(4)"
},
{
label: "销售额",
part: "main",
className: "td:nth-child(5)"
},
{
label: "广告支出回报率",
part: "main",
className: "td:nth-child(6)"
}
],
pagination: {
nextBtn: ".eds-pager__button-next", // 下一页按钮
disabledClass: ".eds-button--disabled", // 按钮禁用时的class用来判断结束
maxPage: 1, // 最大爬取页数
delay: 2000 // 翻页后的等待加载时间
},
}
]
},
{
name: "评论管理",
uniqueKey: "message",
url: "https://seller.shopee.com.my/portal/settings/shop/rating",
checkSelector: '.page-container',
fields: [
{
label: "低星评论",
className: ".border-solid.rounded",
condition: {
list: [
".flex.items-center.mt-6 div:nth-child(3)",
".eds-react-checkbox-group label:nth-child(2)",
".eds-react-checkbox-group label:nth-child(3)",
".eds-react-checkbox-group label:nth-child(4)"
],
time: 200,
},
type: 1,
keys: [
{
label: "用户",
className: ".flex.items-center.justify-start .ml-2"
},
{
label: "订单编号",
className: ".underline.px-1"
},
{
label: "商品名称",
className: ".min-w-0.font-medium.break-all"
},
{
label: "规格",
className: ".min-w-0.font-medium.break-all + div"
},
{
label: "评价内容",
className: ".min-w-0.overflow-hidden",
condition: {
list: [
"span.cursor-pointer"
],
time: 200,
},
},
],
pagination: {
nextBtn: ".eds-react-pagination-pager__button-next",
maxPage: 2, // 最大爬取页数
delay: 2000 // 翻页后的等待加载时间
},
},
]
},
{
name: "账户健康状态",
uniqueKey: "accounthealth",
url: "https://seller.shopee.com.my/portal/accounthealth/home",
checkSelector: '.page-container',
fields: [
{
label: "健康状态",
className: ".metric-content",
type: 1,
keys: [
{
label: "模块名",
className: ".metric-type"
},
{
label: "值",
className: ".metric-item",
type: 1,
keys: [
{
label: "指标",
className: "p.metric-text"
},
{
label: "值",
className: ".metric-my"
},
{
label: "目标",
className: ".metric-target"
},
{
label: "使用类型",
className: ".metric-applied-to"
},
]
},
],
},
]
}
],
},
]

View File

@@ -12,7 +12,7 @@ const isPanelOpen = ref(false);
let timer: number | undefined;
// 只有任务处于运行中时,才在网页右下角展示计时按钮。
const isVisible = computed(() => crawlState.value?.status === 'running');
const isVisible = computed(() => crawlState.value ? ['running', 'paused'].includes(crawlState.value.status) : false);
// 内容脚本挂载后立即同步一次状态,并开始每秒刷新计时和任务进度。
onMounted(() => {
@@ -85,6 +85,14 @@ function getStepText(status: string): string {
return textMap[status] ?? status;
}
/**
 * Asks the background to resume the paused crawl task, then refreshes the
 * crawl state shown by this component.
 */
async function handleResumeCrawl() {
await sendBackgroundMessage({ action: 'RESUME_CRAWL' });
await refreshCrawlState();
}
/**
* 发送消息到 background非扩展环境下返回空成功响应方便本地页面不报错。
*/
@@ -121,6 +129,11 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
</div>
</li>
</ol>
<div v-if="crawlState.status === 'paused' && crawlState.pause" class="dianshan-crawl-pause">
<p>{{ crawlState.pause.message }}</p>
<button type="button" @click="handleResumeCrawl">我已处理继续</button>
</div>
</section>
</div>
</template>
@@ -223,6 +236,35 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
color: #b91c1c;
}
.dianshan-crawl-pause {
display: grid;
gap: 8px;
margin-top: 12px;
padding: 10px;
border: 1px solid #f59e0b;
border-radius: 8px;
background: #fffbeb;
}
.dianshan-crawl-pause p {
margin: 0;
color: #92400e;
font-size: 12px;
line-height: 1.5;
}
.dianshan-crawl-pause button {
width: 100%;
border: 0;
border-radius: 6px;
padding: 8px 10px;
color: #ffffff;
background: #059669;
cursor: pointer;
font-size: 12px;
font-weight: 700;
}
.is-running .dianshan-crawl-dot,
.is-success .dianshan-crawl-dot {
background: #10b981;

View File

@@ -1,5 +1,6 @@
import { createApp } from 'vue';
import { createApp } from 'vue';
import App from './App.vue';
import { setupPageRunner } from './pageRunner';
/**
* 将内容脚本应用挂载到页面中。
@@ -9,17 +10,15 @@ function mountApp() {
return;
}
// 内容脚本在宿主页面中的根容器。
// 用于避免污染业务页面结构。
const container = document.createElement('div');
container.id = 'dianshan-crx-root';
// Vue 应用实际挂载的节点。
const appRoot = document.createElement('div');
container.appendChild(appRoot);
document.body.appendChild(container);
createApp(App).mount(appRoot);
setupPageRunner();
}
if (document.readyState === 'loading') {

207
src/content/pageRunner.ts Normal file
View File

@@ -0,0 +1,207 @@
import { processFields, type DomScrapeResult } from '@/background/domScraper';
import type { CrawlPauseInfo, PlatformFieldConfig } from '@/types';
// Message asking the page runner to scrape the current page.
interface ScrapeStepMessage {
action: 'SCRAPE_STEP';
payload: {
// Field configuration handed to processFields.
fields: PlatformFieldConfig[];
// Selector that must be visibly present before scraping starts.
checkSelector: string;
};
}
// Message asking the page runner only to report whether the page needs manual handling.
interface CheckInterruptMessage {
action: 'CHECK_INTERRUPT';
}
// Every message the page runner knows how to handle.
type PageRunnerMessage = ScrapeStepMessage | CheckInterruptMessage;
// Response sent back to the sender of a page runner message.
interface PageRunnerResponse {
// Whether the request succeeded.
ok: boolean;
// Scraped data; present on a successful SCRAPE_STEP.
data?: DomScrapeResult | null;
// Present when the page needs manual handling (login, captcha, missing page, or key content not ready).
interrupt?: CrawlPauseInfo;
// Failure description for requests that could not be handled.
error?: string;
}
/**
 * Registers the page runner so the background can trigger interrupt detection
 * and DOM scraping inside the target page.
 *
 * The listener returns `true` to keep the message channel open for the
 * asynchronous reply. If handling the message rejects (for example because
 * scraping throws), an `{ ok: false, error }` response is sent instead of
 * leaving the sender without an answer.
 */
export function setupPageRunner(): void {
  chrome.runtime.onMessage.addListener((message: PageRunnerMessage, _sender, sendResponse) => {
    void handlePageRunnerMessage(message).then(sendResponse, (error: unknown) => {
      // Always answer: an unanswered channel only surfaces on the sender side
      // as an opaque "port closed" error with no usable reason.
      const failure: PageRunnerResponse = {
        ok: false,
        error: error instanceof Error ? error.message : String(error),
      };
      sendResponse(failure);
    });

    return true;
  });
}
/**
 * Handles one page-execution message sent by the background.
 *
 * - `CHECK_INTERRUPT`: reports the current interrupt, if any.
 * - `SCRAPE_STEP`: reports an interrupt if one is detected; otherwise waits up
 *   to 18 seconds for the step's check selector, then scrapes the configured
 *   fields from `document.body`.
 *
 * @param message Message received from the background.
 * @returns The response to send back to the sender.
 */
async function handlePageRunnerMessage(message: PageRunnerMessage): Promise<PageRunnerResponse> {
  switch (message.action) {
    case 'CHECK_INTERRUPT':
      return { ok: true, interrupt: detectPageInterrupt() };

    case 'SCRAPE_STEP': {
      const blocker = detectPageInterrupt();
      if (blocker) {
        return { ok: false, interrupt: blocker };
      }

      const { fields, checkSelector } = message.payload;
      const anchor = await waitForStableSelector(checkSelector, 18000);
      if (anchor === null) {
        // Key content never showed up: pause so the user can inspect the page.
        const notReady: CrawlPauseInfo = {
          reason: 'page_not_ready',
          message: '页面关键内容暂未加载,请确认页面是否正常显示后继续',
        };
        return { ok: false, interrupt: notReady };
      }

      const scraped = await processFields(fields, document.body);
      return { ok: true, data: scraped };
    }

    default:
      return { ok: false, error: '未知页面执行指令' };
  }
}
/**
 * Detects whether the current page needs the user to step in: a captcha or
 * risk-control check, a login prompt, or a missing page.
 *
 * Checks run in that order and stop at the first match.
 *
 * @returns The pause info for the first matching condition, or `undefined`
 *   when none applies.
 */
function detectPageInterrupt(): CrawlPauseInfo | undefined {
  const detectors: Array<{ matches: () => boolean; info: CrawlPauseInfo }> = [
    {
      matches: isShieldPage,
      info: {
        reason: 'shield',
        message: '检测到验证码或风控验证,请在打开的商家后台窗口处理完成后继续',
      },
    },
    {
      matches: isLoginPage,
      info: {
        reason: 'reauth',
        message: '检测到需要重新登录,请在打开的商家后台窗口登录完成后继续',
      },
    },
    {
      matches: isNotFoundPage,
      info: {
        reason: 'not_found',
        message: '当前页面不存在或已失效,请确认平台配置里的页面地址是否正确',
      },
    },
  ];

  // `find` evaluates lazily, so later detectors are skipped after a match.
  return detectors.find((detector) => detector.matches())?.info;
}
/**
 * Tells whether the page is a captcha, traffic-shield or risk-control
 * verification page.
 *
 * The URL path is checked first; otherwise the first element matching the
 * known captcha selectors must be visible.
 */
function isShieldPage(): boolean {
  const path = location.pathname.toLowerCase();
  const shieldPrefixes = ['/verify/captcha', '/verify/traffic'];
  if (shieldPrefixes.some((prefix) => path.startsWith(prefix))) {
    return true;
  }

  const candidate = document.querySelector(
    '[data-name="verification"], .ant-captcha, #captchaContainer, [class*="captcha" i], [id*="captcha" i]',
  );
  return candidate !== null && isVisibleElement(candidate);
}
/**
 * Tells whether the page asks the user to log in or to re-enter a password.
 *
 * Three signals are checked in order: a login-like URL path, a visible
 * password input, and login wording within the first 3000 characters of the
 * page text.
 */
function isLoginPage(): boolean {
  const path = location.pathname.toLowerCase();
  const loginPathPatterns = [
    /^\/(?:buyer\/)?login\b/i,
    /^\/account\/(?:signin|login)\b/i,
    /^\/portal\/login\b/i,
  ];
  if (loginPathPatterns.some((pattern) => pattern.test(path))) {
    return true;
  }

  const passwordInputs = Array.from(document.querySelectorAll('input[type="password"]'));
  if (passwordInputs.some((input) => isVisibleElement(input))) {
    return true;
  }

  const leadingText = document.body.innerText.slice(0, 3000);
  const loginTextPatterns = [
    /enter\s+(your\s+)?password\s+to\s+continue/i,
    /sign\s+in\s+(again\s+)?to\s+continue/i,
    /please\s+(re-?)?enter\s+(your\s+)?password/i,
    /请(再次|重新)?输入(您的)?密码/,
    /请登录|重新登录|登录后继续/,
  ];
  for (const pattern of loginTextPatterns) {
    if (pattern.test(leadingText)) {
      return true;
    }
  }
  return false;
}
/**
 * Tells whether the page is a "not found", removed or unavailable page.
 *
 * The document title and the first 8000 characters of the page text are
 * matched against known English and Chinese wordings.
 */
function isNotFoundPage(): boolean {
  const leadingText = document.body.innerText.slice(0, 8000);
  const sources = [document.title, leadingText];
  const notFoundPatterns = [
    /page\s+not\s+found/i,
    /the\s+page\s+you\s+are\s+looking\s+for/i,
    /this\s+page\s+(has\s+been\s+)?removed/i,
    /product\s+(is\s+)?unavailable/i,
    /页面不存在|找不到(此|该)?页面|抱歉.*不存在|(商品|产品)已下架/,
  ];

  return notFoundPatterns.some((pattern) => sources.some((source) => pattern.test(source)));
}
/**
 * Waits until a visible element matching `selector` is present and still
 * visible 600 ms later.
 *
 * If the element disappears during the stability check (for example because
 * the page re-rendered), polling continues until the deadline instead of
 * giving up on the first flicker.
 *
 * @param selector CSS selector of the key element.
 * @param timeoutMs Maximum time to keep polling, in milliseconds.
 * @returns The stable element, or `null` when none was found before the deadline.
 */
async function waitForStableSelector(selector: string, timeoutMs: number): Promise<Element | null> {
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    const element = document.querySelector(selector);
    if (element && isVisibleElement(element)) {
      // Re-check after a short delay so a node that is about to be replaced is not accepted.
      await sleep(600);
      const stableElement = document.querySelector(selector);
      if (stableElement && isVisibleElement(stableElement)) {
        return stableElement;
      }
      // Not stable yet: the 600 ms wait above already throttles this iteration.
      continue;
    }

    await sleep(500);
  }

  return null;
}
/**
 * Tells whether an element is actually visible.
 *
 * An element counts as visible when it is connected to a document, its
 * computed style is available and is not `display: none`,
 * `visibility: hidden` or opacity below 0.05, and its bounding box has a
 * positive width and height.
 *
 * @param element Element to inspect.
 */
function isVisibleElement(element: Element): boolean {
  if (!element.isConnected) {
    return false;
  }

  const computed = element.ownerDocument.defaultView?.getComputedStyle(element);
  if (!computed) {
    return false;
  }

  const hiddenByCss =
    computed.display === 'none' || computed.visibility === 'hidden' || Number(computed.opacity) < 0.05;
  if (hiddenByCss) {
    return false;
  }

  const { width, height } = element.getBoundingClientRect();
  return width > 0 && height > 0;
}
/**
 * Small delay helper.
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves (with no value) after the delay.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    window.setTimeout(wake, ms);
  });
}

View File

@@ -18,7 +18,9 @@ const selectedPlatform = computed(() =>
);
const isLoggedIn = computed(() => token.value !== null);
const isCrawling = computed(() => crawlState.value?.status === 'running');
const shouldShowCrawlProgress = computed(() =>
crawlState.value ? ['running', 'paused', 'completed', 'failed'].includes(crawlState.value.status) : false,
);
onMounted(async () => {
token.value = await getToken();
@@ -82,6 +84,11 @@ async function handleCancelCrawl() {
crawlState.value = response.data ?? null;
}
/**
 * Asks the background to resume the paused crawl task and stores the state it
 * returns; when the response carries no data the local crawl state becomes null.
 */
async function handleResumeCrawl() {
const response = await sendBackgroundMessage<CrawlTaskState>({ action: 'RESUME_CRAWL' });
crawlState.value = response.data ?? null;
}
async function refreshCrawlState() {
const response = await sendBackgroundMessage<CrawlTaskState | null>({ action: 'GET_CRAWL_STATE' });
@@ -162,18 +169,32 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
</button>
</template>
<template v-else-if="isCrawling && crawlState">
<template v-else-if="shouldShowCrawlProgress && crawlState">
<section class="space-y-4">
<div class="flex items-center justify-between rounded-md bg-white px-3 py-2 shadow-sm">
<div>
<p class="text-sm font-medium text-slate-800">{{ crawlState.platformName }}</p>
<p class="text-xs text-slate-500">已运行 {{ formatElapsed(elapsedSeconds) }}</p>
<p class="text-xs text-slate-500">
{{ crawlState.status === 'paused' ? '已暂停' : '已运行 ' + formatElapsed(elapsedSeconds) }}
</p>
</div>
<div class="flex items-center gap-2">
<button v-if="crawlState.status === 'paused'" type="button"
class="text-xs text-emerald-600 transition hover:text-emerald-700"
@click="handleResumeCrawl">
继续
</button>
<button type="button" class="text-xs text-red-600 transition hover:text-red-700"
@click="handleCancelCrawl">
取消
</button>
</div>
</div>
<div v-if="crawlState.status === 'paused' && crawlState.pause"
class="rounded-md border border-amber-200 bg-amber-50 px-3 py-2 text-sm text-amber-800">
{{ crawlState.pause.message }}
</div>
<ol class="space-y-3">
<li v-for="(step, index) in crawlState.steps" :key="step.uniqueKey"
@@ -187,6 +208,8 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
<span class="text-xs">{{ getStepText(step.status) }}</span>
</div>
<p v-if="step.message" class="mt-1 text-xs">{{ step.message }}</p>
<pre v-if="step.result !== undefined"
class="mt-2 max-h-32 overflow-auto rounded bg-slate-950 p-2 text-[11px] leading-4 text-slate-100">{{ JSON.stringify(step.result, null, 2) }}</pre>
</div>
</li>
</ol>

View File

@@ -2,7 +2,7 @@
export type CrawlStepStatus = 'pending' | 'running' | 'success' | 'failed';
// 整体爬取任务状态。
export type CrawlTaskStatus = 'running' | 'completed' | 'failed' | 'canceled';
export type CrawlTaskStatus = 'running' | 'paused' | 'completed' | 'failed' | 'canceled';
// 时间轴中的单个爬取步骤进度。
export interface CrawlProgressStep {
@@ -14,6 +14,16 @@ export interface CrawlProgressStep {
status: CrawlStepStatus;
// 状态补充说明,如失败原因。
message?: string;
// 当前步骤抓取到的数据结果。
result?: unknown;
}
// Why a crawl is paused; usually triggered by login, captcha, or a page that does not exist.
export interface CrawlPauseInfo {
// Pause reason code.
reason: 'reauth' | 'shield' | 'not_found' | 'page_not_ready';
// Hint shown to the user describing how to handle the pause.
message: string;
}
// 当前正在执行的爬取任务快照,供 popup 和 content script 同步展示。
@@ -30,6 +40,8 @@ export interface CrawlTaskState {
startedAt: number;
// 当前任务状态。
status: CrawlTaskStatus;
// 暂停信息;仅 status 为 paused 时存在。
pause?: CrawlPauseInfo;
// 当前执行到的步骤下标。
currentStepIndex: number;
// 平台 steps 映射出的时间轴进度。

View File

@@ -1,4 +1,5 @@
export type {
CrawlPauseInfo,
CrawlProgressStep,
CrawlStepStatus,
CrawlTaskState,

View File

@@ -23,18 +23,8 @@ export interface PlatformPaginationConfig {
// 表格分段配置,用于兼容一个数据块由多个 table 或多个 table 片段组成的情况。
export interface PlatformTablePartConfig {
// 当前 table 或表格片段的名称。
label: string;
// 当前 table 或表格片段的兼容名称,兼容 message.js 中的 name 写法。
name?: string;
// 当前 table 或表格片段的 CSS 选择器。
className: string;
// 当前 table 或表格片段的兼容选择器,兼容 message.js 中的 select 写法。
select?: string;
// 行元素选择器,不填时由采集逻辑使用默认行选择器。
rowSelector?: string;
// 当前 table 或表格片段下需要采集的字段。
keys?: PlatformFieldConfig[];
}
// 页面字段配置,描述一个普通元素、列表元素或表格元素如何从 DOM 中提取数据。

52
step.md
View File

@@ -1,52 +0,0 @@
# 项目结构
```angular2html
src:.
├─assets # 静态资源目录
│ vite.svg # 这里的资源通常用于图标、Logo 或扩展程序内部引用的图片
├─background # 后台脚本 (Background Script / Service Worker)
│ index.ts # 扩展的“大脑”,常驻后台运行,处理事件监听、报文转发、存储管理等
├─config # 配置目录
│ platforms.ts # 自定义配置,各种平台(如不同网站、不同浏览器)的适配配置
├─content # 内容脚本 (Content Script)
│ │ App.vue # 注入到网页中的 UI 组件(通常用于在目标页面侧边栏或浮窗显示界面)
│ │ main.ts # 内容脚本的入口文件,负责将 Vue 组件挂载到宿主页面的 DOM 中
│ │
│ └─views # 内容脚本相关的子视图或组件
├─options # 选项页 (Options Page)
│ App.vue # 扩展设置页面的 UI右键扩展图标点击“选项”打开的页面
│ index.html # 选项页的 HTML 宿主文件
│ main.ts # 选项页的 Vue 入口文件
├─popup # 弹窗页 (Popup Page)
│ App.vue # 点击扩展图标时显示的弹出框 UI
│ index.html # 弹窗页的 HTML 宿主文件
│ main.ts # 弹窗页的 Vue 入口文件
├─shared # 共享代码库 (Shared)
│ # 存放被 background、content、popup 等多个模块共同引用的工具函数、常量、API封装等
└─types # 类型定义目录
index.ts # 存放全局的 TypeScript 接口Interface和类型Type定义
```
# 开发步骤
1.在popup模块中的App.vue中用tailwindcss编写点击扩展图标时出现的弹窗逻辑如下
- 在未登录情况下即storage中token字段是否存在如果不存在弹窗内容只用显示扩展名字、描述、请登录按钮底部扩展版本
- 当点击登录按钮后先模拟登录写死token之后ui如下
- 显示扩展名字、描述、一个平台选择框通过读取config/platforms.ts)的内容for循环显示平台、扫描按钮、最底部Row退出按钮扩展版本号
- 注意token的存储和获取逻辑放到/shared/auth.ts中去如果涉及到接口和枚举的定义请判断是否是全局类型
- 如果是该类型写到一个新文件中并放到types/下如果不是放到当前模块的types/目录下(如果没用,新建)
2.前提当1完成后点击popup的立即爬取已经可以打开一个新的窗口了
- 在所有网页(包括新打开的窗口和所有网页)的右下角都放一个圆形正计时(表示正在爬取中)
- 点击圆形正计时时出现一个popup内容如下
- 以时间轴的形式表示当前爬取进度根据platforms.ts中的steps
- 同时点击扩展的popup里的内容也变得和上面的时间轴内容一致显示爬取进度隐藏立即爬取等按钮
3.前提1和2都已完成ui和交互操作上ok
- 开始爬取网页中的数据查看message.js内容吧里面的爬取方法都提取出来放到background/domScraper.ts中去
- 基于2每次根据steps打开一个新网页后根据它的fields数组字段调用domScraper中的方法来提取数据并打印到控制台即可

View File

@@ -1 +1 @@
{"root":["./manifest.config.ts","./message.js","./vite.config.ts","./src/background/domscraper.ts","./src/background/index.ts","./src/background/service.ts","./src/background/types.ts","./src/background/service/crawltask.ts","./src/background/service/lifecycle.ts","./src/background/service/taskstate.ts","./src/config/platforms.ts","./src/content/app.vue","./src/content/main.ts","./src/options/app.vue","./src/options/main.ts","./src/popup/app.vue","./src/popup/main.ts","./src/shared/auth.ts","./src/types/crawl.ts","./src/types/index.ts","./src/types/platform.ts"],"version":"5.9.3"}
{"root":["./manifest.config.ts","./message.js","./vite.config.ts","./src/background/domscraper.ts","./src/background/index.ts","./src/background/service.ts","./src/background/types.ts","./src/background/service/crawltask.ts","./src/background/service/lifecycle.ts","./src/background/service/taskstate.ts","./src/config/platforms.ts","./src/content/app.vue","./src/content/main.ts","./src/content/pagerunner.ts","./src/options/app.vue","./src/options/main.ts","./src/popup/app.vue","./src/popup/main.ts","./src/shared/auth.ts","./src/types/crawl.ts","./src/types/index.ts","./src/types/platform.ts","./storeai-extension-v0.1.0/service-worker-loader.js","./storeai-extension-v0.1.0/assets/config-cf-xklo9.js","./storeai-extension-v0.1.0/assets/fetch-hook.ts-bvrghr__.js","./storeai-extension-v0.1.0/assets/index-dxg1qimp.js","./storeai-extension-v0.1.0/assets/index.ts-dirvxn_b.js","./storeai-extension-v0.1.0/assets/orchestrator.ts-bleul1fk.js","./storeai-extension-v0.1.0/assets/orchestrator.ts-loader-drev6v6h.js","./storeai-extension-v0.1.0/assets/popup-dbgvbs2c.js","./storeai-extension-v0.1.0/assets/selectors-xrdds_u0.js"],"version":"5.9.3"}

12
ww.md Normal file
View File

@@ -0,0 +1,12 @@
# 原型
请先阅读 storeai-extension-v0.1.0 目录中的代码,总结并学习其中“开始爬取”的执行流程和代码实现,然后再重构我这个项目
# 注意要求
- 数据爬取逻辑使用我写的方法:直接调用 domScraper 中的 processFields 方法爬取数据
- 进度ui可以以原型中的为准
- 撞盾和登录校验,也以原型里的代码逻辑为准