1
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -9,6 +9,7 @@ lerna-debug.log*
|
||||
|
||||
node_modules
|
||||
dist
|
||||
storeai-extension-v0.1.0
|
||||
dist-ssr
|
||||
*.local
|
||||
|
||||
|
||||
@@ -19,5 +19,3 @@
|
||||
7.在窗口中记得显示一个取消按钮,点击后关闭窗口,取消爬取
|
||||
|
||||
|
||||
# 具体代码实现流程
|
||||
请阅读./step.md文档,并严格按照步骤进行执行
|
||||
@@ -69,7 +69,7 @@ async function autoClick(config: PlatformFieldConfig, rootDom: ParentNode): Prom
|
||||
/**
|
||||
* 递归处理字段配置,支持普通字段、嵌套 row、列表和表格。
|
||||
*/
|
||||
async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
|
||||
export async function processFields(columns: PlatformFieldConfig[], rootDom: ParentNode): Promise<DomScrapeResult> {
|
||||
const result: DomScrapeResult = {};
|
||||
|
||||
for (const item of columns) {
|
||||
|
||||
@@ -1,9 +1,16 @@
|
||||
import { getPlatformById } from '@/config/platforms';
|
||||
import type { CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
|
||||
import { scrapeDomFields, type DomScrapeResult } from '../domScraper';
|
||||
import type { CrawlPauseInfo, CrawlProgressStep, CrawlTaskState, PlatformConfig, PlatformStepConfig } from '@/types';
|
||||
import type { DomScrapeResult } from '../domScraper';
|
||||
import type { CrawlStateResponse } from '../types';
|
||||
import { getCrawlTaskState, setCrawlTaskState, updateCrawlTaskState } from './taskState';
|
||||
|
||||
/**
 * Reply shape expected from the content-script page runner for one request.
 */
interface PageRunnerResponse {
  // Whether the request was handled successfully.
  ok: boolean;
  // Scraped field data; present on a successful scrape.
  data?: DomScrapeResult | null;
  // Pause information when the page needs manual handling before continuing.
  interrupt?: CrawlPauseInfo;
  // Failure description when the request could not be handled.
  error?: string;
}
|
||||
|
||||
/**
|
||||
* 创建新的爬取任务,打开目标平台窗口,并把初始时间轴状态写入 storage。
|
||||
*/
|
||||
@@ -86,6 +93,29 @@ export async function cancelCrawl(): Promise<CrawlStateResponse> {
|
||||
return { ok: true, data: canceledState };
|
||||
}
|
||||
|
||||
/**
 * Resume the crawl task that is currently paused, once the user has dealt
 * with the login prompt, captcha or risk-control check.
 *
 * If no task is stored, or the stored task is not paused, nothing is written
 * and the stored value is returned as-is in a successful response. Otherwise
 * the task is switched back to `running`, its pause info is cleared, the step
 * at `currentStepIndex` is marked `running` with its message cleared, and the
 * new state is persisted and returned.
 */
export async function resumeCrawl(): Promise<CrawlStateResponse> {
  const current = await getCrawlTaskState();

  // Nothing to resume: report the stored state untouched.
  if (current?.status !== 'paused') {
    return { ok: true, data: current };
  }

  const activeIndex = current.currentStepIndex;
  const resumedState: CrawlTaskState = {
    ...current,
    status: 'running',
    pause: undefined,
    steps: current.steps.map((step, index) =>
      index !== activeIndex ? step : { ...step, status: 'running', message: undefined },
    ),
  };

  await setCrawlTaskState(resumedState);

  return { ok: true, data: resumedState };
}
|
||||
|
||||
/**
|
||||
* 窗口关闭后,如果关闭的是爬取窗口,就把当前任务标记为取消。
|
||||
*/
|
||||
@@ -114,20 +144,30 @@ async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskSt
|
||||
}
|
||||
|
||||
try {
|
||||
const tabId = await getWindowActiveTabId(initialState.windowId);
|
||||
|
||||
for (let stepIndex = 0; stepIndex < platform.steps.length; stepIndex += 1) {
|
||||
const step = platform.steps[stepIndex];
|
||||
let shouldRetryStep = true;
|
||||
|
||||
while (shouldRetryStep) {
|
||||
const currentState = await getCrawlTaskState();
|
||||
|
||||
if (currentState?.id !== initialState.id || currentState.status !== 'running') {
|
||||
if (currentState?.id !== initialState.id || currentState.status === 'canceled') {
|
||||
return;
|
||||
}
|
||||
|
||||
if (currentState.status === 'paused') {
|
||||
const resumed = await waitUntilResumed(initialState.id);
|
||||
|
||||
if (!resumed) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
await updateCrawlTaskState(initialState.id, (state) => ({
|
||||
...state,
|
||||
currentStepIndex: stepIndex,
|
||||
status: 'running',
|
||||
pause: undefined,
|
||||
steps: state.steps.map((item, index) => ({
|
||||
...item,
|
||||
status: index === stepIndex ? 'running' : item.status,
|
||||
@@ -135,32 +175,50 @@ async function runCrawlSteps(platform: PlatformConfig, initialState: CrawlTaskSt
|
||||
})),
|
||||
}));
|
||||
|
||||
const tabId = await getWindowActiveTabId(initialState.windowId);
|
||||
await chrome.tabs.update(tabId, { url: step.url, active: true });
|
||||
await waitForTabLoaded(tabId);
|
||||
|
||||
const isReady = await waitForStepReady(tabId, step);
|
||||
const response = await scrapeStepInContent(tabId, step);
|
||||
|
||||
if (response.interrupt) {
|
||||
await pauseForInterrupt(initialState.id, stepIndex, response.interrupt);
|
||||
const resumed = await waitUntilResumed(initialState.id);
|
||||
|
||||
if (!resumed) {
|
||||
return;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
const message = response.error ?? '页面抓取失败';
|
||||
|
||||
if (!isReady) {
|
||||
await updateCrawlTaskState(initialState.id, (state) => ({
|
||||
...state,
|
||||
status: 'failed',
|
||||
currentStepIndex: stepIndex,
|
||||
steps: state.steps.map((item, index) =>
|
||||
index === stepIndex ? { ...item, status: 'failed', message: '页面关键 DOM 未加载完成' } : item,
|
||||
index === stepIndex ? { ...item, status: 'failed', message } : item,
|
||||
),
|
||||
}));
|
||||
return;
|
||||
}
|
||||
|
||||
const data = await scrapeStepFields(tabId, step);
|
||||
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, data);
|
||||
console.log(`[crawl] ${platform.name} - ${step.name} 提取成功`, response.data);
|
||||
|
||||
await updateCrawlTaskState(initialState.id, (state) => ({
|
||||
...state,
|
||||
steps: state.steps.map((item, index) =>
|
||||
index === stepIndex ? { ...item, status: 'success', message: undefined } : item,
|
||||
index === stepIndex
|
||||
? { ...item, status: 'success', message: undefined, result: response.data }
|
||||
: item,
|
||||
),
|
||||
}));
|
||||
|
||||
shouldRetryStep = false;
|
||||
}
|
||||
}
|
||||
|
||||
await updateCrawlTaskState(initialState.id, (state) => ({
|
||||
@@ -198,55 +256,90 @@ async function getWindowActiveTabId(windowId: number): Promise<number> {
|
||||
}
|
||||
|
||||
/**
|
||||
* 等待步骤配置中的 checkSelector 出现;第一次超时后刷新页面再重试一次。
|
||||
* 让 content script 直接在目标页面执行检查和抓取。
|
||||
*/
|
||||
async function waitForStepReady(tabId: number, step: PlatformStepConfig): Promise<boolean> {
|
||||
if (await waitForSelector(tabId, step.checkSelector, 5000)) {
|
||||
return true;
|
||||
async function scrapeStepInContent(tabId: number, step: PlatformStepConfig): Promise<PageRunnerResponse> {
|
||||
const startedAt = Date.now();
|
||||
|
||||
while (Date.now() - startedAt < 20000) {
|
||||
const response = await sendPageRunnerMessage(tabId, {
|
||||
action: 'SCRAPE_STEP',
|
||||
payload: {
|
||||
fields: step.fields,
|
||||
checkSelector: step.checkSelector,
|
||||
},
|
||||
});
|
||||
|
||||
if (response.ok || response.interrupt || !isPageRunnerNotReadyError(response.error)) {
|
||||
return response;
|
||||
}
|
||||
|
||||
await chrome.tabs.reload(tabId);
|
||||
await waitForTabLoaded(tabId);
|
||||
await sleep(500);
|
||||
}
|
||||
|
||||
return waitForSelector(tabId, step.checkSelector, 5000);
|
||||
return { ok: false, error: '页面脚本未响应,请刷新扩展后重试' };
|
||||
}
|
||||
|
||||
/**
|
||||
* 在目标页面轮询检查指定 selector 是否存在。
|
||||
* 给目标页的 content script 发送页面执行消息。
|
||||
*/
|
||||
async function waitForSelector(tabId: number, selector: string, timeoutMs: number): Promise<boolean> {
|
||||
const startedAt = Date.now();
|
||||
async function sendPageRunnerMessage(tabId: number, message: unknown): Promise<PageRunnerResponse> {
|
||||
try {
|
||||
const response = await chrome.tabs.sendMessage(tabId, message);
|
||||
|
||||
while (Date.now() - startedAt < timeoutMs) {
|
||||
const results = await chrome.scripting.executeScript({
|
||||
target: { tabId },
|
||||
func: (targetSelector: string) => Boolean(document.querySelector(targetSelector)),
|
||||
args: [selector],
|
||||
});
|
||||
|
||||
if (Boolean(results[0]?.result)) {
|
||||
return true;
|
||||
if (response && typeof response === 'object') {
|
||||
return response as PageRunnerResponse;
|
||||
}
|
||||
|
||||
await new Promise((resolve) => {
|
||||
globalThis.setTimeout(resolve, 500);
|
||||
});
|
||||
return { ok: false, error: '页面脚本返回为空' };
|
||||
} catch (error: unknown) {
|
||||
return { ok: false, error: error instanceof Error ? error.message : String(error) };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断错误是否只是 content script 尚未注入完成。
|
||||
*/
|
||||
function isPageRunnerNotReadyError(error?: string): boolean {
|
||||
if (!error) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* 注入 domScraper 到目标页面,并根据当前 step.fields 提取页面数据。
|
||||
*/
|
||||
async function scrapeStepFields(tabId: number, step: PlatformStepConfig): Promise<DomScrapeResult | null> {
|
||||
const results = await chrome.scripting.executeScript({
|
||||
target: { tabId },
|
||||
func: scrapeDomFields,
|
||||
args: [step.fields],
|
||||
});
|
||||
return /receiving end does not exist|could not establish connection|no receiving end/i.test(error);
|
||||
}
|
||||
|
||||
return results[0]?.result ?? null;
|
||||
/**
 * Pause the given task because the page needs manual handling (login,
 * captcha, or an abnormal page).
 *
 * The task is only moved to `paused` while it is still `running`. If it was
 * canceled or failed while the page scrape was in flight, the stored state is
 * left untouched so that a finished task is never turned back into a paused one.
 *
 * @param taskId    Id of the task to pause.
 * @param stepIndex Index of the step that triggered the interrupt.
 * @param interrupt Reason and user-facing message for the pause.
 */
async function pauseForInterrupt(taskId: string, stepIndex: number, interrupt: CrawlPauseInfo): Promise<void> {
  await updateCrawlTaskState(taskId, (state) => {
    // A canceled/failed/completed task must stay in its terminal status.
    if (state.status !== 'running') {
      return state;
    }

    return {
      ...state,
      status: 'paused',
      pause: interrupt,
      currentStepIndex: stepIndex,
      // The step itself stays "running"; only its message explains the pause.
      steps: state.steps.map((step, index) =>
        index === stepIndex ? { ...step, status: 'running', message: interrupt.message } : step,
      ),
    };
  });
}
|
||||
|
||||
/**
 * After a pause, poll the stored task once per second until the user either
 * resumes or cancels it.
 *
 * @param taskId Id of the task that was paused.
 * @returns `true` once the task is `running` again; `false` when the stored
 *          task is missing, belongs to a different id, or is `canceled`/`failed`.
 */
async function waitUntilResumed(taskId: string): Promise<boolean> {
  for (;;) {
    const snapshot = await getCrawlTaskState();

    // Missing state or a different task: this run is no longer relevant.
    if (snapshot?.id !== taskId) {
      return false;
    }

    switch (snapshot.status) {
      case 'canceled':
      case 'failed':
        return false;
      case 'running':
        return true;
      default:
        // Any other status: keep waiting.
        await sleep(1000);
    }
  }
}
|
||||
|
||||
/**
|
||||
@@ -257,7 +350,7 @@ function createCrawlWindow(url: string): Promise<chrome.windows.Window> {
|
||||
chrome.windows.create(
|
||||
{
|
||||
url,
|
||||
type: 'popup',
|
||||
type: 'normal',
|
||||
focused: true,
|
||||
width: 1280,
|
||||
height: 900,
|
||||
@@ -302,3 +395,12 @@ function waitForTabLoaded(tabId: number): Promise<void> {
|
||||
chrome.tabs.onUpdated.addListener(handleUpdated);
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Minimal delay helper: resolves after `ms` milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    globalThis.setTimeout(done, ms);
  });
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import type { BackgroundCommand, BackgroundResponse, CrawlStateResponse } from '../types';
|
||||
import { cancelCrawl, cancelCrawlWhenWindowRemoved, startCrawl } from './crawlTask';
|
||||
import { cancelCrawl, cancelCrawlWhenWindowRemoved, resumeCrawl, startCrawl } from './crawlTask';
|
||||
import { getCrawlTaskState } from './taskState';
|
||||
|
||||
/**
|
||||
@@ -37,6 +37,8 @@ export async function handleBackgroundCommand(
|
||||
return { ok: true, data: await getCrawlTaskState() };
|
||||
case 'CANCEL_CRAWL':
|
||||
return cancelCrawl();
|
||||
case 'RESUME_CRAWL':
|
||||
return resumeCrawl();
|
||||
default:
|
||||
return { ok: false, error: '未知的后台指令' };
|
||||
}
|
||||
|
||||
@@ -23,8 +23,14 @@ export interface CancelCrawlCommand {
|
||||
action: 'CANCEL_CRAWL';
|
||||
}
|
||||
|
||||
// Command that continues the crawl task which is currently paused.
export interface ResumeCrawlCommand {
  // Message action type: the user has handled the login/captcha, so background may retry the current step.
  action: 'RESUME_CRAWL';
}
|
||||
|
||||
// popup/content script 能发送给 background 的全部消息类型。
|
||||
export type BackgroundCommand = StartCrawlCommand | GetCrawlStateCommand | CancelCrawlCommand;
|
||||
export type BackgroundCommand = StartCrawlCommand | GetCrawlStateCommand | CancelCrawlCommand | ResumeCrawlCommand;
|
||||
|
||||
// background 统一响应结构。
|
||||
export interface BackgroundResponse<T = unknown> {
|
||||
|
||||
@@ -10,7 +10,7 @@ export const PLATFORM_CONFIGS: PlatformConfig[] = [
|
||||
name: '数据看板',
|
||||
uniqueKey: 'databoard',
|
||||
url: 'https://seller.shopee.com.my/',
|
||||
checkSelector: '.rate-manager-content',
|
||||
checkSelector: '.page-container',
|
||||
fields: [
|
||||
{
|
||||
label: "出货统计",
|
||||
@@ -119,6 +119,191 @@ export const PLATFORM_CONFIGS: PlatformConfig[] = [
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
name: "广告中心",
|
||||
uniqueKey: "adscenter",
|
||||
url: "https://seller.shopee.com.my/portal/marketing/pas/index",
|
||||
checkSelector: '.page-container',
|
||||
fields: [
|
||||
{
|
||||
label: "我的账户",
|
||||
className: ".my-account-wrap",
|
||||
keys: [
|
||||
{
|
||||
label: "广告余额",
|
||||
className: ".credit-expense-label-wrapper:nth-child(1) .ellipsis-content"
|
||||
},
|
||||
{
|
||||
label: "今日广告花费",
|
||||
className: ".credit-expense-label-wrapper:nth-child(2) .ellipsis-content"
|
||||
},
|
||||
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "进行中广告列表",
|
||||
className: ".eds-table__body-container",
|
||||
type: 2,
|
||||
tableParts: [
|
||||
{ name: "fixed", select: ".eds-table__fix-body" },
|
||||
{ name: "main", select: ".eds-table__main-body" }
|
||||
],
|
||||
keys: [
|
||||
{
|
||||
label: "广告信息",
|
||||
className: ".info-containter",
|
||||
part: "fixed",
|
||||
keys: [
|
||||
{
|
||||
label: "广告名称",
|
||||
className: ".campaign-name-container"
|
||||
},
|
||||
{
|
||||
label: "广告类型",
|
||||
className: ".gmv-max-noti"
|
||||
},
|
||||
{
|
||||
label: "结束时间",
|
||||
className: ".time-edit-wrapper"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
label: "每日预算",
|
||||
part: "main",
|
||||
className: "td:nth-child(1)"
|
||||
},
|
||||
{
|
||||
label: "目标ROAS",
|
||||
part: "main",
|
||||
className: "td:nth-child(2)"
|
||||
},
|
||||
{
|
||||
label: "花费",
|
||||
part: "main",
|
||||
className: "td:nth-child(4)"
|
||||
},
|
||||
{
|
||||
label: "销售额",
|
||||
part: "main",
|
||||
className: "td:nth-child(5)"
|
||||
},
|
||||
{
|
||||
label: "广告支出回报率",
|
||||
part: "main",
|
||||
className: "td:nth-child(6)"
|
||||
}
|
||||
],
|
||||
pagination: {
|
||||
nextBtn: ".eds-pager__button-next", // 下一页按钮
|
||||
disabledClass: ".eds-button--disabled", // 按钮禁用时的class(用来判断结束)
|
||||
maxPage: 1, // 最大爬取页数
|
||||
delay: 2000 // 翻页后的等待加载时间
|
||||
},
|
||||
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
name: "评论管理",
|
||||
uniqueKey: "message",
|
||||
url: "https://seller.shopee.com.my/portal/settings/shop/rating",
|
||||
checkSelector: '.page-container',
|
||||
fields: [
|
||||
{
|
||||
label: "低星评论",
|
||||
className: ".border-solid.rounded",
|
||||
condition: {
|
||||
list: [
|
||||
".flex.items-center.mt-6 div:nth-child(3)",
|
||||
".eds-react-checkbox-group label:nth-child(2)",
|
||||
".eds-react-checkbox-group label:nth-child(3)",
|
||||
".eds-react-checkbox-group label:nth-child(4)"
|
||||
],
|
||||
time: 200,
|
||||
},
|
||||
type: 1,
|
||||
keys: [
|
||||
{
|
||||
label: "用户",
|
||||
className: ".flex.items-center.justify-start .ml-2"
|
||||
},
|
||||
{
|
||||
label: "订单编号",
|
||||
className: ".underline.px-1"
|
||||
},
|
||||
{
|
||||
label: "商品名称",
|
||||
className: ".min-w-0.font-medium.break-all"
|
||||
},
|
||||
{
|
||||
label: "规格",
|
||||
className: ".min-w-0.font-medium.break-all + div"
|
||||
},
|
||||
{
|
||||
label: "评价内容",
|
||||
className: ".min-w-0.overflow-hidden",
|
||||
condition: {
|
||||
list: [
|
||||
"span.cursor-pointer"
|
||||
],
|
||||
time: 200,
|
||||
},
|
||||
|
||||
},
|
||||
],
|
||||
pagination: {
|
||||
nextBtn: ".eds-react-pagination-pager__button-next",
|
||||
maxPage: 2, // 最大爬取页数
|
||||
delay: 2000 // 翻页后的等待加载时间
|
||||
},
|
||||
|
||||
},
|
||||
]
|
||||
},
|
||||
{
|
||||
name: "账户健康状态",
|
||||
uniqueKey: "accounthealth",
|
||||
url: "https://seller.shopee.com.my/portal/accounthealth/home",
|
||||
checkSelector: '.page-container',
|
||||
fields: [
|
||||
{
|
||||
label: "健康状态",
|
||||
className: ".metric-content",
|
||||
type: 1,
|
||||
keys: [
|
||||
{
|
||||
label: "模块名",
|
||||
className: ".metric-type"
|
||||
},
|
||||
{
|
||||
label: "值",
|
||||
className: ".metric-item",
|
||||
type: 1,
|
||||
keys: [
|
||||
{
|
||||
label: "指标",
|
||||
className: "p.metric-text"
|
||||
},
|
||||
{
|
||||
label: "值",
|
||||
className: ".metric-my"
|
||||
},
|
||||
{
|
||||
label: "目标",
|
||||
className: ".metric-target"
|
||||
},
|
||||
{
|
||||
label: "使用类型",
|
||||
className: ".metric-applied-to"
|
||||
},
|
||||
]
|
||||
},
|
||||
],
|
||||
|
||||
},
|
||||
]
|
||||
}
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
@@ -12,7 +12,7 @@ const isPanelOpen = ref(false);
|
||||
let timer: number | undefined;
|
||||
|
||||
// 只有任务处于运行中时,才在网页右下角展示计时按钮。
|
||||
const isVisible = computed(() => crawlState.value?.status === 'running');
|
||||
const isVisible = computed(() => crawlState.value ? ['running', 'paused'].includes(crawlState.value.status) : false);
|
||||
|
||||
// 内容脚本挂载后立即同步一次状态,并开始每秒刷新计时和任务进度。
|
||||
onMounted(() => {
|
||||
@@ -85,6 +85,14 @@ function getStepText(status: string): string {
|
||||
return textMap[status] ?? status;
|
||||
}
|
||||
|
||||
/**
 * Ask background to continue the paused crawl task, then re-read the task
 * state so the panel reflects the result.
 */
async function handleResumeCrawl() {
  const resumeCommand = { action: 'RESUME_CRAWL' };

  await sendBackgroundMessage(resumeCommand);
  await refreshCrawlState();
}
|
||||
|
||||
/**
|
||||
* 发送消息到 background;非扩展环境下返回空成功响应,方便本地页面不报错。
|
||||
*/
|
||||
@@ -121,6 +129,11 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
|
||||
</div>
|
||||
</li>
|
||||
</ol>
|
||||
|
||||
<div v-if="crawlState.status === 'paused' && crawlState.pause" class="dianshan-crawl-pause">
|
||||
<p>{{ crawlState.pause.message }}</p>
|
||||
<button type="button" @click="handleResumeCrawl">我已处理,继续</button>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
</template>
|
||||
@@ -223,6 +236,35 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
|
||||
color: #b91c1c;
|
||||
}
|
||||
|
||||
.dianshan-crawl-pause {
|
||||
display: grid;
|
||||
gap: 8px;
|
||||
margin-top: 12px;
|
||||
padding: 10px;
|
||||
border: 1px solid #f59e0b;
|
||||
border-radius: 8px;
|
||||
background: #fffbeb;
|
||||
}
|
||||
|
||||
.dianshan-crawl-pause p {
|
||||
margin: 0;
|
||||
color: #92400e;
|
||||
font-size: 12px;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.dianshan-crawl-pause button {
|
||||
width: 100%;
|
||||
border: 0;
|
||||
border-radius: 6px;
|
||||
padding: 8px 10px;
|
||||
color: #ffffff;
|
||||
background: #059669;
|
||||
cursor: pointer;
|
||||
font-size: 12px;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.is-running .dianshan-crawl-dot,
|
||||
.is-success .dianshan-crawl-dot {
|
||||
background: #10b981;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { createApp } from 'vue';
|
||||
import { createApp } from 'vue';
|
||||
import App from './App.vue';
|
||||
import { setupPageRunner } from './pageRunner';
|
||||
|
||||
/**
|
||||
* 将内容脚本应用挂载到页面中。
|
||||
@@ -9,17 +10,15 @@ function mountApp() {
|
||||
return;
|
||||
}
|
||||
|
||||
// 内容脚本在宿主页面中的根容器。
|
||||
// 用于避免污染业务页面结构。
|
||||
const container = document.createElement('div');
|
||||
container.id = 'dianshan-crx-root';
|
||||
// Vue 应用实际挂载的节点。
|
||||
const appRoot = document.createElement('div');
|
||||
|
||||
container.appendChild(appRoot);
|
||||
document.body.appendChild(container);
|
||||
|
||||
createApp(App).mount(appRoot);
|
||||
setupPageRunner();
|
||||
}
|
||||
|
||||
if (document.readyState === 'loading') {
|
||||
|
||||
207
src/content/pageRunner.ts
Normal file
207
src/content/pageRunner.ts
Normal file
@@ -0,0 +1,207 @@
|
||||
import { processFields, type DomScrapeResult } from '@/background/domScraper';
|
||||
import type { CrawlPauseInfo, PlatformFieldConfig } from '@/types';
|
||||
|
||||
/**
 * Request to scrape the current page for one step.
 */
interface ScrapeStepMessage {
  action: 'SCRAPE_STEP';
  payload: {
    // Field configuration handed to processFields.
    fields: PlatformFieldConfig[];
    // Selector that must be present and visible before scraping starts.
    checkSelector: string;
  };
}

/**
 * Request to only run interrupt detection on the current page.
 */
interface CheckInterruptMessage {
  action: 'CHECK_INTERRUPT';
}

/** Every message the page runner handles. */
type PageRunnerMessage = ScrapeStepMessage | CheckInterruptMessage;

/**
 * Reply sent back to the sender for each page runner message.
 */
interface PageRunnerResponse {
  // Whether the request was handled successfully.
  ok: boolean;
  // Scraped data; set on a successful SCRAPE_STEP.
  data?: DomScrapeResult | null;
  // Set when the page needs manual handling or is not ready.
  interrupt?: CrawlPauseInfo;
  // Failure description, e.g. for an unknown action.
  error?: string;
}
|
||||
|
||||
/**
 * Register the page runner so background can trigger interrupt detection and
 * DOM scraping inside the target page.
 *
 * The listener always answers asynchronously (it returns `true` to keep the
 * message channel open). A rejection from the handler is converted into an
 * `{ ok: false, error }` reply; without that, `sendResponse` would never be
 * called and the sender would be left without a structured answer.
 */
export function setupPageRunner(): void {
  chrome.runtime.onMessage.addListener((message: PageRunnerMessage, _sender, sendResponse) => {
    void handlePageRunnerMessage(message)
      .catch(
        (error: unknown): PageRunnerResponse => ({
          ok: false,
          error: error instanceof Error ? error.message : String(error),
        }),
      )
      .then(sendResponse);

    // Keep the channel open until the async reply above is sent.
    return true;
  });
}
|
||||
|
||||
/**
 * Handle one page-execution message sent by background.
 *
 * - `CHECK_INTERRUPT`: only runs interrupt detection and reports the result.
 * - `SCRAPE_STEP`: checks for interrupts, waits up to 18 s for the step's
 *   `checkSelector` to be visible and stable, then scrapes the configured
 *   fields from `document.body`.
 *
 * If the readiness wait fails, interrupts are checked a second time: a login
 * or captcha screen that only rendered during the wait is then reported with
 * its real reason instead of the generic `page_not_ready`.
 *
 * @param message Message received from background.
 * @returns The reply to send back; unknown actions yield `{ ok: false, error }`.
 */
async function handlePageRunnerMessage(message: PageRunnerMessage): Promise<PageRunnerResponse> {
  if (message.action === 'CHECK_INTERRUPT') {
    return { ok: true, interrupt: detectPageInterrupt() };
  }

  if (message.action === 'SCRAPE_STEP') {
    const { fields, checkSelector } = message.payload;
    const interrupt = detectPageInterrupt();

    if (interrupt) {
      return { ok: false, interrupt };
    }

    const readyElement = await waitForStableSelector(checkSelector, 18000);

    if (!readyElement) {
      // The page may have switched to a login/captcha view while we waited.
      const lateInterrupt = detectPageInterrupt();

      return {
        ok: false,
        interrupt: lateInterrupt ?? {
          reason: 'page_not_ready',
          message: '页面关键内容暂未加载,请确认页面是否正常显示后继续',
        },
      };
    }

    const data = await processFields(fields, document.body);
    return { ok: true, data };
  }

  return { ok: false, error: '未知页面执行指令' };
}
|
||||
|
||||
/**
 * Detect whether the current page needs manual handling by the user:
 * a captcha/risk-control check, a login prompt, or a missing page.
 *
 * Checks run in that order and stop at the first match.
 *
 * @returns The pause info for the first matching condition, or `undefined`
 *          when none applies.
 */
function detectPageInterrupt(): CrawlPauseInfo | undefined {
  const checks: Array<{ matches: () => boolean; info: CrawlPauseInfo }> = [
    {
      matches: isShieldPage,
      info: {
        reason: 'shield',
        message: '检测到验证码或风控验证,请在打开的商家后台窗口处理完成后继续',
      },
    },
    {
      matches: isLoginPage,
      info: {
        reason: 'reauth',
        message: '检测到需要重新登录,请在打开的商家后台窗口登录完成后继续',
      },
    },
    {
      matches: isNotFoundPage,
      info: {
        reason: 'not_found',
        message: '当前页面不存在或已失效,请确认平台配置里的页面地址是否正确',
      },
    },
  ];

  // find() evaluates the predicates lazily, in order, stopping at the first hit.
  return checks.find(({ matches }) => matches())?.info;
}
|
||||
|
||||
/**
 * Decide whether the page is a captcha, traffic-shield or risk-control
 * verification page.
 *
 * True when the path starts with `/verify/captcha` or `/verify/traffic`, or
 * when any element matching the captcha selectors is visible. All matches are
 * inspected, so a hidden first match does not mask a visible captcha that
 * appears later in the document.
 */
function isShieldPage(): boolean {
  const path = location.pathname.toLowerCase();

  if (path.startsWith('/verify/captcha') || path.startsWith('/verify/traffic')) {
    return true;
  }

  const shieldCandidates = document.querySelectorAll(
    '[data-name="verification"], .ant-captcha, #captchaContainer, [class*="captcha" i], [id*="captcha" i]',
  );

  return Array.from(shieldCandidates).some((candidate) => isVisibleElement(candidate));
}
|
||||
|
||||
/**
 * Decide whether the page asks the user to log in or re-enter a password.
 *
 * Three signals are tried in order: a login-like URL path, a visible password
 * input, and login wording within the first 3000 characters of the body text.
 */
function isLoginPage(): boolean {
  const pathname = location.pathname.toLowerCase();
  const loginPathPatterns = [
    /^\/(?:buyer\/)?login\b/i,
    /^\/account\/(?:signin|login)\b/i,
    /^\/portal\/login\b/i,
  ];

  if (loginPathPatterns.some((pattern) => pattern.test(pathname))) {
    return true;
  }

  const passwordInputs = Array.from(document.querySelectorAll('input[type="password"]'));

  if (passwordInputs.some((input) => isVisibleElement(input))) {
    return true;
  }

  const bodyText = document.body.innerText.slice(0, 3000);
  const loginTextPatterns = [
    /enter\s+(your\s+)?password\s+to\s+continue/i,
    /sign\s+in\s+(again\s+)?to\s+continue/i,
    /please\s+(re-?)?enter\s+(your\s+)?password/i,
    /请(再次|重新)?输入(您的)?密码/,
    /请登录|重新登录|登录后继续/,
  ];

  for (const pattern of loginTextPatterns) {
    if (pattern.test(bodyText)) {
      return true;
    }
  }

  return false;
}
|
||||
|
||||
/**
 * Decide whether the page is a "not found", removed or error page.
 *
 * Each pattern is tested against the document title and against the first
 * 8000 characters of the body text.
 */
function isNotFoundPage(): boolean {
  const text = document.body.innerText.slice(0, 8000);
  const title = document.title;
  const notFoundPatterns = [
    /page\s+not\s+found/i,
    /the\s+page\s+you\s+are\s+looking\s+for/i,
    /this\s+page\s+(has\s+been\s+)?removed/i,
    /product\s+(is\s+)?unavailable/i,
    /页面不存在|找不到(此|该)?页面|抱歉.*不存在|(商品|产品)已下架/,
  ];

  for (const pattern of notFoundPatterns) {
    if (pattern.test(title) || pattern.test(text)) {
      return true;
    }
  }

  return false;
}
|
||||
|
||||
/**
 * Wait until an element matching `selector` is visible and still visible
 * 600 ms later.
 *
 * If the element disappears during the stability window (for example when
 * the page re-renders), polling continues until the deadline instead of
 * giving up immediately.
 *
 * @param selector  CSS selector of the key element.
 * @param timeoutMs Maximum time to keep polling, in milliseconds.
 * @returns The stable element, or `null` when none was found before the deadline.
 */
async function waitForStableSelector(selector: string, timeoutMs: number): Promise<Element | null> {
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    const element = document.querySelector(selector);

    if (element && isVisibleElement(element)) {
      await sleep(600);

      // Re-query: the node seen before the pause may have been replaced.
      const stableElement = document.querySelector(selector);

      if (stableElement && isVisibleElement(stableElement)) {
        return stableElement;
      }

      // It vanished during the stability window; keep polling while time remains.
      continue;
    }

    await sleep(500);
  }

  return null;
}
|
||||
|
||||
/**
 * Decide whether an element is actually visible: attached to the document,
 * not hidden through `display`, `visibility` or near-zero `opacity`, and
 * occupying a non-empty box.
 */
function isVisibleElement(element: Element): boolean {
  if (!element.isConnected) {
    return false;
  }

  const computed = element.ownerDocument.defaultView?.getComputedStyle(element);

  if (!computed) {
    return false;
  }

  const hiddenByCss =
    computed.display === 'none' || computed.visibility === 'hidden' || Number(computed.opacity) < 0.05;

  if (hiddenByCss) {
    return false;
  }

  const { width, height } = element.getBoundingClientRect();
  return width > 0 && height > 0;
}
|
||||
|
||||
/**
 * Minimal delay helper: resolves after `ms` milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    window.setTimeout(done, ms);
  });
}
|
||||
@@ -18,7 +18,9 @@ const selectedPlatform = computed(() =>
|
||||
);
|
||||
|
||||
const isLoggedIn = computed(() => token.value !== null);
|
||||
const isCrawling = computed(() => crawlState.value?.status === 'running');
|
||||
const shouldShowCrawlProgress = computed(() =>
|
||||
crawlState.value ? ['running', 'paused', 'completed', 'failed'].includes(crawlState.value.status) : false,
|
||||
);
|
||||
|
||||
onMounted(async () => {
|
||||
token.value = await getToken();
|
||||
@@ -82,6 +84,11 @@ async function handleCancelCrawl() {
|
||||
crawlState.value = response.data ?? null;
|
||||
}
|
||||
|
||||
// Ask background to resume the paused crawl and show the state it returns
// (or null when the response carries no data).
async function handleResumeCrawl() {
  const { data } = await sendBackgroundMessage<CrawlTaskState>({ action: 'RESUME_CRAWL' });
  crawlState.value = data ?? null;
}
|
||||
|
||||
async function refreshCrawlState() {
|
||||
const response = await sendBackgroundMessage<CrawlTaskState | null>({ action: 'GET_CRAWL_STATE' });
|
||||
|
||||
@@ -162,18 +169,32 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
|
||||
</button>
|
||||
</template>
|
||||
|
||||
<template v-else-if="isCrawling && crawlState">
|
||||
<template v-else-if="shouldShowCrawlProgress && crawlState">
|
||||
<section class="space-y-4">
|
||||
<div class="flex items-center justify-between rounded-md bg-white px-3 py-2 shadow-sm">
|
||||
<div>
|
||||
<p class="text-sm font-medium text-slate-800">{{ crawlState.platformName }}</p>
|
||||
<p class="text-xs text-slate-500">已运行 {{ formatElapsed(elapsedSeconds) }}</p>
|
||||
<p class="text-xs text-slate-500">
|
||||
{{ crawlState.status === 'paused' ? '已暂停' : '已运行 ' + formatElapsed(elapsedSeconds) }}
|
||||
</p>
|
||||
</div>
|
||||
<div class="flex items-center gap-2">
|
||||
<button v-if="crawlState.status === 'paused'" type="button"
|
||||
class="text-xs text-emerald-600 transition hover:text-emerald-700"
|
||||
@click="handleResumeCrawl">
|
||||
继续
|
||||
</button>
|
||||
<button type="button" class="text-xs text-red-600 transition hover:text-red-700"
|
||||
@click="handleCancelCrawl">
|
||||
取消
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div v-if="crawlState.status === 'paused' && crawlState.pause"
|
||||
class="rounded-md border border-amber-200 bg-amber-50 px-3 py-2 text-sm text-amber-800">
|
||||
{{ crawlState.pause.message }}
|
||||
</div>
|
||||
|
||||
<ol class="space-y-3">
|
||||
<li v-for="(step, index) in crawlState.steps" :key="step.uniqueKey"
|
||||
@@ -187,6 +208,8 @@ function sendBackgroundMessage<T>(message: unknown): Promise<{ ok: boolean; data
|
||||
<span class="text-xs">{{ getStepText(step.status) }}</span>
|
||||
</div>
|
||||
<p v-if="step.message" class="mt-1 text-xs">{{ step.message }}</p>
|
||||
<pre v-if="step.result !== undefined"
|
||||
class="mt-2 max-h-32 overflow-auto rounded bg-slate-950 p-2 text-[11px] leading-4 text-slate-100">{{ JSON.stringify(step.result, null, 2) }}</pre>
|
||||
</div>
|
||||
</li>
|
||||
</ol>
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
export type CrawlStepStatus = 'pending' | 'running' | 'success' | 'failed';
|
||||
|
||||
// 整体爬取任务状态。
|
||||
export type CrawlTaskStatus = 'running' | 'completed' | 'failed' | 'canceled';
|
||||
export type CrawlTaskStatus = 'running' | 'paused' | 'completed' | 'failed' | 'canceled';
|
||||
|
||||
// 时间轴中的单个爬取步骤进度。
|
||||
export interface CrawlProgressStep {
|
||||
@@ -14,6 +14,16 @@ export interface CrawlProgressStep {
|
||||
status: CrawlStepStatus;
|
||||
// 状态补充说明,如失败原因。
|
||||
message?: string;
|
||||
// 当前步骤抓取到的数据结果。
|
||||
result?: unknown;
|
||||
}
|
||||
|
||||
// Why a crawl was paused; usually triggered by a login prompt, a captcha, or a missing page.
export interface CrawlPauseInfo {
  // Machine-readable pause reason code.
  reason: 'reauth' | 'shield' | 'not_found' | 'page_not_ready';
  // Hint shown to the user explaining what to do.
  message: string;
}
|
||||
|
||||
// 当前正在执行的爬取任务快照,供 popup 和 content script 同步展示。
|
||||
@@ -30,6 +40,8 @@ export interface CrawlTaskState {
|
||||
startedAt: number;
|
||||
// 当前任务状态。
|
||||
status: CrawlTaskStatus;
|
||||
// 暂停信息;仅 status 为 paused 时存在。
|
||||
pause?: CrawlPauseInfo;
|
||||
// 当前执行到的步骤下标。
|
||||
currentStepIndex: number;
|
||||
// 平台 steps 映射出的时间轴进度。
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
export type {
|
||||
CrawlPauseInfo,
|
||||
CrawlProgressStep,
|
||||
CrawlStepStatus,
|
||||
CrawlTaskState,
|
||||
|
||||
@@ -23,18 +23,8 @@ export interface PlatformPaginationConfig {
|
||||
|
||||
// 表格分段配置,用于兼容一个数据块由多个 table 或多个 table 片段组成的情况。
|
||||
export interface PlatformTablePartConfig {
|
||||
// 当前 table 或表格片段的名称。
|
||||
label: string;
|
||||
// 当前 table 或表格片段的兼容名称,兼容 message.js 中的 name 写法。
|
||||
name?: string;
|
||||
// 当前 table 或表格片段的 CSS 选择器。
|
||||
className: string;
|
||||
// 当前 table 或表格片段的兼容选择器,兼容 message.js 中的 select 写法。
|
||||
select?: string;
|
||||
// 行元素选择器,不填时由采集逻辑使用默认行选择器。
|
||||
rowSelector?: string;
|
||||
// 当前 table 或表格片段下需要采集的字段。
|
||||
keys?: PlatformFieldConfig[];
|
||||
}
|
||||
|
||||
// 页面字段配置,描述一个普通元素、列表元素或表格元素如何从 DOM 中提取数据。
|
||||
|
||||
52
step.md
52
step.md
@@ -1,52 +0,0 @@
|
||||
# 项目结构
|
||||
```angular2html
|
||||
src:.
|
||||
├─assets # 静态资源目录
|
||||
│ vite.svg # 这里的资源通常用于图标、Logo 或扩展程序内部引用的图片
|
||||
│
|
||||
├─background # 后台脚本 (Background Script / Service Worker)
|
||||
│ index.ts # 扩展的“大脑”,常驻后台运行,处理事件监听、报文转发、存储管理等
|
||||
│
|
||||
├─config # 配置目录
|
||||
│ platforms.ts # 自定义配置,各种平台(如不同网站、不同浏览器)的适配配置
|
||||
│
|
||||
├─content # 内容脚本 (Content Script)
|
||||
│ │ App.vue # 注入到网页中的 UI 组件(通常用于在目标页面侧边栏或浮窗显示界面)
|
||||
│ │ main.ts # 内容脚本的入口文件,负责将 Vue 组件挂载到宿主页面的 DOM 中
|
||||
│ │
|
||||
│ └─views # 内容脚本相关的子视图或组件
|
||||
│
|
||||
├─options # 选项页 (Options Page)
|
||||
│ App.vue # 扩展设置页面的 UI(右键扩展图标点击“选项”打开的页面)
|
||||
│ index.html # 选项页的 HTML 宿主文件
|
||||
│ main.ts # 选项页的 Vue 入口文件
|
||||
│
|
||||
├─popup # 弹窗页 (Popup Page)
|
||||
│ App.vue # 点击扩展图标时显示的弹出框 UI
|
||||
│ index.html # 弹窗页的 HTML 宿主文件
|
||||
│ main.ts # 弹窗页的 Vue 入口文件
|
||||
│
|
||||
├─shared # 共享代码库 (Shared)
|
||||
│ # 存放被 background、content、popup 等多个模块共同引用的工具函数、常量、API封装等
|
||||
│
|
||||
└─types # 类型定义目录
|
||||
index.ts # 存放全局的 TypeScript 接口(Interface)和类型(Type)定义
|
||||
```
|
||||
|
||||
# 开发步骤
|
||||
1.在popup模块中的App.vue中用tailwindcss编写,点击扩展图标时出现的弹窗,逻辑如下
|
||||
- 在未登录情况下,即storage中token字段是否存在,如果不存在,弹窗内容只用显示扩展名字、描述、请登录按钮,底部扩展版本
|
||||
- 当点击登录按钮后,先模拟登录,写死token,之后ui如下
|
||||
- 显示扩展名字、描述、一个平台选择框(通过读取config/platforms.ts)的内容for循环显示平台、扫描按钮、最底部Row(退出按钮,扩展版本号)
|
||||
- 注意:token的存储和获取逻辑放到/shared/auth.ts中去,如果涉及到接口和枚举的定义,请判断是否是全局类型
|
||||
- 如果是,该类型写到一个新文件中,并放到types/下,如果不是,放到当前模块的types/目录下(如果没有,新建)
|
||||
|
||||
2.前提:当1完成后,点击popup的立即爬取已经可以打开一个新的窗口了
|
||||
- 在所有网页(包括新打开的窗口和所有网页)的右下角都放一个圆形正计时(表示正在爬取中)
|
||||
- 点击圆形正计时时,出现一个popup,内容如下
|
||||
- 以时间轴的形式,表示当前爬取进度,即:根据platforms.ts中的steps
|
||||
- 同时点击扩展的popup里的内容,也变得和上面的时间轴内容一致,显示爬取进度,隐藏立即爬取等按钮,
|
||||
|
||||
3.前提:1和2都已完成,ui和交互操作上ok
|
||||
- 开始爬取网页中的数据,查看message.js内容,把里面的爬取方法都提取出来放到background/domScraper.ts中去,
|
||||
- 基于2,每次根据steps打开一个新网页后,根据它的fields数组字段,调用domScraper中的方法,来提取数据,并打印到控制台即可
|
||||
@@ -1 +1 @@
|
||||
{"root":["./manifest.config.ts","./message.js","./vite.config.ts","./src/background/domscraper.ts","./src/background/index.ts","./src/background/service.ts","./src/background/types.ts","./src/background/service/crawltask.ts","./src/background/service/lifecycle.ts","./src/background/service/taskstate.ts","./src/config/platforms.ts","./src/content/app.vue","./src/content/main.ts","./src/options/app.vue","./src/options/main.ts","./src/popup/app.vue","./src/popup/main.ts","./src/shared/auth.ts","./src/types/crawl.ts","./src/types/index.ts","./src/types/platform.ts"],"version":"5.9.3"}
|
||||
{"root":["./manifest.config.ts","./message.js","./vite.config.ts","./src/background/domscraper.ts","./src/background/index.ts","./src/background/service.ts","./src/background/types.ts","./src/background/service/crawltask.ts","./src/background/service/lifecycle.ts","./src/background/service/taskstate.ts","./src/config/platforms.ts","./src/content/app.vue","./src/content/main.ts","./src/content/pagerunner.ts","./src/options/app.vue","./src/options/main.ts","./src/popup/app.vue","./src/popup/main.ts","./src/shared/auth.ts","./src/types/crawl.ts","./src/types/index.ts","./src/types/platform.ts","./storeai-extension-v0.1.0/service-worker-loader.js","./storeai-extension-v0.1.0/assets/config-cf-xklo9.js","./storeai-extension-v0.1.0/assets/fetch-hook.ts-bvrghr__.js","./storeai-extension-v0.1.0/assets/index-dxg1qimp.js","./storeai-extension-v0.1.0/assets/index.ts-dirvxn_b.js","./storeai-extension-v0.1.0/assets/orchestrator.ts-bleul1fk.js","./storeai-extension-v0.1.0/assets/orchestrator.ts-loader-drev6v6h.js","./storeai-extension-v0.1.0/assets/popup-dbgvbs2c.js","./storeai-extension-v0.1.0/assets/selectors-xrdds_u0.js"],"version":"5.9.3"}
|
||||
Reference in New Issue
Block a user