From c1355385e1a0b55a3d8ad6272c083875caab5d3d Mon Sep 17 00:00:00 2001 From: nianzhibai <177086871+nianzhibai@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:22:47 +0800 Subject: [PATCH] feat(crawler): simplify script crawler workflow Redesign crawler management around imported Python scripts instead of built-in crawler storage. Crawler scripts now declare CRAWLER_NAME, imports validate metadata, crawler IDs are generated internally, and deleted crawler scripts are detached without deleting already imported videos. Add backend support for file and URL script imports, dry-run testing, metadata parsing, safer job paths, original filename preservation, and crawler listing that ignores detached script records. Remove the legacy built-in Spider91 script path flow and hidden Python/config JSON fields from the crawler API. Rework the admin crawler page into an independent crawler console with script import, dry-run testing, status metrics, spider iconography, and simplified controls. Update docs, examples, installer checks, Docker/release packaging, and tests for the new protocol. --- 91VideoSpider/spider_91porn.py | 1 + Dockerfile | 1 - README.md | 2 +- backend/README.md | 2 +- backend/cmd/server/main.go | 32 +- backend/cmd/server/main_test.go | 3 +- backend/internal/api/admin.go | 274 ++++++-- backend/internal/api/admin_test.go | 368 ++++++++++- .../internal/drives/scriptcrawler/crawler.go | 16 +- .../drives/scriptcrawler/crawler_test.go | 58 ++ .../internal/drives/scriptcrawler/dryrun.go | 375 +++++++++++ .../drives/scriptcrawler/dryrun_test.go | 153 +++++ .../internal/drives/scriptcrawler/metadata.go | 117 ++++ .../drives/scriptcrawler/metadata_test.go | 39 ++ deploy.sh | 8 +- docs/crawler-protocol.md | 14 +- examples/crawlers/simple_crawler.py | 2 + install.sh | 2 +- scripts/build-release.sh | 2 - src/admin/CrawlersPage.tsx | 479 ++++++++++---- src/admin/api.ts | 48 +- src/admin/drive/constants.ts | 2 +- src/styles/admin.css | 618 +++++++++++++++++- tests/adminDriveForm.test.ts | 13 +- 24 files changed, 2355 insertions(+), 274 deletions(-) create mode 100644 backend/internal/drives/scriptcrawler/dryrun.go create mode 100644 backend/internal/drives/scriptcrawler/dryrun_test.go create mode 100644 backend/internal/drives/scriptcrawler/metadata.go create mode 100644 backend/internal/drives/scriptcrawler/metadata_test.go diff --git a/91VideoSpider/spider_91porn.py b/91VideoSpider/spider_91porn.py index 6140129..82b12d5 100644 --- a/91VideoSpider/spider_91porn.py +++ b/91VideoSpider/spider_91porn.py @@ -154,6 +154,7 @@ OUTPUT_FILE = "91porn_videos.json" MAX_PAGES = None # 设置为 None 爬取所有页,或设置整数如 5 只爬前5页 RESUME = True # 是否跳过输出文件中已存在的 viewkey (断点续爬) MAX_EMPTY_PAGES = 2 # 连续空页数达到此值时停止爬取 +CRAWLER_NAME = "91Porn" CRAWLER_PROTOCOL = "crawler.v1" # =================================================== diff --git a/Dockerfile b/Dockerfile index d5faca6..e90d54a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,7 +48,6 @@ WORKDIR /opt/video-site-91 COPY --from=backend /out/server ./server COPY --from=frontend /app/dist ./dist COPY backend/config.example.yaml ./config.example.yaml -COPY 91VideoSpider/ ./91VideoSpider/ COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh ARG VERSION=dev diff --git a/README.md b/README.md index 1f90854..bca8371 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ - **多后端支持** — 兼容 115 云盘、PikPak 云盘、123云盘、OneDrive、Google Drive 和本地存储 - **低带宽播放** — 115 云盘、PikPak 云盘、123云盘、OneDrive 都支持302模式,在线播放视频时,不占用服务器带宽,播放体验不受服务器带宽影响;Google Drive 不支持302模式,走服务器中转,观看体验会受服务器带宽影响 - **封面 & 预览片段** — 自动为每个视频生成封面图和预览片段,首页快速选片 -- **91 爬虫** — 内置爬虫,支持抓取 91 本月最热视频 +- **脚本爬虫** — 不内置任何爬虫,支持在后台导入自定义爬虫脚本(上传 `.py` 文件 / 链接导入 / 服务器路径),按统一协议抓取视频 - **双主题** — 黑黄经典主题 / 粉白清新主题,随时切换 - **短视频模式** — 一键切换抖音风格,沉浸刷片 - **低资源占用** — 2C2G 服务器稳定运行,主要性能消耗就是封面图和预览视频的生成 diff --git a/backend/README.md b/backend/README.md index 1e8e236..09455b5 100644 --- a/backend/README.md +++ b/backend/README.md @@ -84,7 +84,7 @@ go run ./cmd/server 后端 9192 爬虫现在是独立后台栏目 `/admin/crawlers`,不再作为“网盘/存储类型”配置。脚本负责发现视频,后端负责去重、下载、入库、封面、预览视频和视频指纹。 -脚本协议见 [docs/crawler-protocol.md](../docs/crawler-protocol.md)。后台支持上传 `.py` 文件或通过 HTTP(S) 脚本链接导入,导入后的脚本会保存到数据目录旁的 `crawler-scripts/`。内置 91 爬虫也支持同一套 `crawler.v1` job 协议;后台“内置 91”会自动使用仓库里的 `91VideoSpider/spider_91porn.py`。 +脚本协议见 [docs/crawler-protocol.md](../docs/crawler-protocol.md)。后台支持上传 `.py` 文件或通过 HTTP(S) 脚本链接导入,导入后的脚本会保存到数据目录旁的 `crawler-scripts/`。脚本必须声明 `CRAWLER_NAME`,后台会自动读取它作为爬虫名称。项目不内置任何爬虫脚本,所有爬虫都由用户自行导入。 ## 添加一个盘 diff --git a/backend/cmd/server/main.go b/backend/cmd/server/main.go index 646dddb..3ac03a1 100644 --- a/backend/cmd/server/main.go +++ b/backend/cmd/server/main.go @@ -239,9 +239,6 @@ func main() { SetSpider91UploadDriveID: func(id string) error { return app.SetSpider91UploadDriveID(ctx, id) }, - DefaultSpider91ScriptPath: func() string { - return app.defaultSpider91ScriptPath() - }, OnRunNightlyJob: func() bool { if app.nightlyRunner != nil { return app.nightlyRunner.TriggerNow() @@ -881,30 +878,6 @@ func (a *App) commonThumbsDir() string { return filepath.Join(a.cfg.Storage.LocalPreviewDir, "thumbs") } -// defaultSpider91ScriptPath 推断仓库里爬虫脚本的默认路径。 -// 当前进程从 backend/ 启动时,脚本位于 ../91VideoSpider/spider_91porn.py。 -// 找不到时返回空字符串,上层会在 RunOnce 时报错提示用户手动填 script_path。 -func (a *App) defaultSpider91ScriptPath() string { - candidates := []string{ - // 优先从配置目录的父目录定位 - filepath.Join(filepath.Dir(filepath.Dir(a.cfg.Storage.LocalPreviewDir)), "91VideoSpider", "spider_91porn.py"), - // 仓库 root(cwd 在 backend/ 时) - filepath.Join("..", "91VideoSpider", "spider_91porn.py"), - // cwd 已经是仓库 root 时 - filepath.Join("91VideoSpider", "spider_91porn.py"), - } - for _, p := range candidates { - abs, err := filepath.Abs(p) - if err != nil { - continue - } - if _, err := os.Stat(abs); err == nil { - return abs - } - } - return "" -} - // attachScriptCrawler 创建通用脚本爬虫 runner,并注册到 a.scriptCrawlers。 func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) { pythonPath := strings.TrimSpace(d.Credentials["python_path"]) @@ -913,9 +886,6 @@ func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) { } scriptPath := strings.TrimSpace(d.Credentials["script_path"]) sourceKind := scriptCrawlerSourceKindForDrive(d) - if scriptPath == "" && sourceKind == spider91.Kind { - scriptPath = a.defaultSpider91ScriptPath() - } proxyURL := strings.TrimSpace(d.Credentials["proxy"]) configJSON := strings.TrimSpace(d.Credentials["config_json"]) workDir := "" @@ -2442,7 +2412,7 @@ func (a *App) listSpider91DriveIDs(ctx context.Context) []string { } out := make([]string, 0, len(all)) for _, d := range all { - if d != nil && d.Kind == scriptcrawler.Kind { + if d != nil && d.Kind == scriptcrawler.Kind && strings.TrimSpace(d.Credentials["script_path"]) != "" { out = append(out, d.ID) } } diff --git a/backend/cmd/server/main_test.go b/backend/cmd/server/main_test.go index 4004ab8..a06ac35 100644 --- a/backend/cmd/server/main_test.go +++ b/backend/cmd/server/main_test.go @@ -609,7 +609,8 @@ func TestNightlyTargetsComeFromCatalogBeforeDriveAttach(t *testing.T) { {ID: "115", Kind: "p115", Name: "115", RootID: "0", TeaserEnabled: true}, {ID: "pikpak", Kind: "pikpak", Name: "PikPak", RootID: "0", TeaserEnabled: true}, {ID: "91-legacy", Kind: "spider91", Name: "91 Legacy", RootID: "0", TeaserEnabled: true}, - {ID: "91-crawler", Kind: scriptcrawler.Kind, Name: "91 Spider", RootID: "/", TeaserEnabled: true}, + {ID: "91-crawler", Kind: scriptcrawler.Kind, Name: "91 Spider", RootID: "/", Credentials: map[string]string{"script_path": "/tmp/crawler.py"}, TeaserEnabled: true}, + {ID: "91-crawler-deleted", Kind: scriptcrawler.Kind, Name: "Deleted Spider", RootID: "/", Credentials: map[string]string{}, TeaserEnabled: true}, } { if err := cat.UpsertDrive(ctx, d); err != nil { t.Fatalf("seed drive %s: %v", d.ID, err) diff --git a/backend/internal/api/admin.go b/backend/internal/api/admin.go index 58c90af..35b4f67 100644 --- a/backend/internal/api/admin.go +++ b/backend/internal/api/admin.go @@ -68,9 +68,6 @@ type AdminServer struct { // Spider91 → 115/123/PikPak/OneDrive 上传目标 drive ID 读写 GetSpider91UploadDriveID func() string SetSpider91UploadDriveID func(driveID string) error - // DefaultSpider91ScriptPath returns the built-in Spider91 crawler script - // path for the independent crawler management UI. - DefaultSpider91ScriptPath func() string // OnRunNightlyJob 触发一次完整的凌晨流水线(Phase1 扫盘 + Phase2 91 爬虫 + // Phase3 迁移)。立即返回 —— 实际任务在后台跑,admin 在日志或下次状态查询里 // 看进度。若流水线正在跑或已排队,Runner 会拒绝重复触发。 @@ -163,6 +160,7 @@ func (a *AdminServer) Register(r chi.Router) { r.Post("/crawlers", a.handleUpsertCrawler) r.Post("/crawlers/import-file", a.handleImportCrawlerScriptFile) r.Post("/crawlers/import-url", a.handleImportCrawlerScriptURL) + r.Post("/crawlers/test-script", a.handleTestCrawlerScript) r.Delete("/crawlers/{id}", a.handleDeleteCrawler) r.Post("/crawlers/{id}/run", a.handleRunCrawler) r.Post("/crawlers/{id}/tasks/stop", a.handleStopCrawlerTasks) @@ -441,11 +439,6 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) { // LastCrawlAt 是 spider91 上次成功爬取的 unix 秒(来自 credentials.last_crawl_at)。 // 其它 kind 留 0;前端用它显示"上次抓取: N 小时前"。 Spider91Proxy string `json:"spider91Proxy,omitempty"` - ScriptCrawlerPythonPath string `json:"scriptCrawlerPythonPath,omitempty"` - ScriptCrawlerScriptPath string `json:"scriptCrawlerScriptPath,omitempty"` - ScriptCrawlerProxy string `json:"scriptCrawlerProxy,omitempty"` - ScriptCrawlerTargetNew string `json:"scriptCrawlerTargetNew,omitempty"` - ScriptCrawlerConfigJSON string `json:"scriptCrawlerConfigJson,omitempty"` LastCrawlAt int64 `json:"lastCrawlAt,omitempty"` GoogleDriveUseOnlineAPI *bool `json:"googleDriveUseOnlineAPI,omitempty"` ScanGenerationStatus GenerationStatus `json:"scanGenerationStatus"` @@ -513,11 +506,6 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) { TeaserEnabled: d.TeaserEnabled, SkipDirIDs: append([]string{}, d.SkipDirIDs...), Spider91Proxy: spider91ProxyForDrive(d), - ScriptCrawlerPythonPath: scriptCrawlerCred(d, "python_path"), - ScriptCrawlerScriptPath: scriptCrawlerCred(d, "script_path"), - ScriptCrawlerProxy: scriptCrawlerCred(d, "proxy"), - ScriptCrawlerTargetNew: scriptCrawlerCred(d, "target_new"), - ScriptCrawlerConfigJSON: scriptCrawlerCred(d, "config_json"), LastCrawlAt: lastCrawlAt, GoogleDriveUseOnlineAPI: googleDriveUseOnlineAPIForDrive(d), ScanGenerationStatus: generation.Scan, @@ -637,14 +625,11 @@ type crawlerDTO struct { ID string `json:"id"` Name string `json:"name"` Kind string `json:"kind"` - Builtin string `json:"builtin,omitempty"` Status string `json:"status"` LastError string `json:"lastError,omitempty"` ScriptPath string `json:"scriptPath"` - PythonPath string `json:"pythonPath,omitempty"` Proxy string `json:"proxy,omitempty"` TargetNew string `json:"targetNew,omitempty"` - ConfigJSON string `json:"configJson,omitempty"` LastCrawlAt int64 `json:"lastCrawlAt,omitempty"` ScanGenerationStatus GenerationStatus `json:"scanGenerationStatus"` ThumbnailGenerationStatus GenerationStatus `json:"thumbnailGenerationStatus"` @@ -663,13 +648,9 @@ type crawlerDTO struct { type upsertCrawlerReq struct { ID string `json:"id"` - Name string `json:"name"` - Builtin string `json:"builtin"` ScriptPath string `json:"scriptPath"` - PythonPath string `json:"pythonPath"` Proxy string `json:"proxy"` TargetNew string `json:"targetNew"` - ConfigJSON string `json:"configJson"` } func (a *AdminServer) handleListCrawlers(w http.ResponseWriter, r *http.Request) { @@ -700,7 +681,7 @@ func (a *AdminServer) handleListCrawlers(w http.ResponseWriter, r *http.Request) out := []crawlerDTO{} for _, d := range all { - if d == nil || !isCrawlerDriveKind(d.Kind) { + if d == nil || !isConfiguredCrawlerDrive(d) { continue } out = append(out, a.crawlerDTOForDrive(d, teaserCounts[d.ID], thumbnailCounts[d.ID], fingerprintCounts[d.ID], generationStatuses[d.ID])) @@ -729,16 +710,13 @@ func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, teaser catalog.DriveT } return crawlerDTO{ ID: d.ID, - Name: d.Name, + Name: crawlerNameForDrive(d), Kind: d.Kind, - Builtin: crawlerBuiltinForDrive(d), Status: d.Status, LastError: d.LastError, ScriptPath: strings.TrimSpace(d.Credentials["script_path"]), - PythonPath: strings.TrimSpace(d.Credentials["python_path"]), Proxy: strings.TrimSpace(d.Credentials["proxy"]), TargetNew: strings.TrimSpace(d.Credentials["target_new"]), - ConfigJSON: strings.TrimSpace(d.Credentials["config_json"]), LastCrawlAt: lastCrawlAt, ScanGenerationStatus: generation.Scan, ThumbnailGenerationStatus: generation.Thumbnail, @@ -756,11 +734,16 @@ func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, teaser catalog.DriveT } } -func crawlerBuiltinForDrive(d *catalog.Drive) string { +func crawlerNameForDrive(d *catalog.Drive) string { if d == nil { return "" } - return strings.TrimSpace(d.Credentials["builtin"]) + if d.Credentials != nil { + if meta, err := scriptcrawler.ReadMetadata(strings.TrimSpace(d.Credentials["script_path"])); err == nil { + return meta.Name + } + } + return strings.TrimSpace(d.Name) } func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request) { @@ -770,32 +753,21 @@ func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request return } id := strings.TrimSpace(body.ID) - name := strings.TrimSpace(body.Name) - if id == "" || name == "" { - http.Error(w, "id and name are required", http.StatusBadRequest) - return - } - existing, _ := a.Catalog.GetDrive(r.Context(), id) creds := map[string]string{} + var existing *catalog.Drive + if id != "" { + existing, _ = a.Catalog.GetDrive(r.Context(), id) + } if existing != nil { for k, v := range existing.Credentials { creds[k] = v } } - builtin := strings.TrimSpace(body.Builtin) - if builtin != "" { - creds["builtin"] = builtin - } scriptPath := strings.TrimSpace(body.ScriptPath) - if scriptPath == "" && builtin == "spider91" && a.DefaultSpider91ScriptPath != nil { - scriptPath = strings.TrimSpace(a.DefaultSpider91ScriptPath()) - } incoming := map[string]string{ "script_path": scriptPath, - "python_path": strings.TrimSpace(body.PythonPath), "proxy": strings.TrimSpace(body.Proxy), "target_new": strings.TrimSpace(body.TargetNew), - "config_json": strings.TrimSpace(body.ConfigJSON), } for k, v := range incoming { creds[k] = v @@ -805,8 +777,19 @@ func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request http.Error(w, err.Error(), http.StatusBadRequest) return } - if builtin != "" { - merged["builtin"] = builtin + meta, err := scriptcrawler.ReadMetadata(merged["script_path"]) + if err != nil { + http.Error(w, "脚本元信息无效:"+err.Error(), http.StatusBadRequest) + return + } + name := meta.Name + if id == "" { + generatedID, err := a.generateCrawlerID(r.Context(), name) + if err != nil { + writeErr(w, http.StatusInternalServerError, err) + return + } + id = generatedID } d := &catalog.Drive{ ID: id, @@ -826,11 +809,55 @@ func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request } if a.OnDriveSaved != nil { if err := a.OnDriveSaved(id); err != nil { - writeJSON(w, http.StatusOK, map[string]any{"ok": true, "warning": err.Error()}) + writeJSON(w, http.StatusOK, map[string]any{"ok": true, "id": id, "warning": err.Error()}) return } } - writeJSON(w, http.StatusOK, map[string]any{"ok": true}) + writeJSON(w, http.StatusOK, map[string]any{"ok": true, "id": id}) +} + +func (a *AdminServer) generateCrawlerID(ctx context.Context, name string) (string, error) { + all, err := a.Catalog.ListDrives(ctx) + if err != nil { + return "", err + } + used := map[string]bool{} + for _, d := range all { + if d == nil { + continue + } + if isCrawlerDriveKind(d.Kind) && strings.TrimSpace(d.Credentials["script_path"]) == "" { + continue + } + used[d.ID] = true + } + slug := crawlerIDSlug(name) + base := "crawler" + if slug != "" { + base += "-" + slug + } + candidate := base + for suffix := 2; used[candidate]; suffix++ { + candidate = fmt.Sprintf("%s-%d", base, suffix) + } + return candidate, nil +} + +func crawlerIDSlug(raw string) string { + var b strings.Builder + lastDash := false + for _, r := range strings.ToLower(raw) { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') { + b.WriteRune(r) + lastDash = false + continue + } + if b.Len() > 0 && !lastDash { + b.WriteByte('-') + lastDash = true + } + } + return strings.Trim(b.String(), "-") } type importCrawlerScriptURLReq struct { @@ -838,6 +865,36 @@ type importCrawlerScriptURLReq struct { FileName string `json:"fileName"` } +type testCrawlerScriptReq struct { + ScriptPath string `json:"scriptPath"` + Proxy string `json:"proxy"` +} + +// handleTestCrawlerScript 试跑一个爬虫脚本:不入库,抓到第一条视频 +// (并探测直链可达)即返回,让用户在保存前确认脚本能爬到视频。 +func (a *AdminServer) handleTestCrawlerScript(w http.ResponseWriter, r *http.Request) { + var body testCrawlerScriptReq + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + writeErr(w, http.StatusBadRequest, err) + return + } + scriptPath := strings.TrimSpace(body.ScriptPath) + if scriptPath == "" { + http.Error(w, "请先导入爬虫脚本", http.StatusBadRequest) + return + } + proxyURL, err := normalizeCrawlerProxyURL(body.Proxy, "脚本爬虫") + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + result := scriptcrawler.DryRun(r.Context(), scriptcrawler.DryRunConfig{ + ScriptPath: scriptPath, + ProxyURL: proxyURL, + }) + writeJSON(w, http.StatusOK, result) +} + func (a *AdminServer) handleImportCrawlerScriptFile(w http.ResponseWriter, r *http.Request) { r.Body = http.MaxBytesReader(w, r.Body, maxCrawlerScriptBytes+1024*1024) if err := r.ParseMultipartForm(maxCrawlerScriptBytes + 1024*1024); err != nil { @@ -860,7 +917,13 @@ func (a *AdminServer) handleImportCrawlerScriptFile(w http.ResponseWriter, r *ht writeErr(w, http.StatusBadRequest, err) return } - writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath}) + meta, err := scriptcrawler.ReadMetadata(scriptPath) + if err != nil { + _ = os.Remove(scriptPath) + writeErr(w, http.StatusBadRequest, fmt.Errorf("脚本元信息无效: %w", err)) + return + } + writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath, "name": meta.Name}) } func (a *AdminServer) handleImportCrawlerScriptURL(w http.ResponseWriter, r *http.Request) { @@ -917,7 +980,13 @@ func (a *AdminServer) handleImportCrawlerScriptURL(w http.ResponseWriter, r *htt writeErr(w, http.StatusBadRequest, err) return } - writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath}) + meta, err := scriptcrawler.ReadMetadata(scriptPath) + if err != nil { + _ = os.Remove(scriptPath) + writeErr(w, http.StatusBadRequest, fmt.Errorf("脚本元信息无效: %w", err)) + return + } + writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath, "name": meta.Name}) } func (a *AdminServer) saveCrawlerScript(ctx context.Context, name string, r io.Reader, maxBytes int64) (string, error) { @@ -935,7 +1004,7 @@ func (a *AdminServer) saveCrawlerScript(ctx context.Context, name string, r io.R if err := os.MkdirAll(root, 0o755); err != nil { return "", err } - dst := filepath.Join(root, time.Now().UTC().Format("20060102T150405.000000000Z")+"-"+fileName) + dst := filepath.Join(root, fileName) dstAbs, err := filepath.Abs(dst) if err != nil { return "", err @@ -1015,6 +1084,11 @@ func safeCrawlerScriptFileName(raw string) (string, error) { func (a *AdminServer) handleRunCrawler(w http.ResponseWriter, r *http.Request) { id := chi.URLParam(r, "id") + d, err := a.Catalog.GetDrive(r.Context(), id) + if err != nil || d == nil || !isCrawlerDriveKind(d.Kind) || d.Credentials == nil || strings.TrimSpace(d.Credentials["script_path"]) == "" { + http.Error(w, "crawler not found", http.StatusNotFound) + return + } status := a.nightlyJobStatus() if status.Running || status.Queued { writeJSON(w, http.StatusAccepted, map[string]any{ @@ -1041,13 +1115,86 @@ func (a *AdminServer) handleStopCrawlerTasks(w http.ResponseWriter, r *http.Requ } func (a *AdminServer) handleDeleteCrawler(w http.ResponseWriter, r *http.Request) { - a.handleDeleteDrive(w, r) + id := chi.URLParam(r, "id") + d, err := a.Catalog.GetDrive(r.Context(), id) + if err != nil { + writeErr(w, http.StatusNotFound, err) + return + } + if !isCrawlerDriveKind(d.Kind) { + http.Error(w, "crawler not found", http.StatusNotFound) + return + } + if a.OnStopDriveTasks != nil { + a.OnStopDriveTasks(id) + } + + deletedScript, scriptErr := a.removeImportedCrawlerScript(d) + if d.Credentials == nil { + d.Credentials = map[string]string{} + } + delete(d.Credentials, "script_path") + delete(d.Credentials, "proxy") + delete(d.Credentials, "target_new") + delete(d.Credentials, "builtin") + delete(d.Credentials, "python_path") + delete(d.Credentials, "config_json") + d.Status = "disconnected" + d.LastError = "" + if err := a.Catalog.UpsertDrive(r.Context(), d); err != nil { + writeErr(w, http.StatusInternalServerError, err) + return + } + resp := map[string]any{ + "ok": true, + "deletedVideos": 0, + "deletedScript": deletedScript, + } + if scriptErr != nil { + resp["warning"] = scriptErr.Error() + } + writeJSON(w, http.StatusOK, resp) } func isCrawlerDriveKind(kind string) bool { return kind == scriptcrawler.Kind } +func isConfiguredCrawlerDrive(d *catalog.Drive) bool { + return d != nil && + isCrawlerDriveKind(d.Kind) && + d.Credentials != nil && + strings.TrimSpace(d.Credentials["script_path"]) != "" +} + +func (a *AdminServer) removeImportedCrawlerScript(d *catalog.Drive) (bool, error) { + if d == nil || d.Credentials == nil { + return false, nil + } + scriptPath := strings.TrimSpace(d.Credentials["script_path"]) + if scriptPath == "" { + return false, nil + } + scriptAbs, err := filepath.Abs(scriptPath) + if err != nil { + return false, err + } + rootAbs, err := a.crawlerScriptImportDir() + if err != nil { + return false, err + } + if scriptAbs == rootAbs || !strings.HasPrefix(scriptAbs, rootAbs+string(os.PathSeparator)) { + return false, nil + } + if err := os.Remove(scriptAbs); err != nil { + if errors.Is(err, os.ErrNotExist) { + return false, nil + } + return false, err + } + return true, nil +} + func spider91ProxyForDrive(d *catalog.Drive) string { if d == nil || d.Kind != "spider91" || d.Credentials == nil { return "" @@ -1055,13 +1202,6 @@ func spider91ProxyForDrive(d *catalog.Drive) string { return strings.TrimSpace(d.Credentials["proxy"]) } -func scriptCrawlerCred(d *catalog.Drive, key string) string { - if d == nil || d.Kind != scriptcrawler.Kind || d.Credentials == nil { - return "" - } - return strings.TrimSpace(d.Credentials[key]) -} - func googleDriveUseOnlineAPIForDrive(d *catalog.Drive) *bool { if d == nil || d.Kind != "googledrive" { return nil @@ -1165,23 +1305,16 @@ func mergeScriptCrawlerCredentials(existing *catalog.Drive, incoming map[string] return nil, fmt.Errorf("脚本爬虫 target_new 必须是正整数") } merged[key] = strconv.Itoa(n) - case "config_json": + case "script_path": if value == "" { - delete(merged, key) - continue - } - if !json.Valid([]byte(value)) { - return nil, fmt.Errorf("脚本爬虫自定义配置必须是合法 JSON") - } - merged[key] = value - case "python_path", "script_path": - if value == "" { - if existing == nil || key == "script_path" { + if existing == nil { delete(merged, key) } continue } merged[key] = value + case "builtin", "python_path", "config_json": + delete(merged, key) default: if value == "" { delete(merged, key) @@ -1190,9 +1323,12 @@ func mergeScriptCrawlerCredentials(existing *catalog.Drive, incoming map[string] } } } - if strings.TrimSpace(merged["script_path"]) == "" && !strings.EqualFold(strings.TrimSpace(merged["builtin"]), "spider91") { + if strings.TrimSpace(merged["script_path"]) == "" { return nil, fmt.Errorf("脚本爬虫必须填写 script_path") } + delete(merged, "builtin") + delete(merged, "python_path") + delete(merged, "config_json") return merged, nil } diff --git a/backend/internal/api/admin_test.go b/backend/internal/api/admin_test.go index da48508..5c3ef66 100644 --- a/backend/internal/api/admin_test.go +++ b/backend/internal/api/admin_test.go @@ -5,10 +5,12 @@ import ( "context" "database/sql" "encoding/json" + "errors" "mime/multipart" "net/http" "net/http/httptest" "os" + "os/exec" "path/filepath" "strconv" "strings" @@ -19,6 +21,7 @@ import ( "github.com/video-site/backend/internal/auth" "github.com/video-site/backend/internal/catalog" + "github.com/video-site/backend/internal/drives/scriptcrawler" ) func TestHandleLoginReturnsForbiddenForBannedIP(t *testing.T) { @@ -843,7 +846,8 @@ func TestHandleDeleteDriveRequiresCleanupConfirmation(t *testing.T) { func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) { ctx := context.Background() - cat, err := catalog.Open(t.TempDir() + "/catalog.db") + tmp := t.TempDir() + cat, err := catalog.Open(filepath.Join(tmp, "catalog.db")) if err != nil { t.Fatalf("open catalog: %v", err) } @@ -852,6 +856,10 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) { t.Fatalf("close catalog: %v", err) } }) + scriptPath := filepath.Join(tmp, "spider_91porn.py") + if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"91Porn\"\n"), 0o644); err != nil { + t.Fatalf("write crawler script: %v", err) + } for _, d := range []*catalog.Drive{ { @@ -862,7 +870,7 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) { Credentials: map[string]string{ "last_crawl_at": "1800000000", "proxy": " http://127.0.0.1:7890 ", - "script_path": "/opt/video-site-91/91VideoSpider/spider_91porn.py", + "script_path": scriptPath, }, Status: "ok", }, @@ -875,7 +883,7 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) { "builtin": "spider91", "last_crawl_at": "1800000000", "proxy": " http://127.0.0.1:7890 ", - "script_path": "/opt/video-site-91/91VideoSpider/spider_91porn.py", + "script_path": scriptPath, }, Status: "ok", }, @@ -889,6 +897,14 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) { }, Status: "ok", }, + { + ID: "crawler-script-deleted", + Kind: "scriptcrawler", + Name: "Deleted Script", + RootID: "/", + Credentials: map[string]string{}, + Status: "disconnected", + }, } { if err := cat.UpsertDrive(ctx, d); err != nil { t.Fatalf("seed drive %s: %v", d.ID, err) @@ -905,8 +921,8 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) { var got []struct { ID string `json:"id"` + Name string `json:"name"` Kind string `json:"kind"` - Builtin string `json:"builtin"` Proxy string `json:"proxy"` LastCrawlAt int64 `json:"lastCrawlAt"` } @@ -914,24 +930,30 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) { t.Fatalf("decode: %v", err) } byID := map[string]struct { + Name string Kind string - Builtin string Proxy string LastCrawlAt int64 }{} for _, d := range got { byID[d.ID] = struct { + Name string Kind string - Builtin string Proxy string LastCrawlAt int64 - }{Kind: d.Kind, Builtin: d.Builtin, Proxy: d.Proxy, LastCrawlAt: d.LastCrawlAt} + }{Name: d.Name, Kind: d.Kind, Proxy: d.Proxy, LastCrawlAt: d.LastCrawlAt} } if _, ok := byID["spider91-main"]; ok { t.Fatal("legacy spider91 drive should not be returned by crawler list") } - if byID["crawler-spider91"].Kind != "scriptcrawler" || byID["crawler-spider91"].Builtin != "spider91" { - t.Fatalf("crawler kind/builtin = %q/%q, want scriptcrawler/spider91", byID["crawler-spider91"].Kind, byID["crawler-spider91"].Builtin) + if _, ok := byID["crawler-script-deleted"]; ok { + t.Fatal("crawler without script_path should not be returned by crawler list") + } + if byID["crawler-spider91"].Kind != "scriptcrawler" { + t.Fatalf("crawler kind = %q, want scriptcrawler", byID["crawler-spider91"].Kind) + } + if byID["crawler-spider91"].Name != "91Porn" { + t.Fatalf("crawler name = %q, want script metadata name", byID["crawler-spider91"].Name) } if byID["crawler-spider91"].Proxy != "http://127.0.0.1:7890" { t.Fatalf("crawler proxy = %q, want trimmed proxy", byID["crawler-spider91"].Proxy) @@ -967,9 +989,10 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) { } } -func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T) { +func TestHandleUpsertCrawlerRequiresScriptPath(t *testing.T) { ctx := context.Background() - cat, err := catalog.Open(t.TempDir() + "/catalog.db") + tmp := t.TempDir() + cat, err := catalog.Open(filepath.Join(tmp, "catalog.db")) if err != nil { t.Fatalf("open catalog: %v", err) } @@ -979,21 +1002,35 @@ func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T) } }) + srv := &AdminServer{Catalog: cat} + scriptPath := filepath.Join(tmp, "custom.py") + if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"91 Spider\"\n"), 0o644); err != nil { + t.Fatalf("write crawler script: %v", err) + } + + // 不再内置任何爬虫:没有脚本路径的保存请求必须被拒绝, + // 旧的 builtin 字段也不再有"免脚本"特权。 req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", strings.NewReader(`{ "id": "spider91-main", - "name": "91 Spider", "builtin": "spider91", "scriptPath": "", - "pythonPath": "python3", "targetNew": "15" }`)) rr := httptest.NewRecorder() - (&AdminServer{ - Catalog: cat, - DefaultSpider91ScriptPath: func() string { - return "" - }, - }).handleUpsertCrawler(rr, req) + srv.handleUpsertCrawler(rr, req) + if rr.Code != http.StatusBadRequest { + t.Fatalf("status = %d, body = %s, want 400", rr.Code, rr.Body.String()) + } + + // 带脚本路径时正常保存,且请求中的 builtin 字段被忽略,不会写入凭证。 + req = httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", strings.NewReader(`{ + "id": "spider91-main", + "builtin": "spider91", + "scriptPath": "`+scriptPath+`", + "targetNew": "15" + }`)) + rr = httptest.NewRecorder() + srv.handleUpsertCrawler(rr, req) if rr.Code != http.StatusOK { t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String()) } @@ -1002,23 +1039,85 @@ func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T) if err != nil { t.Fatalf("get crawler drive: %v", err) } - if got.Kind != "scriptcrawler" || got.Credentials["builtin"] != "spider91" { - t.Fatalf("kind/builtin = %q/%q, want scriptcrawler/spider91", got.Kind, got.Credentials["builtin"]) + if got.Kind != "scriptcrawler" || got.Credentials["builtin"] != "" { + t.Fatalf("kind/builtin = %q/%q, want scriptcrawler with no builtin credential", got.Kind, got.Credentials["builtin"]) } - if got.Credentials["script_path"] != "" { - t.Fatalf("script_path = %q, want empty when default is unavailable", got.Credentials["script_path"]) + if got.Credentials["python_path"] != "" || got.Credentials["config_json"] != "" { + t.Fatalf("legacy hidden credentials should not be saved: %+v", got.Credentials) + } + if got.Name != "91 Spider" { + t.Fatalf("name = %q, want script metadata name", got.Name) + } + if got.Credentials["script_path"] != scriptPath { + t.Fatalf("script_path = %q, want %q", got.Credentials["script_path"], scriptPath) + } +} + +func TestHandleUpsertCrawlerGeneratesIDFromScriptName(t *testing.T) { + ctx := context.Background() + tmp := t.TempDir() + cat, err := catalog.Open(filepath.Join(tmp, "catalog.db")) + if err != nil { + t.Fatalf("open catalog: %v", err) + } + t.Cleanup(func() { + if err := cat.Close(); err != nil { + t.Fatalf("close catalog: %v", err) + } + }) + if err := cat.UpsertDrive(ctx, &catalog.Drive{ + ID: "crawler-my-spider", + Kind: scriptcrawler.Kind, + Name: "Existing", + RootID: "/", + Credentials: map[string]string{"script_path": "/opt/crawlers/existing.py"}, + }); err != nil { + t.Fatalf("seed crawler: %v", err) + } + scriptPath := filepath.Join(tmp, "custom.py") + if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"My Spider\"\n"), 0o644); err != nil { + t.Fatalf("write crawler script: %v", err) + } + + req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", strings.NewReader(`{ + "scriptPath": "`+scriptPath+`", + "targetNew": "15" + }`)) + rr := httptest.NewRecorder() + (&AdminServer{Catalog: cat}).handleUpsertCrawler(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String()) + } + + var resp struct { + OK bool `json:"ok"` + ID string `json:"id"` + } + if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil { + t.Fatalf("decode response: %v", err) + } + if !resp.OK || resp.ID != "crawler-my-spider-2" { + t.Fatalf("response = %+v, want generated suffix id", resp) + } + got, err := cat.GetDrive(ctx, resp.ID) + if err != nil { + t.Fatalf("get generated crawler: %v", err) + } + if got.Name != "My Spider" || got.Kind != scriptcrawler.Kind { + t.Fatalf("generated crawler = %+v", got) } } func TestHandleImportCrawlerScriptFile(t *testing.T) { tmp := t.TempDir() + script := "CRAWLER_NAME = \"Demo Crawler\"\nprint('crawler')\n" var body bytes.Buffer mw := multipart.NewWriter(&body) part, err := mw.CreateFormFile("file", "../demo crawler.py") if err != nil { t.Fatalf("create form file: %v", err) } - if _, err := part.Write([]byte("print('crawler')\n")); err != nil { + if _, err := part.Write([]byte(script)); err != nil { t.Fatalf("write part: %v", err) } if err := mw.Close(); err != nil { @@ -1034,6 +1133,7 @@ func TestHandleImportCrawlerScriptFile(t *testing.T) { } var got struct { ScriptPath string `json:"scriptPath"` + Name string `json:"name"` } if err := json.NewDecoder(rr.Body).Decode(&got); err != nil { t.Fatalf("decode: %v", err) @@ -1045,15 +1145,48 @@ func TestHandleImportCrawlerScriptFile(t *testing.T) { if filepath.Ext(got.ScriptPath) != ".py" { t.Fatalf("script path = %q, want .py", got.ScriptPath) } + if filepath.Base(got.ScriptPath) != "demo_crawler.py" { + t.Fatalf("script filename = %q, want original sanitized filename", filepath.Base(got.ScriptPath)) + } data, err := os.ReadFile(got.ScriptPath) if err != nil { t.Fatalf("read imported script: %v", err) } - if string(data) != "print('crawler')\n" { + if got.Name != "Demo Crawler" { + t.Fatalf("name = %q, want script metadata name", got.Name) + } + if string(data) != script { t.Fatalf("script content = %q", string(data)) } } +func TestHandleImportCrawlerScriptFileRejectsMissingName(t *testing.T) { + tmp := t.TempDir() + var body bytes.Buffer + mw := multipart.NewWriter(&body) + part, err := mw.CreateFormFile("file", "crawler.py") + if err != nil { + t.Fatalf("create form file: %v", err) + } + if _, err := part.Write([]byte("print('crawler')\n")); err != nil { + t.Fatalf("write part: %v", err) + } + if err := mw.Close(); err != nil { + t.Fatalf("close multipart: %v", err) + } + + req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-file", &body) + req.Header.Set("Content-Type", mw.FormDataContentType()) + rr := httptest.NewRecorder() + (&AdminServer{LocalPreviewDir: filepath.Join(tmp, "previews")}).handleImportCrawlerScriptFile(rr, req) + if rr.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want 400; body = %s", rr.Code, rr.Body.String()) + } + if !strings.Contains(rr.Body.String(), "CRAWLER_NAME") { + t.Fatalf("body = %s, want CRAWLER_NAME error", rr.Body.String()) + } +} + func TestHandleImportCrawlerScriptFileRejectsNonPython(t *testing.T) { tmp := t.TempDir() var body bytes.Buffer @@ -1088,7 +1221,7 @@ func TestHandleImportCrawlerScriptURL(t *testing.T) { http.NotFound(w, r) return } - _, _ = w.Write([]byte("# crawler from url\n")) + _, _ = w.Write([]byte("CRAWLER_NAME = \"URL Crawler\"\n# crawler from url\n")) })) defer upstream.Close() @@ -1102,6 +1235,7 @@ func TestHandleImportCrawlerScriptURL(t *testing.T) { } var got struct { ScriptPath string `json:"scriptPath"` + Name string `json:"name"` } if err := json.NewDecoder(rr.Body).Decode(&got); err != nil { t.Fatalf("decode: %v", err) @@ -1114,11 +1248,116 @@ func TestHandleImportCrawlerScriptURL(t *testing.T) { if err != nil { t.Fatalf("read imported script: %v", err) } - if string(data) != "# crawler from url\n" { + if got.Name != "URL Crawler" { + t.Fatalf("name = %q, want script metadata name", got.Name) + } + if filepath.Base(got.ScriptPath) != "crawler.py" { + t.Fatalf("script filename = %q, want original filename", filepath.Base(got.ScriptPath)) + } + if string(data) != "CRAWLER_NAME = \"URL Crawler\"\n# crawler from url\n" { t.Fatalf("script content = %q", string(data)) } } +func TestHandleDeleteCrawlerRemovesImportedScript(t *testing.T) { + ctx := context.Background() + tmp := t.TempDir() + cat, err := catalog.Open(filepath.Join(tmp, "catalog.db")) + if err != nil { + t.Fatalf("open catalog: %v", err) + } + t.Cleanup(func() { + if err := cat.Close(); err != nil { + t.Fatalf("close catalog: %v", err) + } + }) + + scriptDir := filepath.Join(tmp, "crawler-scripts") + if err := os.MkdirAll(scriptDir, 0o755); err != nil { + t.Fatalf("mkdir script dir: %v", err) + } + scriptPath := filepath.Join(scriptDir, "crawler.py") + if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"Delete Me\"\n"), 0o644); err != nil { + t.Fatalf("write script: %v", err) + } + if err := cat.UpsertDrive(ctx, &catalog.Drive{ + ID: "crawler-delete-me", + Kind: scriptcrawler.Kind, + Name: "Delete Me", + RootID: "/", + Credentials: map[string]string{ + "script_path": scriptPath, + "proxy": "http://127.0.0.1:7890", + "target_new": "10", + }, + }); err != nil { + t.Fatalf("seed crawler: %v", err) + } + now := time.Now() + if err := cat.UpsertVideo(ctx, &catalog.Video{ + ID: "video-from-crawler", + DriveID: "crawler-delete-me", + FileID: "video.mp4", + Title: "Keep Me", + PublishedAt: now, + CreatedAt: now, + UpdatedAt: now, + }); err != nil { + t.Fatalf("seed video: %v", err) + } + + req := httptest.NewRequest(http.MethodDelete, "/admin/api/crawlers/crawler-delete-me", nil) + rctx := chi.NewRouteContext() + rctx.URLParams.Add("id", "crawler-delete-me") + req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx)) + rr := httptest.NewRecorder() + + stopped := false + (&AdminServer{ + Catalog: cat, + LocalPreviewDir: filepath.Join(tmp, "previews"), + OnDriveDeleteCleanup: func(context.Context, string) (int, error) { + t.Fatal("crawler delete must not delete imported videos") + return 0, nil + }, + OnStopDriveTasks: func(driveID string) bool { + stopped = driveID == "crawler-delete-me" + return true + }, + }).handleDeleteCrawler(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String()) + } + if _, err := os.Stat(scriptPath); !errors.Is(err, os.ErrNotExist) { + t.Fatalf("script stat error = %v, want not exist", err) + } + if !stopped { + t.Fatal("stop hook was not called") + } + drive, err := cat.GetDrive(ctx, "crawler-delete-me") + if err != nil { + t.Fatalf("crawler drive should remain for existing videos: %v", err) + } + if drive.Credentials["script_path"] != "" || drive.Credentials["proxy"] != "" || drive.Credentials["target_new"] != "" { + t.Fatalf("crawler credentials were not cleared: %+v", drive.Credentials) + } + if _, err := cat.GetVideo(ctx, "video-from-crawler"); err != nil { + t.Fatalf("imported video should remain: %v", err) + } + var got struct { + OK bool `json:"ok"` + DeletedVideos int `json:"deletedVideos"` + DeletedScript bool `json:"deletedScript"` + } + if err := json.NewDecoder(rr.Body).Decode(&got); err != nil { + t.Fatalf("decode: %v", err) + } + if !got.OK || got.DeletedVideos != 0 || !got.DeletedScript { + t.Fatalf("response = %#v", got) + } +} + func TestHandleImportCrawlerScriptURLRejectsNonPython(t *testing.T) { tmp := t.TempDir() upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -1143,6 +1382,81 @@ func TestHandleImportCrawlerScriptURLRejectsNonPython(t *testing.T) { } } +func TestHandleTestCrawlerScriptRunsImportedScript(t *testing.T) { + if _, err := exec.LookPath("python3"); err != nil { + t.Skip("python3 is required for crawler script dry-run") + } + media := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/video.mp4" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "video/mp4") + if r.Header.Get("Range") == "bytes=0-0" { + w.Header().Set("Content-Range", "bytes 0-0/2048") + w.WriteHeader(http.StatusPartialContent) + _, _ = w.Write([]byte{0}) + return + } + _, _ = w.Write([]byte("video")) + })) + defer media.Close() + + script := filepath.Join(t.TempDir(), "crawler.py") + body := `import json +print(json.dumps({"title": "Dry Run Video", "source_id": "dry-1", "media_url": "` + media.URL + `/video.mp4", "thumbnail_url": "` + media.URL + `/thumb.jpg", "detail_url": "` + media.URL + `/detail"})) +` + if err := os.WriteFile(script, []byte(body), 0o755); err != nil { + t.Fatalf("write script: %v", err) + } + + reqBody, err := json.Marshal(map[string]string{ + "scriptPath": script, + }) + if err != nil { + t.Fatalf("marshal request: %v", err) + } + req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/test-script", bytes.NewReader(reqBody)) + rr := httptest.NewRecorder() + (&AdminServer{}).handleTestCrawlerScript(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String()) + } + + var got struct { + OK bool `json:"ok"` + Items []struct { + Title string `json:"title"` + SourceID string `json:"sourceId"` + MediaURL string `json:"mediaUrl"` + } `json:"items"` + MediaCheck *struct { + OK bool `json:"ok"` + Status int `json:"status"` + ContentType string `json:"contentType"` + ContentLength int64 `json:"contentLengthBytes"` + } `json:"mediaCheck"` + } + if err := json.NewDecoder(rr.Body).Decode(&got); err != nil { + t.Fatalf("decode: %v", err) + } + if !got.OK { + t.Fatalf("ok = false, body = %s", rr.Body.String()) + } + if len(got.Items) != 1 || got.Items[0].Title != "Dry Run Video" || got.Items[0].SourceID != "dry-1" { + t.Fatalf("items = %#v", got.Items) + } + if got.Items[0].MediaURL != media.URL+"/video.mp4" { + t.Fatalf("mediaUrl = %q", got.Items[0].MediaURL) + } + if got.MediaCheck == nil || !got.MediaCheck.OK || got.MediaCheck.Status != http.StatusPartialContent { + t.Fatalf("mediaCheck = %#v", got.MediaCheck) + } + if got.MediaCheck.ContentLength != 2048 { + t.Fatalf("contentLength = %d, want 2048", got.MediaCheck.ContentLength) + } +} + func TestHandleListDrivesIncludesGoogleDriveOnlineAPIMode(t *testing.T) { ctx := context.Background() cat, err := catalog.Open(t.TempDir() + "/catalog.db") diff --git a/backend/internal/drives/scriptcrawler/crawler.go b/backend/internal/drives/scriptcrawler/crawler.go index 994cbd4..ba5efa4 100644 --- a/backend/internal/drives/scriptcrawler/crawler.go +++ b/backend/internal/drives/scriptcrawler/crawler.go @@ -273,12 +273,16 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err } emit(CrawlProgress{}) - if err := os.MkdirAll(c.cfg.Driver.CrawlDir(), 0o755); err != nil { + crawlDir, err := filepath.Abs(c.cfg.Driver.CrawlDir()) + if err != nil { + return result, fmt.Errorf("scriptcrawler: resolve crawl dir: %w", err) + } + if err := os.MkdirAll(crawlDir, 0o755); err != nil { return result, err } runID := time.Now().UTC().Format("20060102T150405Z") - seenPath := filepath.Join(c.cfg.Driver.CrawlDir(), "seen-"+runID+".txt") - jobPath := filepath.Join(c.cfg.Driver.CrawlDir(), "job-"+runID+".json") + seenPath := filepath.Join(crawlDir, "seen-"+runID+".txt") + jobPath := filepath.Join(crawlDir, "job-"+runID+".json") result.SeenFile = seenPath result.JobFile = jobPath @@ -412,6 +416,10 @@ func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath strin } cfg = json.RawMessage(raw) } + outputDir, err := filepath.Abs(c.cfg.Driver.OutputDir()) + if err != nil { + return fmt.Errorf("resolve output dir: %w", err) + } job := Job{ Protocol: "crawler.v1", Mode: "crawl", @@ -419,7 +427,7 @@ func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath strin CrawlerID: c.cfg.Driver.ID(), TargetNew: targetNew, SeenSourceIDsFile: seenPath, - OutputDir: c.cfg.Driver.OutputDir(), + OutputDir: outputDir, Config: cfg, Network: JobNetwork{ProxyURL: strings.TrimSpace(c.cfg.ProxyURL)}, } diff --git a/backend/internal/drives/scriptcrawler/crawler_test.go b/backend/internal/drives/scriptcrawler/crawler_test.go index 817e340..440e286 100644 --- a/backend/internal/drives/scriptcrawler/crawler_test.go +++ b/backend/internal/drives/scriptcrawler/crawler_test.go @@ -135,6 +135,58 @@ func TestCrawlerRunOnceUsesSourceKindNamespace(t *testing.T) { } } +func TestCrawlerRunOncePassesAbsoluteJobPathsWhenWorkDirDiffers(t *testing.T) { + ctx := context.Background() + tmp := t.TempDir() + t.Chdir(tmp) + cat, err := catalog.Open(filepath.Join(tmp, "catalog.db")) + if err != nil { + t.Fatalf("open catalog: %v", err) + } + t.Cleanup(func() { + if err := cat.Close(); err != nil { + t.Fatalf("close catalog: %v", err) + } + }) + drv := New(Config{ID: "demo", RootDir: filepath.Join("data", "crawler")}) + if err := drv.Init(ctx); err != nil { + t.Fatalf("driver init: %v", err) + } + scriptDir := filepath.Join(tmp, "scripts") + if err := os.MkdirAll(scriptDir, 0o755); err != nil { + t.Fatalf("mkdir script dir: %v", err) + } + dummyScript := filepath.Join(scriptDir, "helper-script") + if err := os.WriteFile(dummyScript, []byte("helper"), 0o755); err != nil { + t.Fatalf("write dummy script: %v", err) + } + wrapper := filepath.Join(tmp, "helper-wrapper.sh") + wrapperScript := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0]) + if err := os.WriteFile(wrapper, []byte(wrapperScript), 0o755); err != nil { + t.Fatalf("write helper wrapper: %v", err) + } + + t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1") + t.Setenv("GO_WANT_SCRIPTCRAWLER_ASSERT_ABS", "1") + c := NewCrawler(CrawlerConfig{ + Driver: drv, + Catalog: cat, + PythonPath: wrapper, + ScriptPath: dummyScript, + WorkDir: scriptDir, + }) + res, err := c.RunOnce(ctx, 1) + if err != nil { + t.Fatalf("run once: %v", err) + } + if res.NewVideos != 1 || res.Skipped != 0 || res.Failed != 0 { + t.Fatalf("result = new:%d skipped:%d failed:%d, want 1/0/0", res.NewVideos, res.Skipped, res.Failed) + } + if !filepath.IsAbs(res.JobFile) || !filepath.IsAbs(res.SeenFile) { + t.Fatalf("result paths should be absolute: job=%q seen=%q", res.JobFile, res.SeenFile) + } +} + func TestCrawlerRunOnceImportsSimpleMediaURLWithoutSourceID(t *testing.T) { ctx := context.Background() tmp := t.TempDir() @@ -241,6 +293,12 @@ func TestScriptCrawlerHelperProcess(t *testing.T) { fmt.Fprintln(os.Stderr, err) os.Exit(2) } + if os.Getenv("GO_WANT_SCRIPTCRAWLER_ASSERT_ABS") == "1" { + if !filepath.IsAbs(jobPath) || !filepath.IsAbs(job.SeenSourceIDsFile) || !filepath.IsAbs(job.OutputDir) { + fmt.Fprintf(os.Stderr, "expected absolute paths, got job=%q seen=%q output=%q\n", jobPath, job.SeenSourceIDsFile, job.OutputDir) + os.Exit(2) + } + } if os.Getenv("GO_WANT_SCRIPTCRAWLER_SIMPLE") == "1" { event := map[string]any{ "title": "Simple Protocol Video", diff --git a/backend/internal/drives/scriptcrawler/dryrun.go b/backend/internal/drives/scriptcrawler/dryrun.go new file mode 100644 index 0000000..fd9cff9 --- /dev/null +++ b/backend/internal/drives/scriptcrawler/dryrun.go @@ -0,0 +1,375 @@ +package scriptcrawler + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "sync" + "syscall" + "time" +) + +// DryRun 在不入库的前提下试跑一个爬虫脚本:临时目录里生成 job.json, +// 启动脚本进程,拿到第一条(或前 MaxItems 条)item 事件后立即停止, +// 再对视频直链做一次小范围探测,验证脚本"能不能爬取到视频"。 +// 用于后台导入脚本后的"测试脚本"按钮。 + +const ( + defaultDryRunTimeout = 2 * time.Minute + dryRunLogTailLines = 60 + dryRunMediaProbeLimit = 20 * time.Second +) + +type DryRunConfig struct { + PythonPath string + ScriptPath string + ProxyURL string + ConfigJSON string + // MaxItems 收到多少条 item 后停止脚本,默认 1。 + MaxItems int + // Timeout 整个试跑的硬上限,默认 2 分钟。 + Timeout time.Duration + // SkipMediaProbe 跳过视频直链可达性探测(单测注入用)。 + SkipMediaProbe bool + HTTPClient *http.Client +} + +type DryRunItem struct { + Title string `json:"title"` + SourceID string `json:"sourceId,omitempty"` + MediaURL string `json:"mediaUrl,omitempty"` + MediaLocalFile string `json:"mediaLocalFile,omitempty"` + ThumbnailURL string `json:"thumbnailUrl,omitempty"` + DetailURL string `json:"detailUrl,omitempty"` +} + +type DryRunMediaCheck struct { + OK bool `json:"ok"` + Status int `json:"status,omitempty"` + ContentType string `json:"contentType,omitempty"` + ContentLength int64 `json:"contentLengthBytes,omitempty"` + Error string `json:"error,omitempty"` +} + +type DryRunResult struct { + OK bool `json:"ok"` + Items []DryRunItem `json:"items"` + MediaCheck *DryRunMediaCheck `json:"mediaCheck,omitempty"` + Error string `json:"error,omitempty"` + Log []string `json:"log,omitempty"` + DurationMs int64 `json:"durationMs"` +} + +func DryRun(ctx context.Context, cfg DryRunConfig) *DryRunResult { + started := time.Now() + result := &DryRunResult{Items: []DryRunItem{}} + defer func() { result.DurationMs = time.Since(started).Milliseconds() }() + + scriptPath := strings.TrimSpace(cfg.ScriptPath) + if scriptPath == "" { + result.Error = "脚本路径为空,请先导入脚本" + return result + } + if _, err := os.Stat(scriptPath); err != nil { + result.Error = fmt.Sprintf("脚本不存在: %v", err) + return result + } + pythonPath := strings.TrimSpace(cfg.PythonPath) + if pythonPath == "" { + pythonPath = "python3" + } + maxItems := cfg.MaxItems + if maxItems <= 0 { + maxItems = 1 + } + timeout := cfg.Timeout + if timeout <= 0 { + timeout = defaultDryRunTimeout + } + + tmpDir, err := os.MkdirTemp("", "crawler-dryrun-") + if err != nil { + result.Error = fmt.Sprintf("创建临时目录失败: %v", err) + return result + } + defer os.RemoveAll(tmpDir) + + outputDir := filepath.Join(tmpDir, "output") + if err := os.MkdirAll(outputDir, 0o755); err != nil { + result.Error = fmt.Sprintf("创建输出目录失败: %v", err) + return result + } + seenPath := filepath.Join(tmpDir, "seen.txt") + if err := os.WriteFile(seenPath, nil, 0o644); err != nil { + result.Error = fmt.Sprintf("写入 seen 文件失败: %v", err) + return result + } + + configJSON := json.RawMessage([]byte("{}")) + if raw := strings.TrimSpace(cfg.ConfigJSON); raw != "" { + if !json.Valid([]byte(raw)) { + result.Error = "自定义配置必须是合法 JSON" + return result + } + configJSON = json.RawMessage(raw) + } + job := Job{ + Protocol: "crawler.v1", + Mode: "crawl", + RunID: "dryrun-" + started.UTC().Format("20060102T150405Z"), + CrawlerID: "dryrun", + TargetNew: maxItems, + SeenSourceIDsFile: seenPath, + OutputDir: outputDir, + Config: configJSON, + Network: JobNetwork{ProxyURL: strings.TrimSpace(cfg.ProxyURL)}, + } + jobPath := filepath.Join(tmpDir, "job.json") + jobData, err := json.MarshalIndent(job, "", " ") + if err != nil { + result.Error = fmt.Sprintf("生成 job 文件失败: %v", err) + return result + } + if err := os.WriteFile(jobPath, jobData, 0o600); err != nil { + result.Error = fmt.Sprintf("写入 job 文件失败: %v", err) + return result + } + + runCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + cmd := exec.CommandContext(runCtx, pythonPath, scriptPath, "--job", jobPath) + cmd.Dir = filepath.Dir(scriptPath) + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + cmd.Cancel = func() error { + return killDryRunProcess(cmd) + } + // 超时或提前 kill 后,脚本派生的子进程可能仍持有 stdout/stderr 管道; + // WaitDelay 强制在宽限期后关闭管道,避免读取端永久阻塞。 + cmd.WaitDelay = 3 * time.Second + if proxyURL := strings.TrimSpace(cfg.ProxyURL); proxyURL != "" { + cmd.Env = append(os.Environ(), + "HTTP_PROXY="+proxyURL, + "HTTPS_PROXY="+proxyURL, + "http_proxy="+proxyURL, + "https_proxy="+proxyURL, + "NO_PROXY=", + "no_proxy=", + ) + } + stdout, err := cmd.StdoutPipe() + if err != nil { + result.Error = fmt.Sprintf("启动脚本失败: %v", err) + return result + } + stderr, err := cmd.StderrPipe() + if err != nil { + _ = stdout.Close() + result.Error = fmt.Sprintf("启动脚本失败: %v", err) + return result + } + if err := cmd.Start(); err != nil { + _ = stdout.Close() + _ = stderr.Close() + result.Error = fmt.Sprintf("启动脚本失败: %v", err) + return result + } + + // stderr 是脚本日志,保留尾部若干行用于排错回显。 + var logMu sync.Mutex + logTail := make([]string, 0, dryRunLogTailLines) + stderrDone := make(chan struct{}) + go func() { + defer close(stderrDone) + scanner := bufio.NewScanner(stderr) + scanner.Buffer(make([]byte, 64*1024), 1024*1024) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + logMu.Lock() + if len(logTail) >= dryRunLogTailLines { + logTail = logTail[1:] + } + logTail = append(logTail, line) + logMu.Unlock() + } + }() + + items := []DryRunItem{} + var firstMediaHeaders map[string]string + parseFailures := 0 + scanner := bufio.NewScanner(stdout) + scanner.Buffer(make([]byte, 64*1024), 4*1024*1024) + for scanner.Scan() { + if runCtx.Err() != nil { + break + } + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + var event Event + if err := json.Unmarshal([]byte(line), &event); err != nil { + parseFailures++ + continue + } + eventType := strings.ToLower(strings.TrimSpace(event.Type)) + item := event.normalizedItem() + if eventType == "" && item.hasPayload() { + eventType = "item" + } + if eventType != "item" { + continue + } + normalized, _, err := normalizeItemForImport(item) + if err != nil { + result.Error = fmt.Sprintf("item 字段不完整: %v", err) + continue + } + mediaURL := strings.TrimSpace(normalized.Media.URL) + if len(items) == 0 { + firstMediaHeaders = normalized.Media.Headers + } + items = append(items, DryRunItem{ + Title: strings.TrimSpace(normalized.Title), + SourceID: strings.TrimSpace(item.SourceID), + MediaURL: mediaURL, + MediaLocalFile: strings.TrimSpace(normalized.Media.LocalFile), + ThumbnailURL: strings.TrimSpace(normalized.Thumbnail.URL), + DetailURL: strings.TrimSpace(normalized.DetailURL), + }) + if len(items) >= maxItems { + break + } + } + // 拿够了就停掉脚本,避免它继续翻页。 + _ = killDryRunProcess(cmd) + _ = cmd.Wait() + <-stderrDone + + logMu.Lock() + result.Log = append([]string{}, logTail...) + logMu.Unlock() + result.Items = items + + if len(items) == 0 { + if result.Error == "" { + switch { + case runCtx.Err() != nil && ctx.Err() == nil: + result.Error = fmt.Sprintf("测试超时(%s),脚本没有输出任何视频", timeout) + case parseFailures > 0: + result.Error = "脚本 stdout 不是合法的 crawler.v1 JSON Lines(日志应输出到 stderr)" + default: + result.Error = "脚本退出但没有输出任何视频" + } + } + return result + } + result.Error = "" + + first := items[0] + switch { + case cfg.SkipMediaProbe: + result.OK = true + case first.MediaLocalFile != "": + // 脚本自己下载到 output_dir 的模式:试跑用的是临时目录, + // 文件已随目录清理,能输出合法 local_file 即视为通过。 + result.OK = true + default: + check := probeMediaURL(ctx, cfg, first, firstMediaHeaders) + result.MediaCheck = check + result.OK = check.OK + } + return result +} + +func killDryRunProcess(cmd *exec.Cmd) error { + if cmd == nil || cmd.Process == nil { + return nil + } + if err := syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL); err != nil { + if err == syscall.ESRCH { + return nil + } + return cmd.Process.Kill() + } + return nil +} + +// probeMediaURL 对视频直链发一个 Range: bytes=0-0 的小请求, +// 验证直链可达(带上脚本给的防盗链 headers 和代理)。 +func probeMediaURL(ctx context.Context, cfg DryRunConfig, item DryRunItem, mediaHeaders map[string]string) *DryRunMediaCheck { + check := &DryRunMediaCheck{} + if item.MediaURL == "" { + check.Error = "item 没有视频直链" + return check + } + + client := cfg.HTTPClient + if client == nil { + transport := &http.Transport{ + Proxy: http.ProxyFromEnvironment, + ResponseHeaderTimeout: dryRunMediaProbeLimit, + } + if err := configureExplicitProxy(transport, cfg.ProxyURL); err != nil { + check.Error = fmt.Sprintf("代理配置无效: %v", err) + return check + } + client = &http.Client{Transport: transport} + } + + probeCtx, cancel := context.WithTimeout(ctx, dryRunMediaProbeLimit) + defer cancel() + req, err := http.NewRequestWithContext(probeCtx, http.MethodGet, item.MediaURL, nil) + if err != nil { + check.Error = fmt.Sprintf("视频直链无效: %v", err) + return check + } + req.Header.Set("User-Agent", defaultUserAgent) + req.Header.Set("Range", "bytes=0-0") + if item.DetailURL != "" { + req.Header.Set("Referer", item.DetailURL) + } + for k, v := range mediaHeaders { + k = strings.TrimSpace(k) + if k == "" { + continue + } + req.Header.Set(k, v) + } + resp, err := client.Do(req) + if err != nil { + check.Error = fmt.Sprintf("视频直链请求失败: %v", err) + return check + } + defer resp.Body.Close() + + check.Status = resp.StatusCode + check.ContentType = resp.Header.Get("Content-Type") + if cr := resp.Header.Get("Content-Range"); cr != "" { + // Content-Range: bytes 0-0/12345 → 取总大小 + if idx := strings.LastIndex(cr, "/"); idx >= 0 { + var total int64 + if _, err := fmt.Sscanf(cr[idx+1:], "%d", &total); err == nil { + check.ContentLength = total + } + } + } + if check.ContentLength == 0 && resp.StatusCode == http.StatusOK { + check.ContentLength = resp.ContentLength + } + if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusPartialContent { + check.Error = fmt.Sprintf("视频直链返回 HTTP %d", resp.StatusCode) + return check + } + check.OK = true + return check +} diff --git a/backend/internal/drives/scriptcrawler/dryrun_test.go b/backend/internal/drives/scriptcrawler/dryrun_test.go new file mode 100644 index 0000000..7852132 --- /dev/null +++ b/backend/internal/drives/scriptcrawler/dryrun_test.go @@ -0,0 +1,153 @@ +package scriptcrawler + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + "time" +) + +func writeDryRunScript(t *testing.T, body string) string { + t.Helper() + dir := t.TempDir() + path := filepath.Join(dir, "crawler.sh") + if err := os.WriteFile(path, []byte("#!/bin/sh\n"+body), 0o755); err != nil { + t.Fatalf("write script: %v", err) + } + return path +} + +func TestDryRunCollectsFirstItem(t *testing.T) { + script := writeDryRunScript(t, ` +echo '[log] fetching list page' >&2 +echo '{"type":"item","item":{"title":"Test Video","media_url":"https://cdn.example.test/v.mp4","source_id":"123","thumbnail_url":"https://cdn.example.test/t.jpg"}}' +echo '{"type":"done","stats":{"emitted":1}}' +`) + result := DryRun(context.Background(), DryRunConfig{ + PythonPath: "/bin/sh", + ScriptPath: script, + SkipMediaProbe: true, + }) + if !result.OK { + t.Fatalf("ok = false, error = %q, log = %v", result.Error, result.Log) + } + if len(result.Items) != 1 { + t.Fatalf("items = %d, want 1", len(result.Items)) + } + item := result.Items[0] + if item.Title != "Test Video" || item.MediaURL != "https://cdn.example.test/v.mp4" || item.SourceID != "123" { + t.Fatalf("item = %+v", item) + } + if len(result.Log) == 0 || !strings.Contains(result.Log[0], "fetching list page") { + t.Fatalf("log tail = %v, want stderr captured", result.Log) + } +} + +func TestDryRunProbesMediaURL(t *testing.T) { + var gotRange, gotReferer string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotRange = r.Header.Get("Range") + gotReferer = r.Header.Get("Referer") + w.Header().Set("Content-Type", "video/mp4") + w.Header().Set("Content-Range", "bytes 0-0/4096") + w.WriteHeader(http.StatusPartialContent) + _, _ = w.Write([]byte("x")) + })) + t.Cleanup(srv.Close) + + script := writeDryRunScript(t, fmt.Sprintf( + `echo '{"type":"item","title":"Probe Video","media_url":"%s/v.mp4","detail_url":"https://example.test/view"}'`, + srv.URL, + )) + result := DryRun(context.Background(), DryRunConfig{ + PythonPath: "/bin/sh", + ScriptPath: script, + }) + if !result.OK { + t.Fatalf("ok = false, error = %q, mediaCheck = %+v", result.Error, result.MediaCheck) + } + if result.MediaCheck == nil || !result.MediaCheck.OK { + t.Fatalf("mediaCheck = %+v, want ok", result.MediaCheck) + } + if result.MediaCheck.Status != http.StatusPartialContent || result.MediaCheck.ContentLength != 4096 { + t.Fatalf("mediaCheck = %+v, want 206 with total 4096", result.MediaCheck) + } + if gotRange != "bytes=0-0" || gotReferer != "https://example.test/view" { + t.Fatalf("probe headers range=%q referer=%q", gotRange, gotReferer) + } +} + +func TestDryRunReportsBrokenMediaURL(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "forbidden", http.StatusForbidden) + })) + t.Cleanup(srv.Close) + + script := writeDryRunScript(t, fmt.Sprintf( + `echo '{"type":"item","title":"Dead Link","media_url":"%s/v.mp4"}'`, + srv.URL, + )) + result := DryRun(context.Background(), DryRunConfig{ + PythonPath: "/bin/sh", + ScriptPath: script, + }) + if result.OK { + t.Fatal("ok = true, want false for HTTP 403 media url") + } + if result.MediaCheck == nil || result.MediaCheck.OK || result.MediaCheck.Status != http.StatusForbidden { + t.Fatalf("mediaCheck = %+v, want failed 403", result.MediaCheck) + } + if len(result.Items) != 1 { + t.Fatalf("items = %d, want item still returned for debugging", len(result.Items)) + } +} + +func TestDryRunRejectsNonJSONStdout(t *testing.T) { + script := writeDryRunScript(t, `echo 'plain text progress output'`) + result := DryRun(context.Background(), DryRunConfig{ + PythonPath: "/bin/sh", + ScriptPath: script, + SkipMediaProbe: true, + }) + if result.OK { + t.Fatal("ok = true, want false for non-JSON stdout") + } + if !strings.Contains(result.Error, "JSON Lines") { + t.Fatalf("error = %q, want JSON Lines hint", result.Error) + } +} + +func TestDryRunTimesOut(t *testing.T) { + script := writeDryRunScript(t, `sleep 30`) + start := time.Now() + result := DryRun(context.Background(), DryRunConfig{ + PythonPath: "/bin/sh", + ScriptPath: script, + Timeout: 2 * time.Second, + SkipMediaProbe: true, + }) + if result.OK { + t.Fatal("ok = true, want false on timeout") + } + if !strings.Contains(result.Error, "超时") { + t.Fatalf("error = %q, want timeout message", result.Error) + } + if elapsed := time.Since(start); elapsed > 10*time.Second { + t.Fatalf("dry run took %s, script was not killed", elapsed) + } +} + +func TestDryRunMissingScript(t *testing.T) { + result := DryRun(context.Background(), DryRunConfig{ + PythonPath: "/bin/sh", + ScriptPath: filepath.Join(t.TempDir(), "missing.py"), + }) + if result.OK || result.Error == "" { + t.Fatalf("result = %+v, want error for missing script", result) + } +} diff --git a/backend/internal/drives/scriptcrawler/metadata.go b/backend/internal/drives/scriptcrawler/metadata.go new file mode 100644 index 0000000..f79334d --- /dev/null +++ b/backend/internal/drives/scriptcrawler/metadata.go @@ -0,0 +1,117 @@ +package scriptcrawler + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" +) + +const maxCrawlerNameRunes = 80 + +type Metadata struct { + Name string `json:"name"` +} + +func ReadMetadata(scriptPath string) (Metadata, error) { + scriptPath = strings.TrimSpace(scriptPath) + if scriptPath == "" { + return Metadata{}, errors.New("脚本路径为空") + } + if filepath.Ext(scriptPath) != ".py" { + return Metadata{}, errors.New("目前只支持 .py 爬虫脚本") + } + data, err := os.ReadFile(scriptPath) + if err != nil { + return Metadata{}, fmt.Errorf("读取脚本失败: %w", err) + } + return ExtractMetadata(string(data)) +} + +func ExtractMetadata(source string) (Metadata, error) { + for _, line := range strings.Split(source, "\n") { + trimmed := strings.TrimSpace(line) + if trimmed == "" || strings.HasPrefix(trimmed, "#") { + continue + } + if !strings.HasPrefix(trimmed, "CRAWLER_NAME") { + continue + } + left, right, ok := strings.Cut(trimmed, "=") + if !ok || strings.TrimSpace(left) != "CRAWLER_NAME" { + continue + } + name, ok := parsePythonStringLiteral(right) + if !ok { + return Metadata{}, errors.New(`CRAWLER_NAME 必须是字符串字面量,例如 CRAWLER_NAME = "示例爬虫"`) + } + name = strings.TrimSpace(name) + if name == "" { + return Metadata{}, errors.New("CRAWLER_NAME 不能为空") + } + if len([]rune(name)) > maxCrawlerNameRunes { + return Metadata{}, fmt.Errorf("CRAWLER_NAME 不能超过 %d 个字符", maxCrawlerNameRunes) + } + return Metadata{Name: name}, nil + } + return Metadata{}, errors.New(`脚本必须声明 CRAWLER_NAME,例如 CRAWLER_NAME = "示例爬虫"`) +} + +func parsePythonStringLiteral(raw string) (string, bool) { + s := strings.TrimSpace(raw) + if s == "" { + return "", false + } + rawString := false + for len(s) > 0 { + switch s[0] { + case 'r', 'R': + rawString = true + s = strings.TrimSpace(s[1:]) + case 'u', 'U', 'b', 'B': + s = strings.TrimSpace(s[1:]) + default: + goto parseQuote + } + } + +parseQuote: + if len(s) < 2 || (s[0] != '"' && s[0] != '\'') { + return "", false + } + quote := s[0] + var b strings.Builder + escaped := false + for i := 1; i < len(s); i++ { + ch := s[i] + if escaped { + switch { + case rawString: + b.WriteByte('\\') + b.WriteByte(ch) + case ch == 'n': + b.WriteByte('\n') + case ch == 'r': + b.WriteByte('\r') + case ch == 't': + b.WriteByte('\t') + case ch == '\\' || ch == quote || ch == '"' || ch == '\'': + b.WriteByte(ch) + default: + b.WriteByte(ch) + } + escaped = false + continue + } + if ch == '\\' { + escaped = true + continue + } + if ch == quote { + return b.String(), true + } + b.WriteByte(ch) + } + return "", false +} diff --git a/backend/internal/drives/scriptcrawler/metadata_test.go b/backend/internal/drives/scriptcrawler/metadata_test.go new file mode 100644 index 0000000..5eb0bd8 --- /dev/null +++ b/backend/internal/drives/scriptcrawler/metadata_test.go @@ -0,0 +1,39 @@ +package scriptcrawler + +import ( + "strings" + "testing" +) + +func TestExtractMetadataReadsCrawlerName(t *testing.T) { + meta, err := ExtractMetadata(` +# comment +CRAWLER_NAME = "示例爬虫" +`) + if err != nil { + t.Fatalf("extract metadata: %v", err) + } + if meta.Name != "示例爬虫" { + t.Fatalf("name = %q", meta.Name) + } +} + +func TestExtractMetadataRejectsMissingCrawlerName(t *testing.T) { + _, err := ExtractMetadata(`print("hello")`) + if err == nil { + t.Fatal("expected error") + } + if !strings.Contains(err.Error(), "CRAWLER_NAME") { + t.Fatalf("error = %v, want CRAWLER_NAME guidance", err) + } +} + +func TestExtractMetadataRejectsEmptyCrawlerName(t *testing.T) { + _, err := ExtractMetadata(`CRAWLER_NAME = " "`) + if err == nil { + t.Fatal("expected error") + } + if !strings.Contains(err.Error(), "不能为空") { + t.Fatalf("error = %v, want empty-name error", err) + } +} diff --git a/deploy.sh b/deploy.sh index 84b822d..d938bcd 100755 --- a/deploy.sh +++ b/deploy.sh @@ -134,9 +134,9 @@ apt_install() { python3 python3-requests python3-bs4 python3-lxml python3-socks } -verify_spider91_python_deps() { - command -v python3 >/dev/null 2>&1 || die "python3 is required for 91Spider" - python3 - <<'PY' || die "missing Python modules for 91Spider: requests, bs4, lxml, socks" +verify_crawler_python_deps() { + command -v python3 >/dev/null 2>&1 || die "python3 is required for crawler scripts" + python3 - <<'PY' || die "missing Python modules for crawler scripts: requests, bs4, lxml, socks" import importlib.util import sys @@ -200,7 +200,7 @@ install_dependencies() { install_go command -v ffmpeg >/dev/null 2>&1 || die "ffmpeg is required" command -v ffprobe >/dev/null 2>&1 || die "ffprobe is required" - verify_spider91_python_deps + verify_crawler_python_deps } ensure_ownership() { diff --git a/docs/crawler-protocol.md b/docs/crawler-protocol.md index b2d3357..e6abd3b 100644 --- a/docs/crawler-protocol.md +++ b/docs/crawler-protocol.md @@ -6,6 +6,14 @@ task status and cancellation. ## Invocation +Every script must declare a static crawler name near the top of the Python file. +The admin page reads this value when importing the script; users do not type the +crawler name manually. + +```python +CRAWLER_NAME = "Example Crawler" +``` + The backend runs: ```bash @@ -35,12 +43,12 @@ python3 /path/to/crawler.py --job /path/to/job.json ## Importing Scripts Crawler scripts are configured from the admin crawler page. A script can be -entered as an existing server path, uploaded as a local file, or imported from -an HTTP(S) URL. +uploaded as a local file or imported from an HTTP(S) URL. Imported scripts are copied into `crawler-scripts/` next to the configured local preview data directory. The import API currently accepts Python files only -(`.py`) and rejects empty files or files larger than 2 MiB. +(`.py`) and rejects empty files, files larger than 2 MiB, or scripts without +`CRAWLER_NAME`. ## Output diff --git a/examples/crawlers/simple_crawler.py b/examples/crawlers/simple_crawler.py index 733b597..01671ac 100644 --- a/examples/crawlers/simple_crawler.py +++ b/examples/crawlers/simple_crawler.py @@ -3,6 +3,8 @@ import argparse import json import sys +CRAWLER_NAME = "Demo Crawler" + def load_seen(path): try: diff --git a/install.sh b/install.sh index 33d0668..0f07dc5 100755 --- a/install.sh +++ b/install.sh @@ -128,7 +128,7 @@ verify_runtime_deps() { command -v "$cmd" >/dev/null 2>&1 || die "missing command: $cmd" done - python3 - <<'PY' || die "missing Python modules for 91Spider: requests, bs4, lxml, socks" + python3 - <<'PY' || die "missing Python modules for crawler scripts: requests, bs4, lxml, socks" import importlib.util import sys diff --git a/scripts/build-release.sh b/scripts/build-release.sh index a90fa20..84d7343 100755 --- a/scripts/build-release.sh +++ b/scripts/build-release.sh @@ -63,8 +63,6 @@ build_package() { cp "$ROOT_DIR/backend/config.example.yaml" "$work/config.example.yaml" cp "$ROOT_DIR/install.sh" "$work/install.sh" cp -R "$ROOT_DIR/dist" "$work/dist" - mkdir -p "$work/91VideoSpider" - cp "$ROOT_DIR/91VideoSpider/spider_91porn.py" "$work/91VideoSpider/spider_91porn.py" cat >"$work/README.txt" <(null); const [mode, setMode] = useState<"list" | "detail">("list"); const { show } = useToast(); @@ -44,6 +57,15 @@ export function CrawlersPage() { () => list.find((item) => item.id === selectedId) ?? null, [list, selectedId] ); + const stats = useMemo(() => { + const running = list.filter((item) => item.scanGenerationStatus?.state === "scanning").length; + return { + total: list.length, + ready: list.filter((item) => item.status === "ok").length, + running, + error: list.filter((item) => item.status === "error").length, + }; + }, [list]); async function refresh() { setLoading(true); @@ -64,15 +86,13 @@ export function CrawlersPage() { function selectCrawler(crawler: api.AdminCrawler) { setSelectedId(crawler.id); setMode("detail"); + setTestResult(null); setForm({ id: crawler.id, name: crawler.name, - builtin: crawler.builtin ?? "", scriptPath: crawler.scriptPath ?? "", - pythonPath: crawler.pythonPath || "python3", - targetNew: crawler.targetNew || (crawler.builtin === "spider91" || crawler.kind === "spider91" ? "15" : "10"), + targetNew: crawler.targetNew || "10", proxy: crawler.proxy ?? "", - configJson: crawler.configJson ?? "", }); } @@ -80,20 +100,7 @@ export function CrawlersPage() { setSelectedId(""); setForm(emptyForm); setScriptURL(""); - setMode("detail"); - } - - function createSpider91() { - setSelectedId(""); - setForm({ - ...emptyForm, - id: "spider91", - name: "91 爬虫", - builtin: "spider91", - scriptPath: "", - targetNew: "15", - }); - setScriptURL(""); + setTestResult(null); setMode("detail"); } @@ -101,6 +108,7 @@ export function CrawlersPage() { setSelectedId(""); setForm(emptyForm); setScriptURL(""); + setTestResult(null); setMode("list"); } @@ -110,33 +118,24 @@ export function CrawlersPage() { async function save() { const id = form.id.trim(); - const name = form.name.trim(); - if (!id || !name) { - show("请填写爬虫 ID 和名称", "error"); - return; - } - if (!form.builtin && !form.scriptPath.trim()) { + if (!form.scriptPath.trim()) { show("请先导入爬虫脚本", "error"); return; } setSaving(true); try { const resp = await api.upsertCrawler({ - id, - name, - builtin: form.builtin, + id: id || undefined, scriptPath: form.scriptPath.trim(), - pythonPath: form.pythonPath.trim(), targetNew: form.targetNew.trim(), proxy: form.proxy.trim(), - configJson: form.configJson.trim(), }); if (resp.warning) { show(`已保存,但初始化失败:${resp.warning}`, "error"); } else { show("已保存", "success"); } - setSelectedId(id); + setSelectedId(resp.id || id); await refresh(); setMode("list"); } catch (e) { @@ -152,6 +151,8 @@ export function CrawlersPage() { try { const resp = await api.importCrawlerScriptFile(file); set("scriptPath", resp.scriptPath); + set("name", resp.name); + setTestResult(null); show("脚本已导入", "success"); } catch (e) { show(e instanceof Error ? e.message : "导入失败", "error"); @@ -170,7 +171,9 @@ export function CrawlersPage() { try { const resp = await api.importCrawlerScriptURL(url); set("scriptPath", resp.scriptPath); + set("name", resp.name); setScriptURL(""); + setTestResult(null); show("脚本已导入", "success"); } catch (e) { show(e instanceof Error ? e.message : "导入失败", "error"); @@ -179,6 +182,32 @@ export function CrawlersPage() { } } + async function testScript() { + const scriptPath = form.scriptPath.trim(); + if (!scriptPath) { + show("请先导入爬虫脚本", "error"); + return; + } + setTestingScript(true); + setTestResult(null); + try { + const result = await api.testCrawlerScript({ + scriptPath, + proxy: form.proxy.trim(), + }); + setTestResult(result); + if (result.ok) { + show("测试通过", "success"); + } else { + show(crawlerTestFailure(result) || "测试失败", "error"); + } + } catch (e) { + show(e instanceof Error ? e.message : "测试失败", "error"); + } finally { + setTestingScript(false); + } + } + async function run(crawler: api.AdminCrawler) { setRunningId(crawler.id); try { @@ -210,10 +239,16 @@ export function CrawlersPage() { } async function remove(crawler: api.AdminCrawler) { - if (!window.confirm(`删除爬虫 ${crawler.name} 并清理它导入的视频?`)) return; + if (!window.confirm(`删除爬虫 ${crawler.name} 的脚本和配置?已爬取的视频会保留。`)) return; try { const resp = await api.deleteCrawler(crawler.id); - show(`已删除,并清理 ${resp.deletedVideos ?? 0} 个视频`, "success"); + if (resp.warning) { + show(`已删除爬虫配置,但脚本文件清理失败:${resp.warning}`, "error"); + } else if (resp.deletedScript) { + show("已删除爬虫配置和脚本文件,已爬取视频保留", "success"); + } else { + show("已删除爬虫配置,已爬取视频保留", "success"); + } setSelectedId(""); setForm(emptyForm); setMode("list"); @@ -243,63 +278,74 @@ export function CrawlersPage() { {mode === "list" ? ( -
-
- 已配置爬虫 -
- {loading ? ( -
加载中...
- ) : list.length === 0 ? ( -
暂无爬虫
- ) : ( -
- {list.map((crawler) => ( - - ))} +
+
+ } /> + } tone="ok" /> + } tone="info" /> + } tone="error" /> +
+ +
+
+
+ 已配置爬虫 +
+
- )} + {loading ? ( +
+ + 加载中... +
+ ) : list.length === 0 ? ( +
+ + 暂无爬虫 + +
+ ) : ( +
+ {list.map((crawler) => ( + selectCrawler(crawler)} + onRun={() => run(crawler)} + onStop={() => stop(crawler)} + /> + ))} +
+ )} +
) : ( -
-
-
- {selected ? "爬虫配置" : "添加爬虫"} -
-
- {!selected && ( -
- - -
- )} -
- - set("id", e.target.value)} disabled={!!selected} /> +
+
+
+
+ + 基础信息
-
- - set("name", e.target.value)} /> +
+ 脚本名称 + {form.name || "导入脚本后自动读取"}
- {!form.builtin && ( +
+ +
+
+ + 脚本导入与测试 +
+
@@ -327,19 +373,59 @@ export function CrawlersPage() { +
{form.scriptPath &&
脚本已导入
} + {testResult && }
- )} -
- - set("targetNew", e.target.value)} placeholder="10" />
-
- - set("proxy", e.target.value)} placeholder="http://127.0.0.1:7890" /> +
+ +
+
+ + 运行参数
-
+
+
+ + set("targetNew", e.target.value)} placeholder="10" /> +
+
+ + { + set("proxy", e.target.value); + setTestResult(null); + }} + placeholder="http://127.0.0.1:7890" + /> +
+
+
+
+ +
- {selected && ( -
-
- 状态 -
-
- - - - + {selected && ( +
+
+ + 任务状态 +
+
+ + + + +
+ {selected.lastError &&
{selected.lastError}
}
- {selected.lastError &&
{selected.lastError}
} -
- )} + )} +
)} ); } +function CrawlerMetric({ label, value, icon, tone }: { label: string; value: number; icon: ReactNode; tone?: "ok" | "info" | "error" }) { + return ( +
+ {icon} + {label} + {value} +
+ ); +} + +function CrawlerRow({ + crawler, + active, + running, + stopping, + onSelect, + onRun, + onStop, +}: { + crawler: api.AdminCrawler; + active: boolean; + running: boolean; + stopping: boolean; + onSelect: () => void; + onRun: () => void; + onStop: () => void; +}) { + return ( +
+ +
+ + + + +
+
+ {crawler.targetNew || "10"} 条 + {formatLastCrawl(crawler.lastCrawlAt)} +
+
+ + + +
+
+ ); +} + +function CrawlerStateChip({ label, status }: { label: string; status?: api.DriveGenerationStatus }) { + const state = status?.state || "idle"; + return ( + + {label} · {label === "抓取" && state === "scanning" ? "抓取中" : generationStateLabel(state)} + + ); +} + function CrawlerStatus({ label, status }: { label: string; status?: api.DriveGenerationStatus }) { const state = status?.state || "idle"; const labelText = label === "抓取" && state === "scanning" ? "抓取中" : generationStateLabel(state); @@ -400,3 +563,89 @@ function CrawlerStatus({ label, status }: { label: string; status?: api.DriveGen
); } + +function CrawlerTestResult({ result }: { result: api.CrawlerDryRunResult }) { + const item = result.items[0]; + const failure = crawlerTestFailure(result); + const media = result.mediaCheck; + const statusText = result.ok ? "测试通过" : "测试失败"; + + return ( +
+
+ {statusText} + 抓取到 {result.items.length} 条视频 + {result.durationMs > 0 && {Math.round(result.durationMs / 1000)} 秒} +
+ + {failure &&
{failure}
} + + {item && ( +
+ + + + + +
+ )} + + {media && ( +
+ 直链校验 + + {media.ok ? "可访问" : "不可访问"} + {media.status ? ` · HTTP ${media.status}` : ""} + {media.contentType ? ` · ${media.contentType}` : ""} + {media.contentLengthBytes ? ` · ${formatBytes(media.contentLengthBytes)}` : ""} + +
+ )} + + {result.log && result.log.length > 0 && ( +
+ 脚本日志 +
{result.log.join("\n")}
+
+ )} +
+ ); +} + +function CrawlerTestField({ label, value }: { label: string; value?: string | number }) { + if (value === undefined || value === "") return null; + return ( +
+ {label} + {value} +
+ ); +} + +function crawlerTestFailure(result: api.CrawlerDryRunResult) { + return result.error || result.mediaCheck?.error || ""; +} + +function crawlerStatusLabel(crawler: api.AdminCrawler) { + if (crawler.status === "ok") return "已就绪"; + if (crawler.status === "error") return "错误"; + return "未连接"; +} + +function formatLastCrawl(ts?: number) { + if (!ts) return "未抓取"; + return new Date(ts * 1000).toLocaleString("zh-CN", { + month: "2-digit", + day: "2-digit", + hour: "2-digit", + minute: "2-digit", + }); +} + +function formatBytes(bytes: number) { + if (!Number.isFinite(bytes) || bytes <= 0) return ""; + if (bytes >= 1024 * 1024 * 1024) return `${(bytes / 1024 / 1024 / 1024).toFixed(1)} GB`; + if (bytes >= 1024 * 1024) return `${(bytes / 1024 / 1024).toFixed(1)} MB`; + if (bytes >= 1024) return `${(bytes / 1024).toFixed(1)} KB`; + return `${bytes} B`; +} diff --git a/src/admin/api.ts b/src/admin/api.ts index 8bc56a8..25b9b19 100644 --- a/src/admin/api.ts +++ b/src/admin/api.ts @@ -195,14 +195,11 @@ export type AdminCrawler = { id: string; name: string; kind: "scriptcrawler" | "spider91"; - builtin?: string; status: string; lastError?: string; scriptPath: string; - pythonPath?: string; proxy?: string; targetNew?: string; - configJson?: string; lastCrawlAt?: number; scanGenerationStatus?: DriveGenerationStatus; thumbnailGenerationStatus?: DriveGenerationStatus; @@ -220,18 +217,41 @@ export type AdminCrawler = { }; export type UpsertCrawlerInput = { - id: string; - name: string; - builtin?: string; + id?: string; scriptPath: string; - pythonPath?: string; proxy?: string; targetNew?: string; - configJson?: string; }; export type ImportCrawlerScriptResult = { scriptPath: string; + name: string; +}; + +export type CrawlerDryRunItem = { + title: string; + sourceId?: string; + mediaUrl?: string; + mediaLocalFile?: string; + thumbnailUrl?: string; + detailUrl?: string; +}; + +export type CrawlerDryRunMediaCheck = { + ok: boolean; + status?: number; + contentType?: string; + contentLengthBytes?: number; + error?: string; +}; + +export type CrawlerDryRunResult = { + ok: boolean; + items: CrawlerDryRunItem[]; + mediaCheck?: CrawlerDryRunMediaCheck; + error?: string; + log?: string[]; + durationMs: number; }; export function listCrawlers() { @@ -239,7 +259,7 @@ export function listCrawlers() { } export function upsertCrawler(body: UpsertCrawlerInput) { - return request<{ ok: boolean; warning?: string }>("/crawlers", { + return request<{ ok: boolean; id: string; warning?: string }>("/crawlers", { method: "POST", body: JSON.stringify(body), }); @@ -261,6 +281,13 @@ export function importCrawlerScriptURL(url: string) { }); } +export function testCrawlerScript(body: { scriptPath: string; proxy?: string }) { + return request("/crawlers/test-script", { + method: "POST", + body: JSON.stringify(body), + }); +} + export function runCrawler(id: string) { return request<{ ok: boolean; accepted: boolean; message?: string; status?: NightlyJobStatus }>( `/crawlers/${encodeURIComponent(id)}/run`, @@ -276,9 +303,8 @@ export function stopCrawlerTasks(id: string) { } export function deleteCrawler(id: string) { - return request<{ ok: boolean; deletedVideos: number }>(`/crawlers/${encodeURIComponent(id)}`, { + return request<{ ok: boolean; deletedVideos: number; deletedScript?: boolean; warning?: string }>(`/crawlers/${encodeURIComponent(id)}`, { method: "DELETE", - body: JSON.stringify({ deleteVideos: true }), }); } diff --git a/src/admin/drive/constants.ts b/src/admin/drive/constants.ts index b87919a..7e7e444 100644 --- a/src/admin/drive/constants.ts +++ b/src/admin/drive/constants.ts @@ -163,7 +163,7 @@ export function credentialHelp(kind: Kind, isEdit: boolean): string { case "localstorage": return `填写服务器可访问的本地目录绝对路径,例如 /mnt/videos。系统会扫描该目录及子目录中的视频文件和 .strm 文件;.strm 可指向 HTTP/HTTPS 直链,或指向本地存储根目录内的真实视频路径。Docker 部署时请填写容器内路径。${note}`; case "spider91": - return "91Spider 不再支持通过网盘添加或编辑。请到后台爬虫管理页面添加内置 91 或自定义爬虫脚本。"; + return "91Spider 不再支持通过网盘添加或编辑。请到后台爬虫管理页面添加爬虫脚本。"; default: return ""; } diff --git a/src/styles/admin.css b/src/styles/admin.css index 468f3b1..6be72b1 100644 --- a/src/styles/admin.css +++ b/src/styles/admin.css @@ -335,6 +335,541 @@ margin-bottom: var(--space-3); } +/* ========================================================= + * Crawler Management + * ========================================================= */ +.admin-crawler-console { + display: grid; + gap: var(--space-4); +} + +.admin-crawler-overview { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: var(--space-3); +} + +.admin-crawler-metric { + display: grid; + grid-template-columns: 38px minmax(0, 1fr); + grid-template-areas: + "icon label" + "icon value"; + align-items: center; + min-height: 76px; + padding: var(--space-4); + border: 1px solid var(--border-subtle); + border-radius: var(--radius-sm); + background: var(--bg-surface); + box-shadow: var(--shadow-sm); +} + +.admin-crawler-metric__icon { + grid-area: icon; + width: 34px; + height: 34px; + display: grid; + place-items: center; + border-radius: var(--radius-xs); + color: var(--accent); + background: var(--accent-soft); +} + +.admin-crawler-metric span:not(.admin-crawler-metric__icon) { + grid-area: label; + color: var(--text-faint); + font-size: var(--font-xs); + font-weight: var(--weight-medium); +} + +.admin-crawler-metric strong { + grid-area: value; + color: var(--text-strong); + font-size: var(--font-2xl); + font-weight: var(--weight-bold); + line-height: 1.1; + font-variant-numeric: tabular-nums; +} + +.admin-crawler-metric.is-ok .admin-crawler-metric__icon { + color: var(--success); + background: var(--success-soft); +} + +.admin-crawler-metric.is-info .admin-crawler-metric__icon { + color: var(--info); + background: var(--info-soft); +} + +.admin-crawler-metric.is-error .admin-crawler-metric__icon { + color: var(--danger); + background: var(--danger-soft); +} + +.admin-crawler-list { + padding: 0; + overflow: hidden; +} + +.admin-crawler-list__head { + display: flex; + align-items: center; + justify-content: space-between; + gap: var(--space-3); + padding: var(--space-4) var(--space-5); + border-bottom: 1px solid var(--border-subtle); +} + +.admin-crawler-list__head .admin-card__title { + margin-bottom: 0; +} + +.admin-spin { + animation: admin-update-spin 0.9s linear infinite; + transform-box: fill-box; + transform-origin: center; + will-change: transform; +} + +.admin-crawler-empty { + min-height: 280px; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + gap: var(--space-3); + padding: var(--space-7) var(--space-4); + color: var(--text-faint); + text-align: center; +} + +.admin-crawler-empty svg { + color: var(--accent); +} + +.admin-crawler-empty strong { + color: var(--text-strong); + font-size: var(--font-lg); +} + +.admin-crawler-table { + display: grid; +} + +.admin-crawler-row { + display: grid; + grid-template-columns: minmax(260px, 1.25fr) minmax(260px, 1fr) minmax(150px, 0.55fr) auto; + align-items: center; + gap: var(--space-3); + padding: var(--space-3) var(--space-5); + border-bottom: 1px solid var(--border-subtle); + background: transparent; + transition: background var(--transition-fast), border-color var(--transition-fast); +} + +.admin-crawler-row:last-child { + border-bottom: 0; +} + +.admin-crawler-row:hover, +.admin-crawler-row.is-active { + background: rgba(255, 255, 255, 0.025); +} + +.admin-crawler-row.is-active { + box-shadow: inset 3px 0 0 var(--accent); +} + +.admin-crawler-row__main { + appearance: none; + width: 100%; + min-width: 0; + display: grid; + grid-template-columns: 38px minmax(0, 1fr) auto 18px; + align-items: center; + gap: var(--space-3); + padding: 0; + border: 0; + background: transparent; + color: inherit; + font: inherit; + text-align: left; + cursor: pointer; +} + +.admin-crawler-row__main:focus-visible { + outline: 2px solid var(--accent); + outline-offset: 4px; + border-radius: var(--radius-sm); +} + +.admin-crawler-row__brand { + width: 38px; + height: 38px; + display: grid; + place-items: center; + border-radius: var(--radius-xs); + color: var(--accent); + background: var(--accent-soft); + border: 1px solid rgba(255, 138, 60, 0.2); +} + +.admin-crawler-row__title-wrap { + min-width: 0; + display: grid; + gap: 3px; +} + +.admin-crawler-row__title-wrap strong { + min-width: 0; + color: var(--text-strong); + font-size: var(--font-md); + font-weight: var(--weight-semibold); + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.admin-crawler-row__title-wrap span { + color: var(--text-faint); + font-size: var(--font-xs); +} + +.admin-crawler-row__chevron { + color: var(--text-faint); + transition: transform var(--transition-fast), color var(--transition-fast); +} + +.admin-crawler-row__main:hover .admin-crawler-row__chevron { + color: var(--text-strong); + transform: translateX(2px); +} + +.admin-crawler-row__states { + display: flex; + flex-wrap: wrap; + align-items: center; + gap: 6px; + min-width: 0; +} + +.admin-crawler-state-chip { + display: inline-flex; + align-items: center; + min-height: 24px; + padding: 3px 8px; + border-radius: var(--radius-pill); + border: 1px solid transparent; + font-size: var(--font-xs); + font-weight: var(--weight-medium); + white-space: nowrap; +} + +.admin-crawler-state-chip.is-idle { + color: var(--success); + background: var(--success-soft); +} + +.admin-crawler-state-chip.is-generating { + color: var(--info); + background: var(--info-soft); +} + +.admin-crawler-state-chip.is-cooling { + color: var(--warning); + background: var(--warning-soft); +} + +.admin-crawler-state-chip.is-queued { + color: var(--text-muted); + background: rgba(255, 255, 255, 0.06); + border-color: var(--border-subtle); +} + +.admin-crawler-row__meta { + display: grid; + gap: 6px; + color: var(--text-muted); + font-size: var(--font-xs); +} + +.admin-crawler-row__meta span { + display: inline-flex; + align-items: center; + gap: 6px; + min-width: 0; + white-space: nowrap; +} + +.admin-crawler-row__actions { + display: flex; + align-items: center; + justify-content: flex-end; + flex-wrap: wrap; + gap: var(--space-2); +} + +.admin-crawler-editor { + display: grid; + grid-template-columns: minmax(0, 1fr) 320px; + gap: var(--space-5); + align-items: start; +} + +.admin-crawler-editor__main { + display: grid; + gap: var(--space-4); + min-width: 0; +} + +.admin-crawler-editor__side { + display: grid; + gap: var(--space-4); + position: sticky; + top: var(--space-5); + min-width: 0; +} + +.admin-crawler-section, +.admin-crawler-action-panel, +.admin-crawler-side-panel { + border: 1px solid var(--border-subtle); + border-radius: var(--radius-sm); + background: var(--bg-surface); + box-shadow: var(--shadow-sm); +} + +.admin-crawler-section { + padding: var(--space-5); +} + +.admin-crawler-section__head { + display: flex; + align-items: center; + gap: var(--space-2); + margin-bottom: var(--space-4); +} + +.admin-crawler-section__icon { + width: 28px; + height: 28px; + display: grid; + place-items: center; + border-radius: var(--radius-xs); + color: var(--accent); + background: var(--accent-soft); + flex: 0 0 auto; +} + +.admin-crawler-section__title { + color: var(--text-strong); + font-size: var(--font-md); + font-weight: var(--weight-semibold); +} + +.admin-crawler-section .admin-form { + max-width: 100%; +} + +.admin-crawler-script-name { + display: grid; + grid-template-columns: 92px minmax(0, 1fr); + gap: var(--space-3); + align-items: center; + min-height: 42px; + padding: var(--space-3); + border: 1px solid var(--border-subtle); + border-radius: var(--radius-sm); + background: var(--bg-sunken); +} + +.admin-crawler-script-name span { + color: var(--text-faint); + font-size: var(--font-xs); + font-weight: var(--weight-medium); +} + +.admin-crawler-script-name strong { + min-width: 0; + color: var(--text-strong); + font-size: var(--font-sm); + font-weight: var(--weight-semibold); + overflow-wrap: anywhere; +} + +.admin-crawler-params { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: var(--space-4); +} + +.admin-crawler-action-panel { + padding: var(--space-4); +} + +.admin-crawler-action-panel__head { + display: flex; + align-items: center; + gap: var(--space-3); + padding-bottom: var(--space-4); + border-bottom: 1px solid var(--border-subtle); +} + +.admin-crawler-action-panel__mark { + width: 42px; + height: 42px; + display: grid; + place-items: center; + border-radius: var(--radius-xs); + color: var(--text-on-accent); + background: var(--accent); + box-shadow: var(--shadow-sm); +} + +.admin-crawler-action-panel__head > div { + display: grid; + gap: 3px; + min-width: 0; +} + +.admin-crawler-action-panel__head strong { + color: var(--text-strong); + font-size: var(--font-md); + font-weight: var(--weight-semibold); +} + +.admin-crawler-action-panel__head > div span { + color: var(--text-faint); + font-size: var(--font-xs); +} + +.admin-crawler-action-panel__buttons { + display: grid; + gap: var(--space-2); + padding-top: var(--space-4); +} + +.admin-crawler-action-panel__buttons .admin-btn { + width: 100%; +} + +.admin-crawler-side-panel { + padding: var(--space-4); +} + +.admin-crawler-side-panel .admin-crawler-section__head { + margin-bottom: var(--space-3); +} + +.admin-crawler-status-grid { + display: grid; + gap: var(--space-3); +} + +.admin-crawler-status-grid .admin-gen-col { + background: var(--bg-sunken); +} + +.admin-btn[aria-disabled="true"] { + opacity: 0.45; + cursor: not-allowed; + pointer-events: none; +} + +@media (max-width: 1180px) { + .admin-crawler-row { + grid-template-columns: minmax(260px, 1fr) minmax(220px, 0.9fr); + } + + .admin-crawler-row__meta, + .admin-crawler-row__actions { + grid-column: 1 / -1; + } + + .admin-crawler-row__meta { + display: flex; + flex-wrap: wrap; + gap: var(--space-3); + } + + .admin-crawler-row__actions { + justify-content: flex-start; + } +} + +@media (max-width: 1024px) { + .admin-crawler-editor { + grid-template-columns: 1fr; + } + + .admin-crawler-editor__side { + position: static; + } + + .admin-crawler-action-panel__buttons { + grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); + } +} + +@media (max-width: 760px) { + .admin-crawler-overview { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + + .admin-crawler-list__head { + align-items: stretch; + flex-direction: column; + } + + .admin-crawler-list__head .admin-btn { + width: 100%; + } + + .admin-crawler-row { + grid-template-columns: 1fr; + padding: var(--space-4); + } + + .admin-crawler-row__main { + grid-template-columns: 38px minmax(0, 1fr) auto; + } + + .admin-crawler-row__main .admin-status { + justify-self: start; + grid-column: 2 / 3; + } + + .admin-crawler-row__chevron { + grid-column: 3 / 4; + grid-row: 1 / 3; + } + + .admin-crawler-params { + grid-template-columns: 1fr; + } + + .admin-crawler-script-name { + grid-template-columns: 1fr; + gap: 4px; + } + + .admin-crawler-row__actions .admin-btn { + flex: 1 1 120px; + } +} + +@media (max-width: 520px) { + .admin-crawler-overview { + grid-template-columns: 1fr; + } + + .admin-crawler-section, + .admin-crawler-action-panel, + .admin-crawler-side-panel { + padding: var(--space-4); + } +} + /* ----- Storage summary ----- */ .admin-storage-summary { display: grid; @@ -475,7 +1010,7 @@ .admin-crawler-import { display: grid; - grid-template-columns: auto minmax(180px, 1fr) auto; + grid-template-columns: auto minmax(180px, 1fr) auto auto; gap: var(--space-2); align-items: center; } @@ -494,6 +1029,87 @@ line-height: var(--line-relaxed); } +.admin-crawler-test-result { + display: grid; + gap: var(--space-3); + margin-top: var(--space-3); + padding: var(--space-3); + border: 1px solid var(--border-subtle); + border-radius: var(--radius-sm); + background: var(--bg-elevated); +} + +.admin-crawler-test-result.is-ok { + border-color: var(--success); +} + +.admin-crawler-test-result.is-error { + border-color: var(--danger); +} + +.admin-crawler-test-result__head { + display: flex; + align-items: center; + gap: var(--space-2); + flex-wrap: wrap; + font-size: var(--font-xs); + color: var(--text-muted); +} + +.admin-crawler-test-result__error { + padding: var(--space-2) var(--space-3); + border-radius: var(--radius-sm); + background: var(--danger-soft); + color: var(--danger); + font-size: var(--font-sm); + line-height: var(--line-relaxed); + word-break: break-word; +} + +.admin-crawler-test-result__grid { + display: grid; + gap: var(--space-2); +} + +.admin-crawler-test-result__field, +.admin-crawler-test-result__media { + display: grid; + grid-template-columns: 82px minmax(0, 1fr); + gap: var(--space-2); + align-items: baseline; + font-size: var(--font-xs); +} + +.admin-crawler-test-result__field span, +.admin-crawler-test-result__media span { + color: var(--text-faint); +} + +.admin-crawler-test-result__field strong, +.admin-crawler-test-result__media strong { + color: var(--text-strong); + font-weight: var(--weight-medium); + min-width: 0; + overflow-wrap: anywhere; +} + +.admin-crawler-test-result__log { + font-size: var(--font-xs); + color: var(--text-muted); +} + +.admin-crawler-test-result__log summary { + cursor: pointer; +} + +.admin-crawler-test-result__log pre { + margin: var(--space-2) 0 0; + max-height: 180px; + overflow: auto; + white-space: pre-wrap; + color: var(--text-muted); +} + .admin-p123-qr { display: grid; gap: var(--space-3); diff --git a/tests/adminDriveForm.test.ts b/tests/adminDriveForm.test.ts index 9f1044a..31cab3b 100644 --- a/tests/adminDriveForm.test.ts +++ b/tests/adminDriveForm.test.ts @@ -221,18 +221,29 @@ test("crawler management is a separate admin section", () => { assert.match(crawlerPageSource, /api\.deleteCrawler/); assert.match(crawlerPageSource, /api\.importCrawlerScriptFile/); assert.match(crawlerPageSource, /api\.importCrawlerScriptURL/); + assert.match(crawlerPageSource, /api\.testCrawlerScript/); assert.match(crawlerPageSource, /type="file"/); assert.match(crawlerPageSource, /链接导入/); + assert.match(crawlerPageSource, /测试脚本/); + assert.match(crawlerPageSource, /测试通过/); assert.doesNotMatch(crawlerPageSource, /新建脚本/); + assert.doesNotMatch(crawlerPageSource, /爬虫 ID/); + assert.doesNotMatch(crawlerPageSource, /crawler-id/); + assert.doesNotMatch(crawlerPageSource, /crawler-name/); assert.doesNotMatch(crawlerPageSource, /脚本路径/); assert.doesNotMatch(crawlerPageSource, /Python 解释器/); assert.doesNotMatch(crawlerPageSource, /自定义配置 JSON/); assert.doesNotMatch(crawlerPageSource, /Bot/); - assert.match(crawlerPageSource, /builtin:\s*"spider91"/); + // 项目不再内置任何爬虫:不允许出现内置 91 预设 + assert.doesNotMatch(crawlerPageSource, /builtin/); + assert.doesNotMatch(crawlerPageSource, /内置 91/); assert.match(apiSource, /type AdminCrawler/); assert.match(apiSource, /"\/crawlers"/); assert.match(apiSource, /"\/crawlers\/import-file"/); assert.match(apiSource, /"\/crawlers\/import-url"/); + assert.match(apiSource, /"\/crawlers\/test-script"/); + assert.match(apiSource, /type CrawlerDryRunResult/); + assert.match(apiSource, /id\?: string/); assert.match(apiSource, /new FormData\(\)/); assert.doesNotMatch(driveFormSource, /scriptcrawler/); });