mirror of
https://github.com/nianzhibai/91.git
synced 2026-06-15 00:44:30 +08:00
feat(crawler): simplify script crawler workflow
Redesign crawler management around imported Python scripts instead of built-in crawler storage. Crawler scripts now declare CRAWLER_NAME, imports validate metadata, crawler IDs are generated internally, and deleted crawler scripts are detached without deleting already imported videos. Add backend support for file and URL script imports, dry-run testing, metadata parsing, safer job paths, original filename preservation, and crawler listing that ignores detached script records. Remove the legacy built-in Spider91 script path flow and hidden Python/config JSON fields from the crawler API. Rework the admin crawler page into an independent crawler console with script import, dry-run testing, status metrics, spider iconography, and simplified controls. Update docs, examples, installer checks, Docker/release packaging, and tests for the new protocol.
This commit is contained in:
@@ -154,6 +154,7 @@ OUTPUT_FILE = "91porn_videos.json"
|
||||
MAX_PAGES = None # 设置为 None 爬取所有页,或设置整数如 5 只爬前5页
|
||||
RESUME = True # 是否跳过输出文件中已存在的 viewkey (断点续爬)
|
||||
MAX_EMPTY_PAGES = 2 # 连续空页数达到此值时停止爬取
|
||||
CRAWLER_NAME = "91Porn"
|
||||
CRAWLER_PROTOCOL = "crawler.v1"
|
||||
# ===================================================
|
||||
|
||||
|
||||
@@ -48,7 +48,6 @@ WORKDIR /opt/video-site-91
|
||||
COPY --from=backend /out/server ./server
|
||||
COPY --from=frontend /app/dist ./dist
|
||||
COPY backend/config.example.yaml ./config.example.yaml
|
||||
COPY 91VideoSpider/ ./91VideoSpider/
|
||||
COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
|
||||
|
||||
ARG VERSION=dev
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
- **多后端支持** — 兼容 115 云盘、PikPak 云盘、123云盘、OneDrive、Google Drive 和本地存储
|
||||
- **低带宽播放** — 115 云盘、PikPak 云盘、123云盘、OneDrive 都支持302模式,在线播放视频时,不占用服务器带宽,播放体验不受服务器带宽影响;Google Drive 不支持302模式,走服务器中转,观看体验会受服务器带宽影响
|
||||
- **封面 & 预览片段** — 自动为每个视频生成封面图和预览片段,首页快速选片
|
||||
- **91 爬虫** — 内置爬虫,支持抓取 91 本月最热视频
|
||||
- **脚本爬虫** — 不内置任何爬虫,支持在后台导入自定义爬虫脚本(上传 `.py` 文件 / 链接导入 / 服务器路径),按统一协议抓取视频
|
||||
- **双主题** — 黑黄经典主题 / 粉白清新主题,随时切换
|
||||
- **短视频模式** — 一键切换抖音风格,沉浸刷片
|
||||
- **低资源占用** — 2C2G 服务器稳定运行,主要性能消耗就是封面图和预览视频的生成
|
||||
|
||||
+1
-1
@@ -84,7 +84,7 @@ go run ./cmd/server 后端 9192
|
||||
|
||||
爬虫现在是独立后台栏目 `/admin/crawlers`,不再作为“网盘/存储类型”配置。脚本负责发现视频,后端负责去重、下载、入库、封面、预览视频和视频指纹。
|
||||
|
||||
脚本协议见 [docs/crawler-protocol.md](../docs/crawler-protocol.md)。后台支持上传 `.py` 文件或通过 HTTP(S) 脚本链接导入,导入后的脚本会保存到数据目录旁的 `crawler-scripts/`。内置 91 爬虫也支持同一套 `crawler.v1` job 协议;后台“内置 91”会自动使用仓库里的 `91VideoSpider/spider_91porn.py`。
|
||||
脚本协议见 [docs/crawler-protocol.md](../docs/crawler-protocol.md)。后台支持上传 `.py` 文件或通过 HTTP(S) 脚本链接导入,导入后的脚本会保存到数据目录旁的 `crawler-scripts/`。脚本必须声明 `CRAWLER_NAME`,后台会自动读取它作为爬虫名称。项目不内置任何爬虫脚本,所有爬虫都由用户自行导入。
|
||||
|
||||
## 添加一个盘
|
||||
|
||||
|
||||
@@ -239,9 +239,6 @@ func main() {
|
||||
SetSpider91UploadDriveID: func(id string) error {
|
||||
return app.SetSpider91UploadDriveID(ctx, id)
|
||||
},
|
||||
DefaultSpider91ScriptPath: func() string {
|
||||
return app.defaultSpider91ScriptPath()
|
||||
},
|
||||
OnRunNightlyJob: func() bool {
|
||||
if app.nightlyRunner != nil {
|
||||
return app.nightlyRunner.TriggerNow()
|
||||
@@ -881,30 +878,6 @@ func (a *App) commonThumbsDir() string {
|
||||
return filepath.Join(a.cfg.Storage.LocalPreviewDir, "thumbs")
|
||||
}
|
||||
|
||||
// defaultSpider91ScriptPath 推断仓库里爬虫脚本的默认路径。
|
||||
// 当前进程从 backend/ 启动时,脚本位于 ../91VideoSpider/spider_91porn.py。
|
||||
// 找不到时返回空字符串,上层会在 RunOnce 时报错提示用户手动填 script_path。
|
||||
func (a *App) defaultSpider91ScriptPath() string {
|
||||
candidates := []string{
|
||||
// 优先从配置目录的父目录定位
|
||||
filepath.Join(filepath.Dir(filepath.Dir(a.cfg.Storage.LocalPreviewDir)), "91VideoSpider", "spider_91porn.py"),
|
||||
// 仓库 root(cwd 在 backend/ 时)
|
||||
filepath.Join("..", "91VideoSpider", "spider_91porn.py"),
|
||||
// cwd 已经是仓库 root 时
|
||||
filepath.Join("91VideoSpider", "spider_91porn.py"),
|
||||
}
|
||||
for _, p := range candidates {
|
||||
abs, err := filepath.Abs(p)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if _, err := os.Stat(abs); err == nil {
|
||||
return abs
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// attachScriptCrawler 创建通用脚本爬虫 runner,并注册到 a.scriptCrawlers。
|
||||
func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) {
|
||||
pythonPath := strings.TrimSpace(d.Credentials["python_path"])
|
||||
@@ -913,9 +886,6 @@ func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) {
|
||||
}
|
||||
scriptPath := strings.TrimSpace(d.Credentials["script_path"])
|
||||
sourceKind := scriptCrawlerSourceKindForDrive(d)
|
||||
if scriptPath == "" && sourceKind == spider91.Kind {
|
||||
scriptPath = a.defaultSpider91ScriptPath()
|
||||
}
|
||||
proxyURL := strings.TrimSpace(d.Credentials["proxy"])
|
||||
configJSON := strings.TrimSpace(d.Credentials["config_json"])
|
||||
workDir := ""
|
||||
@@ -2442,7 +2412,7 @@ func (a *App) listSpider91DriveIDs(ctx context.Context) []string {
|
||||
}
|
||||
out := make([]string, 0, len(all))
|
||||
for _, d := range all {
|
||||
if d != nil && d.Kind == scriptcrawler.Kind {
|
||||
if d != nil && d.Kind == scriptcrawler.Kind && strings.TrimSpace(d.Credentials["script_path"]) != "" {
|
||||
out = append(out, d.ID)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -609,7 +609,8 @@ func TestNightlyTargetsComeFromCatalogBeforeDriveAttach(t *testing.T) {
|
||||
{ID: "115", Kind: "p115", Name: "115", RootID: "0", TeaserEnabled: true},
|
||||
{ID: "pikpak", Kind: "pikpak", Name: "PikPak", RootID: "0", TeaserEnabled: true},
|
||||
{ID: "91-legacy", Kind: "spider91", Name: "91 Legacy", RootID: "0", TeaserEnabled: true},
|
||||
{ID: "91-crawler", Kind: scriptcrawler.Kind, Name: "91 Spider", RootID: "/", TeaserEnabled: true},
|
||||
{ID: "91-crawler", Kind: scriptcrawler.Kind, Name: "91 Spider", RootID: "/", Credentials: map[string]string{"script_path": "/tmp/crawler.py"}, TeaserEnabled: true},
|
||||
{ID: "91-crawler-deleted", Kind: scriptcrawler.Kind, Name: "Deleted Spider", RootID: "/", Credentials: map[string]string{}, TeaserEnabled: true},
|
||||
} {
|
||||
if err := cat.UpsertDrive(ctx, d); err != nil {
|
||||
t.Fatalf("seed drive %s: %v", d.ID, err)
|
||||
|
||||
+205
-69
@@ -68,9 +68,6 @@ type AdminServer struct {
|
||||
// Spider91 → 115/123/PikPak/OneDrive 上传目标 drive ID 读写
|
||||
GetSpider91UploadDriveID func() string
|
||||
SetSpider91UploadDriveID func(driveID string) error
|
||||
// DefaultSpider91ScriptPath returns the built-in Spider91 crawler script
|
||||
// path for the independent crawler management UI.
|
||||
DefaultSpider91ScriptPath func() string
|
||||
// OnRunNightlyJob 触发一次完整的凌晨流水线(Phase1 扫盘 + Phase2 91 爬虫 +
|
||||
// Phase3 迁移)。立即返回 —— 实际任务在后台跑,admin 在日志或下次状态查询里
|
||||
// 看进度。若流水线正在跑或已排队,Runner 会拒绝重复触发。
|
||||
@@ -163,6 +160,7 @@ func (a *AdminServer) Register(r chi.Router) {
|
||||
r.Post("/crawlers", a.handleUpsertCrawler)
|
||||
r.Post("/crawlers/import-file", a.handleImportCrawlerScriptFile)
|
||||
r.Post("/crawlers/import-url", a.handleImportCrawlerScriptURL)
|
||||
r.Post("/crawlers/test-script", a.handleTestCrawlerScript)
|
||||
r.Delete("/crawlers/{id}", a.handleDeleteCrawler)
|
||||
r.Post("/crawlers/{id}/run", a.handleRunCrawler)
|
||||
r.Post("/crawlers/{id}/tasks/stop", a.handleStopCrawlerTasks)
|
||||
@@ -441,11 +439,6 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) {
|
||||
// LastCrawlAt 是 spider91 上次成功爬取的 unix 秒(来自 credentials.last_crawl_at)。
|
||||
// 其它 kind 留 0;前端用它显示"上次抓取: N 小时前"。
|
||||
Spider91Proxy string `json:"spider91Proxy,omitempty"`
|
||||
ScriptCrawlerPythonPath string `json:"scriptCrawlerPythonPath,omitempty"`
|
||||
ScriptCrawlerScriptPath string `json:"scriptCrawlerScriptPath,omitempty"`
|
||||
ScriptCrawlerProxy string `json:"scriptCrawlerProxy,omitempty"`
|
||||
ScriptCrawlerTargetNew string `json:"scriptCrawlerTargetNew,omitempty"`
|
||||
ScriptCrawlerConfigJSON string `json:"scriptCrawlerConfigJson,omitempty"`
|
||||
LastCrawlAt int64 `json:"lastCrawlAt,omitempty"`
|
||||
GoogleDriveUseOnlineAPI *bool `json:"googleDriveUseOnlineAPI,omitempty"`
|
||||
ScanGenerationStatus GenerationStatus `json:"scanGenerationStatus"`
|
||||
@@ -513,11 +506,6 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) {
|
||||
TeaserEnabled: d.TeaserEnabled,
|
||||
SkipDirIDs: append([]string{}, d.SkipDirIDs...),
|
||||
Spider91Proxy: spider91ProxyForDrive(d),
|
||||
ScriptCrawlerPythonPath: scriptCrawlerCred(d, "python_path"),
|
||||
ScriptCrawlerScriptPath: scriptCrawlerCred(d, "script_path"),
|
||||
ScriptCrawlerProxy: scriptCrawlerCred(d, "proxy"),
|
||||
ScriptCrawlerTargetNew: scriptCrawlerCred(d, "target_new"),
|
||||
ScriptCrawlerConfigJSON: scriptCrawlerCred(d, "config_json"),
|
||||
LastCrawlAt: lastCrawlAt,
|
||||
GoogleDriveUseOnlineAPI: googleDriveUseOnlineAPIForDrive(d),
|
||||
ScanGenerationStatus: generation.Scan,
|
||||
@@ -637,14 +625,11 @@ type crawlerDTO struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Kind string `json:"kind"`
|
||||
Builtin string `json:"builtin,omitempty"`
|
||||
Status string `json:"status"`
|
||||
LastError string `json:"lastError,omitempty"`
|
||||
ScriptPath string `json:"scriptPath"`
|
||||
PythonPath string `json:"pythonPath,omitempty"`
|
||||
Proxy string `json:"proxy,omitempty"`
|
||||
TargetNew string `json:"targetNew,omitempty"`
|
||||
ConfigJSON string `json:"configJson,omitempty"`
|
||||
LastCrawlAt int64 `json:"lastCrawlAt,omitempty"`
|
||||
ScanGenerationStatus GenerationStatus `json:"scanGenerationStatus"`
|
||||
ThumbnailGenerationStatus GenerationStatus `json:"thumbnailGenerationStatus"`
|
||||
@@ -663,13 +648,9 @@ type crawlerDTO struct {
|
||||
|
||||
type upsertCrawlerReq struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Builtin string `json:"builtin"`
|
||||
ScriptPath string `json:"scriptPath"`
|
||||
PythonPath string `json:"pythonPath"`
|
||||
Proxy string `json:"proxy"`
|
||||
TargetNew string `json:"targetNew"`
|
||||
ConfigJSON string `json:"configJson"`
|
||||
}
|
||||
|
||||
func (a *AdminServer) handleListCrawlers(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -700,7 +681,7 @@ func (a *AdminServer) handleListCrawlers(w http.ResponseWriter, r *http.Request)
|
||||
|
||||
out := []crawlerDTO{}
|
||||
for _, d := range all {
|
||||
if d == nil || !isCrawlerDriveKind(d.Kind) {
|
||||
if d == nil || !isConfiguredCrawlerDrive(d) {
|
||||
continue
|
||||
}
|
||||
out = append(out, a.crawlerDTOForDrive(d, teaserCounts[d.ID], thumbnailCounts[d.ID], fingerprintCounts[d.ID], generationStatuses[d.ID]))
|
||||
@@ -729,16 +710,13 @@ func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, teaser catalog.DriveT
|
||||
}
|
||||
return crawlerDTO{
|
||||
ID: d.ID,
|
||||
Name: d.Name,
|
||||
Name: crawlerNameForDrive(d),
|
||||
Kind: d.Kind,
|
||||
Builtin: crawlerBuiltinForDrive(d),
|
||||
Status: d.Status,
|
||||
LastError: d.LastError,
|
||||
ScriptPath: strings.TrimSpace(d.Credentials["script_path"]),
|
||||
PythonPath: strings.TrimSpace(d.Credentials["python_path"]),
|
||||
Proxy: strings.TrimSpace(d.Credentials["proxy"]),
|
||||
TargetNew: strings.TrimSpace(d.Credentials["target_new"]),
|
||||
ConfigJSON: strings.TrimSpace(d.Credentials["config_json"]),
|
||||
LastCrawlAt: lastCrawlAt,
|
||||
ScanGenerationStatus: generation.Scan,
|
||||
ThumbnailGenerationStatus: generation.Thumbnail,
|
||||
@@ -756,11 +734,16 @@ func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, teaser catalog.DriveT
|
||||
}
|
||||
}
|
||||
|
||||
func crawlerBuiltinForDrive(d *catalog.Drive) string {
|
||||
func crawlerNameForDrive(d *catalog.Drive) string {
|
||||
if d == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(d.Credentials["builtin"])
|
||||
if d.Credentials != nil {
|
||||
if meta, err := scriptcrawler.ReadMetadata(strings.TrimSpace(d.Credentials["script_path"])); err == nil {
|
||||
return meta.Name
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(d.Name)
|
||||
}
|
||||
|
||||
func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -770,32 +753,21 @@ func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request
|
||||
return
|
||||
}
|
||||
id := strings.TrimSpace(body.ID)
|
||||
name := strings.TrimSpace(body.Name)
|
||||
if id == "" || name == "" {
|
||||
http.Error(w, "id and name are required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
existing, _ := a.Catalog.GetDrive(r.Context(), id)
|
||||
creds := map[string]string{}
|
||||
var existing *catalog.Drive
|
||||
if id != "" {
|
||||
existing, _ = a.Catalog.GetDrive(r.Context(), id)
|
||||
}
|
||||
if existing != nil {
|
||||
for k, v := range existing.Credentials {
|
||||
creds[k] = v
|
||||
}
|
||||
}
|
||||
builtin := strings.TrimSpace(body.Builtin)
|
||||
if builtin != "" {
|
||||
creds["builtin"] = builtin
|
||||
}
|
||||
scriptPath := strings.TrimSpace(body.ScriptPath)
|
||||
if scriptPath == "" && builtin == "spider91" && a.DefaultSpider91ScriptPath != nil {
|
||||
scriptPath = strings.TrimSpace(a.DefaultSpider91ScriptPath())
|
||||
}
|
||||
incoming := map[string]string{
|
||||
"script_path": scriptPath,
|
||||
"python_path": strings.TrimSpace(body.PythonPath),
|
||||
"proxy": strings.TrimSpace(body.Proxy),
|
||||
"target_new": strings.TrimSpace(body.TargetNew),
|
||||
"config_json": strings.TrimSpace(body.ConfigJSON),
|
||||
}
|
||||
for k, v := range incoming {
|
||||
creds[k] = v
|
||||
@@ -805,8 +777,19 @@ func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if builtin != "" {
|
||||
merged["builtin"] = builtin
|
||||
meta, err := scriptcrawler.ReadMetadata(merged["script_path"])
|
||||
if err != nil {
|
||||
http.Error(w, "脚本元信息无效:"+err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
name := meta.Name
|
||||
if id == "" {
|
||||
generatedID, err := a.generateCrawlerID(r.Context(), name)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err)
|
||||
return
|
||||
}
|
||||
id = generatedID
|
||||
}
|
||||
d := &catalog.Drive{
|
||||
ID: id,
|
||||
@@ -826,11 +809,55 @@ func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request
|
||||
}
|
||||
if a.OnDriveSaved != nil {
|
||||
if err := a.OnDriveSaved(id); err != nil {
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "warning": err.Error()})
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "id": id, "warning": err.Error()})
|
||||
return
|
||||
}
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true})
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "id": id})
|
||||
}
|
||||
|
||||
func (a *AdminServer) generateCrawlerID(ctx context.Context, name string) (string, error) {
|
||||
all, err := a.Catalog.ListDrives(ctx)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
used := map[string]bool{}
|
||||
for _, d := range all {
|
||||
if d == nil {
|
||||
continue
|
||||
}
|
||||
if isCrawlerDriveKind(d.Kind) && strings.TrimSpace(d.Credentials["script_path"]) == "" {
|
||||
continue
|
||||
}
|
||||
used[d.ID] = true
|
||||
}
|
||||
slug := crawlerIDSlug(name)
|
||||
base := "crawler"
|
||||
if slug != "" {
|
||||
base += "-" + slug
|
||||
}
|
||||
candidate := base
|
||||
for suffix := 2; used[candidate]; suffix++ {
|
||||
candidate = fmt.Sprintf("%s-%d", base, suffix)
|
||||
}
|
||||
return candidate, nil
|
||||
}
|
||||
|
||||
// crawlerIDSlug turns an arbitrary crawler name into an ID-safe slug.
//
// The input is lower-cased; ASCII letters and digits are kept, and every run
// of other characters between two kept characters collapses into a single
// '-'. The result never starts or ends with '-'. A name without any ASCII
// letter or digit produces "".
func crawlerIDSlug(raw string) string {
	var slug strings.Builder
	pendingDash := false
	for _, ch := range strings.ToLower(raw) {
		keep := (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
		if !keep {
			// A separator only matters once something has been emitted; it is
			// written lazily so a trailing run never reaches the output.
			pendingDash = slug.Len() > 0
			continue
		}
		if pendingDash {
			slug.WriteByte('-')
			pendingDash = false
		}
		slug.WriteRune(ch)
	}
	return slug.String()
}
|
||||
|
||||
type importCrawlerScriptURLReq struct {
|
||||
@@ -838,6 +865,36 @@ type importCrawlerScriptURLReq struct {
|
||||
FileName string `json:"fileName"`
|
||||
}
|
||||
|
||||
type testCrawlerScriptReq struct {
|
||||
ScriptPath string `json:"scriptPath"`
|
||||
Proxy string `json:"proxy"`
|
||||
}
|
||||
|
||||
// handleTestCrawlerScript 试跑一个爬虫脚本:不入库,抓到第一条视频
|
||||
// (并探测直链可达)即返回,让用户在保存前确认脚本能爬到视频。
|
||||
func (a *AdminServer) handleTestCrawlerScript(w http.ResponseWriter, r *http.Request) {
|
||||
var body testCrawlerScriptReq
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
writeErr(w, http.StatusBadRequest, err)
|
||||
return
|
||||
}
|
||||
scriptPath := strings.TrimSpace(body.ScriptPath)
|
||||
if scriptPath == "" {
|
||||
http.Error(w, "请先导入爬虫脚本", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
proxyURL, err := normalizeCrawlerProxyURL(body.Proxy, "脚本爬虫")
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
result := scriptcrawler.DryRun(r.Context(), scriptcrawler.DryRunConfig{
|
||||
ScriptPath: scriptPath,
|
||||
ProxyURL: proxyURL,
|
||||
})
|
||||
writeJSON(w, http.StatusOK, result)
|
||||
}
|
||||
|
||||
func (a *AdminServer) handleImportCrawlerScriptFile(w http.ResponseWriter, r *http.Request) {
|
||||
r.Body = http.MaxBytesReader(w, r.Body, maxCrawlerScriptBytes+1024*1024)
|
||||
if err := r.ParseMultipartForm(maxCrawlerScriptBytes + 1024*1024); err != nil {
|
||||
@@ -860,7 +917,13 @@ func (a *AdminServer) handleImportCrawlerScriptFile(w http.ResponseWriter, r *ht
|
||||
writeErr(w, http.StatusBadRequest, err)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath})
|
||||
meta, err := scriptcrawler.ReadMetadata(scriptPath)
|
||||
if err != nil {
|
||||
_ = os.Remove(scriptPath)
|
||||
writeErr(w, http.StatusBadRequest, fmt.Errorf("脚本元信息无效: %w", err))
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath, "name": meta.Name})
|
||||
}
|
||||
|
||||
func (a *AdminServer) handleImportCrawlerScriptURL(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -917,7 +980,13 @@ func (a *AdminServer) handleImportCrawlerScriptURL(w http.ResponseWriter, r *htt
|
||||
writeErr(w, http.StatusBadRequest, err)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath})
|
||||
meta, err := scriptcrawler.ReadMetadata(scriptPath)
|
||||
if err != nil {
|
||||
_ = os.Remove(scriptPath)
|
||||
writeErr(w, http.StatusBadRequest, fmt.Errorf("脚本元信息无效: %w", err))
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath, "name": meta.Name})
|
||||
}
|
||||
|
||||
func (a *AdminServer) saveCrawlerScript(ctx context.Context, name string, r io.Reader, maxBytes int64) (string, error) {
|
||||
@@ -935,7 +1004,7 @@ func (a *AdminServer) saveCrawlerScript(ctx context.Context, name string, r io.R
|
||||
if err := os.MkdirAll(root, 0o755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
dst := filepath.Join(root, time.Now().UTC().Format("20060102T150405.000000000Z")+"-"+fileName)
|
||||
dst := filepath.Join(root, fileName)
|
||||
dstAbs, err := filepath.Abs(dst)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -1015,6 +1084,11 @@ func safeCrawlerScriptFileName(raw string) (string, error) {
|
||||
|
||||
func (a *AdminServer) handleRunCrawler(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
d, err := a.Catalog.GetDrive(r.Context(), id)
|
||||
if err != nil || d == nil || !isCrawlerDriveKind(d.Kind) || d.Credentials == nil || strings.TrimSpace(d.Credentials["script_path"]) == "" {
|
||||
http.Error(w, "crawler not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
status := a.nightlyJobStatus()
|
||||
if status.Running || status.Queued {
|
||||
writeJSON(w, http.StatusAccepted, map[string]any{
|
||||
@@ -1041,13 +1115,86 @@ func (a *AdminServer) handleStopCrawlerTasks(w http.ResponseWriter, r *http.Requ
|
||||
}
|
||||
|
||||
func (a *AdminServer) handleDeleteCrawler(w http.ResponseWriter, r *http.Request) {
|
||||
a.handleDeleteDrive(w, r)
|
||||
id := chi.URLParam(r, "id")
|
||||
d, err := a.Catalog.GetDrive(r.Context(), id)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusNotFound, err)
|
||||
return
|
||||
}
|
||||
if !isCrawlerDriveKind(d.Kind) {
|
||||
http.Error(w, "crawler not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
if a.OnStopDriveTasks != nil {
|
||||
a.OnStopDriveTasks(id)
|
||||
}
|
||||
|
||||
deletedScript, scriptErr := a.removeImportedCrawlerScript(d)
|
||||
if d.Credentials == nil {
|
||||
d.Credentials = map[string]string{}
|
||||
}
|
||||
delete(d.Credentials, "script_path")
|
||||
delete(d.Credentials, "proxy")
|
||||
delete(d.Credentials, "target_new")
|
||||
delete(d.Credentials, "builtin")
|
||||
delete(d.Credentials, "python_path")
|
||||
delete(d.Credentials, "config_json")
|
||||
d.Status = "disconnected"
|
||||
d.LastError = ""
|
||||
if err := a.Catalog.UpsertDrive(r.Context(), d); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err)
|
||||
return
|
||||
}
|
||||
resp := map[string]any{
|
||||
"ok": true,
|
||||
"deletedVideos": 0,
|
||||
"deletedScript": deletedScript,
|
||||
}
|
||||
if scriptErr != nil {
|
||||
resp["warning"] = scriptErr.Error()
|
||||
}
|
||||
writeJSON(w, http.StatusOK, resp)
|
||||
}
|
||||
|
||||
func isCrawlerDriveKind(kind string) bool {
|
||||
return kind == scriptcrawler.Kind
|
||||
}
|
||||
|
||||
func isConfiguredCrawlerDrive(d *catalog.Drive) bool {
|
||||
return d != nil &&
|
||||
isCrawlerDriveKind(d.Kind) &&
|
||||
d.Credentials != nil &&
|
||||
strings.TrimSpace(d.Credentials["script_path"]) != ""
|
||||
}
|
||||
|
||||
func (a *AdminServer) removeImportedCrawlerScript(d *catalog.Drive) (bool, error) {
|
||||
if d == nil || d.Credentials == nil {
|
||||
return false, nil
|
||||
}
|
||||
scriptPath := strings.TrimSpace(d.Credentials["script_path"])
|
||||
if scriptPath == "" {
|
||||
return false, nil
|
||||
}
|
||||
scriptAbs, err := filepath.Abs(scriptPath)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
rootAbs, err := a.crawlerScriptImportDir()
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if scriptAbs == rootAbs || !strings.HasPrefix(scriptAbs, rootAbs+string(os.PathSeparator)) {
|
||||
return false, nil
|
||||
}
|
||||
if err := os.Remove(scriptAbs); err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func spider91ProxyForDrive(d *catalog.Drive) string {
|
||||
if d == nil || d.Kind != "spider91" || d.Credentials == nil {
|
||||
return ""
|
||||
@@ -1055,13 +1202,6 @@ func spider91ProxyForDrive(d *catalog.Drive) string {
|
||||
return strings.TrimSpace(d.Credentials["proxy"])
|
||||
}
|
||||
|
||||
func scriptCrawlerCred(d *catalog.Drive, key string) string {
|
||||
if d == nil || d.Kind != scriptcrawler.Kind || d.Credentials == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(d.Credentials[key])
|
||||
}
|
||||
|
||||
func googleDriveUseOnlineAPIForDrive(d *catalog.Drive) *bool {
|
||||
if d == nil || d.Kind != "googledrive" {
|
||||
return nil
|
||||
@@ -1165,23 +1305,16 @@ func mergeScriptCrawlerCredentials(existing *catalog.Drive, incoming map[string]
|
||||
return nil, fmt.Errorf("脚本爬虫 target_new 必须是正整数")
|
||||
}
|
||||
merged[key] = strconv.Itoa(n)
|
||||
case "config_json":
|
||||
case "script_path":
|
||||
if value == "" {
|
||||
delete(merged, key)
|
||||
continue
|
||||
}
|
||||
if !json.Valid([]byte(value)) {
|
||||
return nil, fmt.Errorf("脚本爬虫自定义配置必须是合法 JSON")
|
||||
}
|
||||
merged[key] = value
|
||||
case "python_path", "script_path":
|
||||
if value == "" {
|
||||
if existing == nil || key == "script_path" {
|
||||
if existing == nil {
|
||||
delete(merged, key)
|
||||
}
|
||||
continue
|
||||
}
|
||||
merged[key] = value
|
||||
case "builtin", "python_path", "config_json":
|
||||
delete(merged, key)
|
||||
default:
|
||||
if value == "" {
|
||||
delete(merged, key)
|
||||
@@ -1190,9 +1323,12 @@ func mergeScriptCrawlerCredentials(existing *catalog.Drive, incoming map[string]
|
||||
}
|
||||
}
|
||||
}
|
||||
if strings.TrimSpace(merged["script_path"]) == "" && !strings.EqualFold(strings.TrimSpace(merged["builtin"]), "spider91") {
|
||||
if strings.TrimSpace(merged["script_path"]) == "" {
|
||||
return nil, fmt.Errorf("脚本爬虫必须填写 script_path")
|
||||
}
|
||||
delete(merged, "builtin")
|
||||
delete(merged, "python_path")
|
||||
delete(merged, "config_json")
|
||||
return merged, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -5,10 +5,12 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -19,6 +21,7 @@ import (
|
||||
|
||||
"github.com/video-site/backend/internal/auth"
|
||||
"github.com/video-site/backend/internal/catalog"
|
||||
"github.com/video-site/backend/internal/drives/scriptcrawler"
|
||||
)
|
||||
|
||||
func TestHandleLoginReturnsForbiddenForBannedIP(t *testing.T) {
|
||||
@@ -843,7 +846,8 @@ func TestHandleDeleteDriveRequiresCleanupConfirmation(t *testing.T) {
|
||||
|
||||
func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
|
||||
tmp := t.TempDir()
|
||||
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
@@ -852,6 +856,10 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
|
||||
t.Fatalf("close catalog: %v", err)
|
||||
}
|
||||
})
|
||||
scriptPath := filepath.Join(tmp, "spider_91porn.py")
|
||||
if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"91Porn\"\n"), 0o644); err != nil {
|
||||
t.Fatalf("write crawler script: %v", err)
|
||||
}
|
||||
|
||||
for _, d := range []*catalog.Drive{
|
||||
{
|
||||
@@ -862,7 +870,7 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
|
||||
Credentials: map[string]string{
|
||||
"last_crawl_at": "1800000000",
|
||||
"proxy": " http://127.0.0.1:7890 ",
|
||||
"script_path": "/opt/video-site-91/91VideoSpider/spider_91porn.py",
|
||||
"script_path": scriptPath,
|
||||
},
|
||||
Status: "ok",
|
||||
},
|
||||
@@ -875,7 +883,7 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
|
||||
"builtin": "spider91",
|
||||
"last_crawl_at": "1800000000",
|
||||
"proxy": " http://127.0.0.1:7890 ",
|
||||
"script_path": "/opt/video-site-91/91VideoSpider/spider_91porn.py",
|
||||
"script_path": scriptPath,
|
||||
},
|
||||
Status: "ok",
|
||||
},
|
||||
@@ -889,6 +897,14 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
|
||||
},
|
||||
Status: "ok",
|
||||
},
|
||||
{
|
||||
ID: "crawler-script-deleted",
|
||||
Kind: "scriptcrawler",
|
||||
Name: "Deleted Script",
|
||||
RootID: "/",
|
||||
Credentials: map[string]string{},
|
||||
Status: "disconnected",
|
||||
},
|
||||
} {
|
||||
if err := cat.UpsertDrive(ctx, d); err != nil {
|
||||
t.Fatalf("seed drive %s: %v", d.ID, err)
|
||||
@@ -905,8 +921,8 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
|
||||
|
||||
var got []struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Kind string `json:"kind"`
|
||||
Builtin string `json:"builtin"`
|
||||
Proxy string `json:"proxy"`
|
||||
LastCrawlAt int64 `json:"lastCrawlAt"`
|
||||
}
|
||||
@@ -914,24 +930,30 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
byID := map[string]struct {
|
||||
Name string
|
||||
Kind string
|
||||
Builtin string
|
||||
Proxy string
|
||||
LastCrawlAt int64
|
||||
}{}
|
||||
for _, d := range got {
|
||||
byID[d.ID] = struct {
|
||||
Name string
|
||||
Kind string
|
||||
Builtin string
|
||||
Proxy string
|
||||
LastCrawlAt int64
|
||||
}{Kind: d.Kind, Builtin: d.Builtin, Proxy: d.Proxy, LastCrawlAt: d.LastCrawlAt}
|
||||
}{Name: d.Name, Kind: d.Kind, Proxy: d.Proxy, LastCrawlAt: d.LastCrawlAt}
|
||||
}
|
||||
if _, ok := byID["spider91-main"]; ok {
|
||||
t.Fatal("legacy spider91 drive should not be returned by crawler list")
|
||||
}
|
||||
if byID["crawler-spider91"].Kind != "scriptcrawler" || byID["crawler-spider91"].Builtin != "spider91" {
|
||||
t.Fatalf("crawler kind/builtin = %q/%q, want scriptcrawler/spider91", byID["crawler-spider91"].Kind, byID["crawler-spider91"].Builtin)
|
||||
if _, ok := byID["crawler-script-deleted"]; ok {
|
||||
t.Fatal("crawler without script_path should not be returned by crawler list")
|
||||
}
|
||||
if byID["crawler-spider91"].Kind != "scriptcrawler" {
|
||||
t.Fatalf("crawler kind = %q, want scriptcrawler", byID["crawler-spider91"].Kind)
|
||||
}
|
||||
if byID["crawler-spider91"].Name != "91Porn" {
|
||||
t.Fatalf("crawler name = %q, want script metadata name", byID["crawler-spider91"].Name)
|
||||
}
|
||||
if byID["crawler-spider91"].Proxy != "http://127.0.0.1:7890" {
|
||||
t.Fatalf("crawler proxy = %q, want trimmed proxy", byID["crawler-spider91"].Proxy)
|
||||
@@ -967,9 +989,10 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T) {
|
||||
func TestHandleUpsertCrawlerRequiresScriptPath(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
|
||||
tmp := t.TempDir()
|
||||
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
@@ -979,21 +1002,35 @@ func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T)
|
||||
}
|
||||
})
|
||||
|
||||
srv := &AdminServer{Catalog: cat}
|
||||
scriptPath := filepath.Join(tmp, "custom.py")
|
||||
if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"91 Spider\"\n"), 0o644); err != nil {
|
||||
t.Fatalf("write crawler script: %v", err)
|
||||
}
|
||||
|
||||
// 不再内置任何爬虫:没有脚本路径的保存请求必须被拒绝,
|
||||
// 旧的 builtin 字段也不再有"免脚本"特权。
|
||||
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", strings.NewReader(`{
|
||||
"id": "spider91-main",
|
||||
"name": "91 Spider",
|
||||
"builtin": "spider91",
|
||||
"scriptPath": "",
|
||||
"pythonPath": "python3",
|
||||
"targetNew": "15"
|
||||
}`))
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{
|
||||
Catalog: cat,
|
||||
DefaultSpider91ScriptPath: func() string {
|
||||
return ""
|
||||
},
|
||||
}).handleUpsertCrawler(rr, req)
|
||||
srv.handleUpsertCrawler(rr, req)
|
||||
if rr.Code != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, body = %s, want 400", rr.Code, rr.Body.String())
|
||||
}
|
||||
|
||||
// 带脚本路径时正常保存,且请求中的 builtin 字段被忽略,不会写入凭证。
|
||||
req = httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", strings.NewReader(`{
|
||||
"id": "spider91-main",
|
||||
"builtin": "spider91",
|
||||
"scriptPath": "`+scriptPath+`",
|
||||
"targetNew": "15"
|
||||
}`))
|
||||
rr = httptest.NewRecorder()
|
||||
srv.handleUpsertCrawler(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
@@ -1002,23 +1039,85 @@ func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T)
|
||||
if err != nil {
|
||||
t.Fatalf("get crawler drive: %v", err)
|
||||
}
|
||||
if got.Kind != "scriptcrawler" || got.Credentials["builtin"] != "spider91" {
|
||||
t.Fatalf("kind/builtin = %q/%q, want scriptcrawler/spider91", got.Kind, got.Credentials["builtin"])
|
||||
if got.Kind != "scriptcrawler" || got.Credentials["builtin"] != "" {
|
||||
t.Fatalf("kind/builtin = %q/%q, want scriptcrawler with no builtin credential", got.Kind, got.Credentials["builtin"])
|
||||
}
|
||||
if got.Credentials["script_path"] != "" {
|
||||
t.Fatalf("script_path = %q, want empty when default is unavailable", got.Credentials["script_path"])
|
||||
if got.Credentials["python_path"] != "" || got.Credentials["config_json"] != "" {
|
||||
t.Fatalf("legacy hidden credentials should not be saved: %+v", got.Credentials)
|
||||
}
|
||||
if got.Name != "91 Spider" {
|
||||
t.Fatalf("name = %q, want script metadata name", got.Name)
|
||||
}
|
||||
if got.Credentials["script_path"] != scriptPath {
|
||||
t.Fatalf("script_path = %q, want %q", got.Credentials["script_path"], scriptPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleUpsertCrawlerGeneratesIDFromScriptName(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
tmp := t.TempDir()
|
||||
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
if err := cat.Close(); err != nil {
|
||||
t.Fatalf("close catalog: %v", err)
|
||||
}
|
||||
})
|
||||
if err := cat.UpsertDrive(ctx, &catalog.Drive{
|
||||
ID: "crawler-my-spider",
|
||||
Kind: scriptcrawler.Kind,
|
||||
Name: "Existing",
|
||||
RootID: "/",
|
||||
Credentials: map[string]string{"script_path": "/opt/crawlers/existing.py"},
|
||||
}); err != nil {
|
||||
t.Fatalf("seed crawler: %v", err)
|
||||
}
|
||||
scriptPath := filepath.Join(tmp, "custom.py")
|
||||
if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"My Spider\"\n"), 0o644); err != nil {
|
||||
t.Fatalf("write crawler script: %v", err)
|
||||
}
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", strings.NewReader(`{
|
||||
"scriptPath": "`+scriptPath+`",
|
||||
"targetNew": "15"
|
||||
}`))
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{Catalog: cat}).handleUpsertCrawler(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
|
||||
var resp struct {
|
||||
OK bool `json:"ok"`
|
||||
ID string `json:"id"`
|
||||
}
|
||||
if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if !resp.OK || resp.ID != "crawler-my-spider-2" {
|
||||
t.Fatalf("response = %+v, want generated suffix id", resp)
|
||||
}
|
||||
got, err := cat.GetDrive(ctx, resp.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("get generated crawler: %v", err)
|
||||
}
|
||||
if got.Name != "My Spider" || got.Kind != scriptcrawler.Kind {
|
||||
t.Fatalf("generated crawler = %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleImportCrawlerScriptFile(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
script := "CRAWLER_NAME = \"Demo Crawler\"\nprint('crawler')\n"
|
||||
var body bytes.Buffer
|
||||
mw := multipart.NewWriter(&body)
|
||||
part, err := mw.CreateFormFile("file", "../demo crawler.py")
|
||||
if err != nil {
|
||||
t.Fatalf("create form file: %v", err)
|
||||
}
|
||||
if _, err := part.Write([]byte("print('crawler')\n")); err != nil {
|
||||
if _, err := part.Write([]byte(script)); err != nil {
|
||||
t.Fatalf("write part: %v", err)
|
||||
}
|
||||
if err := mw.Close(); err != nil {
|
||||
@@ -1034,6 +1133,7 @@ func TestHandleImportCrawlerScriptFile(t *testing.T) {
|
||||
}
|
||||
var got struct {
|
||||
ScriptPath string `json:"scriptPath"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
@@ -1045,15 +1145,48 @@ func TestHandleImportCrawlerScriptFile(t *testing.T) {
|
||||
if filepath.Ext(got.ScriptPath) != ".py" {
|
||||
t.Fatalf("script path = %q, want .py", got.ScriptPath)
|
||||
}
|
||||
if filepath.Base(got.ScriptPath) != "demo_crawler.py" {
|
||||
t.Fatalf("script filename = %q, want original sanitized filename", filepath.Base(got.ScriptPath))
|
||||
}
|
||||
data, err := os.ReadFile(got.ScriptPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read imported script: %v", err)
|
||||
}
|
||||
if string(data) != "print('crawler')\n" {
|
||||
if got.Name != "Demo Crawler" {
|
||||
t.Fatalf("name = %q, want script metadata name", got.Name)
|
||||
}
|
||||
if string(data) != script {
|
||||
t.Fatalf("script content = %q", string(data))
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleImportCrawlerScriptFileRejectsMissingName(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
var body bytes.Buffer
|
||||
mw := multipart.NewWriter(&body)
|
||||
part, err := mw.CreateFormFile("file", "crawler.py")
|
||||
if err != nil {
|
||||
t.Fatalf("create form file: %v", err)
|
||||
}
|
||||
if _, err := part.Write([]byte("print('crawler')\n")); err != nil {
|
||||
t.Fatalf("write part: %v", err)
|
||||
}
|
||||
if err := mw.Close(); err != nil {
|
||||
t.Fatalf("close multipart: %v", err)
|
||||
}
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-file", &body)
|
||||
req.Header.Set("Content-Type", mw.FormDataContentType())
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{LocalPreviewDir: filepath.Join(tmp, "previews")}).handleImportCrawlerScriptFile(rr, req)
|
||||
if rr.Code != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, want 400; body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
if !strings.Contains(rr.Body.String(), "CRAWLER_NAME") {
|
||||
t.Fatalf("body = %s, want CRAWLER_NAME error", rr.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleImportCrawlerScriptFileRejectsNonPython(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
var body bytes.Buffer
|
||||
@@ -1088,7 +1221,7 @@ func TestHandleImportCrawlerScriptURL(t *testing.T) {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
_, _ = w.Write([]byte("# crawler from url\n"))
|
||||
_, _ = w.Write([]byte("CRAWLER_NAME = \"URL Crawler\"\n# crawler from url\n"))
|
||||
}))
|
||||
defer upstream.Close()
|
||||
|
||||
@@ -1102,6 +1235,7 @@ func TestHandleImportCrawlerScriptURL(t *testing.T) {
|
||||
}
|
||||
var got struct {
|
||||
ScriptPath string `json:"scriptPath"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
@@ -1114,11 +1248,116 @@ func TestHandleImportCrawlerScriptURL(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("read imported script: %v", err)
|
||||
}
|
||||
if string(data) != "# crawler from url\n" {
|
||||
if got.Name != "URL Crawler" {
|
||||
t.Fatalf("name = %q, want script metadata name", got.Name)
|
||||
}
|
||||
if filepath.Base(got.ScriptPath) != "crawler.py" {
|
||||
t.Fatalf("script filename = %q, want original filename", filepath.Base(got.ScriptPath))
|
||||
}
|
||||
if string(data) != "CRAWLER_NAME = \"URL Crawler\"\n# crawler from url\n" {
|
||||
t.Fatalf("script content = %q", string(data))
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleDeleteCrawlerRemovesImportedScript(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
tmp := t.TempDir()
|
||||
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
if err := cat.Close(); err != nil {
|
||||
t.Fatalf("close catalog: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
scriptDir := filepath.Join(tmp, "crawler-scripts")
|
||||
if err := os.MkdirAll(scriptDir, 0o755); err != nil {
|
||||
t.Fatalf("mkdir script dir: %v", err)
|
||||
}
|
||||
scriptPath := filepath.Join(scriptDir, "crawler.py")
|
||||
if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"Delete Me\"\n"), 0o644); err != nil {
|
||||
t.Fatalf("write script: %v", err)
|
||||
}
|
||||
if err := cat.UpsertDrive(ctx, &catalog.Drive{
|
||||
ID: "crawler-delete-me",
|
||||
Kind: scriptcrawler.Kind,
|
||||
Name: "Delete Me",
|
||||
RootID: "/",
|
||||
Credentials: map[string]string{
|
||||
"script_path": scriptPath,
|
||||
"proxy": "http://127.0.0.1:7890",
|
||||
"target_new": "10",
|
||||
},
|
||||
}); err != nil {
|
||||
t.Fatalf("seed crawler: %v", err)
|
||||
}
|
||||
now := time.Now()
|
||||
if err := cat.UpsertVideo(ctx, &catalog.Video{
|
||||
ID: "video-from-crawler",
|
||||
DriveID: "crawler-delete-me",
|
||||
FileID: "video.mp4",
|
||||
Title: "Keep Me",
|
||||
PublishedAt: now,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}); err != nil {
|
||||
t.Fatalf("seed video: %v", err)
|
||||
}
|
||||
|
||||
req := httptest.NewRequest(http.MethodDelete, "/admin/api/crawlers/crawler-delete-me", nil)
|
||||
rctx := chi.NewRouteContext()
|
||||
rctx.URLParams.Add("id", "crawler-delete-me")
|
||||
req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))
|
||||
rr := httptest.NewRecorder()
|
||||
|
||||
stopped := false
|
||||
(&AdminServer{
|
||||
Catalog: cat,
|
||||
LocalPreviewDir: filepath.Join(tmp, "previews"),
|
||||
OnDriveDeleteCleanup: func(context.Context, string) (int, error) {
|
||||
t.Fatal("crawler delete must not delete imported videos")
|
||||
return 0, nil
|
||||
},
|
||||
OnStopDriveTasks: func(driveID string) bool {
|
||||
stopped = driveID == "crawler-delete-me"
|
||||
return true
|
||||
},
|
||||
}).handleDeleteCrawler(rr, req)
|
||||
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
if _, err := os.Stat(scriptPath); !errors.Is(err, os.ErrNotExist) {
|
||||
t.Fatalf("script stat error = %v, want not exist", err)
|
||||
}
|
||||
if !stopped {
|
||||
t.Fatal("stop hook was not called")
|
||||
}
|
||||
drive, err := cat.GetDrive(ctx, "crawler-delete-me")
|
||||
if err != nil {
|
||||
t.Fatalf("crawler drive should remain for existing videos: %v", err)
|
||||
}
|
||||
if drive.Credentials["script_path"] != "" || drive.Credentials["proxy"] != "" || drive.Credentials["target_new"] != "" {
|
||||
t.Fatalf("crawler credentials were not cleared: %+v", drive.Credentials)
|
||||
}
|
||||
if _, err := cat.GetVideo(ctx, "video-from-crawler"); err != nil {
|
||||
t.Fatalf("imported video should remain: %v", err)
|
||||
}
|
||||
var got struct {
|
||||
OK bool `json:"ok"`
|
||||
DeletedVideos int `json:"deletedVideos"`
|
||||
DeletedScript bool `json:"deletedScript"`
|
||||
}
|
||||
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if !got.OK || got.DeletedVideos != 0 || !got.DeletedScript {
|
||||
t.Fatalf("response = %#v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleImportCrawlerScriptURLRejectsNonPython(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -1143,6 +1382,81 @@ func TestHandleImportCrawlerScriptURLRejectsNonPython(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleTestCrawlerScriptRunsImportedScript(t *testing.T) {
|
||||
if _, err := exec.LookPath("python3"); err != nil {
|
||||
t.Skip("python3 is required for crawler script dry-run")
|
||||
}
|
||||
media := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/video.mp4" {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "video/mp4")
|
||||
if r.Header.Get("Range") == "bytes=0-0" {
|
||||
w.Header().Set("Content-Range", "bytes 0-0/2048")
|
||||
w.WriteHeader(http.StatusPartialContent)
|
||||
_, _ = w.Write([]byte{0})
|
||||
return
|
||||
}
|
||||
_, _ = w.Write([]byte("video"))
|
||||
}))
|
||||
defer media.Close()
|
||||
|
||||
script := filepath.Join(t.TempDir(), "crawler.py")
|
||||
body := `import json
|
||||
print(json.dumps({"title": "Dry Run Video", "source_id": "dry-1", "media_url": "` + media.URL + `/video.mp4", "thumbnail_url": "` + media.URL + `/thumb.jpg", "detail_url": "` + media.URL + `/detail"}))
|
||||
`
|
||||
if err := os.WriteFile(script, []byte(body), 0o755); err != nil {
|
||||
t.Fatalf("write script: %v", err)
|
||||
}
|
||||
|
||||
reqBody, err := json.Marshal(map[string]string{
|
||||
"scriptPath": script,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal request: %v", err)
|
||||
}
|
||||
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/test-script", bytes.NewReader(reqBody))
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{}).handleTestCrawlerScript(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
|
||||
var got struct {
|
||||
OK bool `json:"ok"`
|
||||
Items []struct {
|
||||
Title string `json:"title"`
|
||||
SourceID string `json:"sourceId"`
|
||||
MediaURL string `json:"mediaUrl"`
|
||||
} `json:"items"`
|
||||
MediaCheck *struct {
|
||||
OK bool `json:"ok"`
|
||||
Status int `json:"status"`
|
||||
ContentType string `json:"contentType"`
|
||||
ContentLength int64 `json:"contentLengthBytes"`
|
||||
} `json:"mediaCheck"`
|
||||
}
|
||||
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if !got.OK {
|
||||
t.Fatalf("ok = false, body = %s", rr.Body.String())
|
||||
}
|
||||
if len(got.Items) != 1 || got.Items[0].Title != "Dry Run Video" || got.Items[0].SourceID != "dry-1" {
|
||||
t.Fatalf("items = %#v", got.Items)
|
||||
}
|
||||
if got.Items[0].MediaURL != media.URL+"/video.mp4" {
|
||||
t.Fatalf("mediaUrl = %q", got.Items[0].MediaURL)
|
||||
}
|
||||
if got.MediaCheck == nil || !got.MediaCheck.OK || got.MediaCheck.Status != http.StatusPartialContent {
|
||||
t.Fatalf("mediaCheck = %#v", got.MediaCheck)
|
||||
}
|
||||
if got.MediaCheck.ContentLength != 2048 {
|
||||
t.Fatalf("contentLength = %d, want 2048", got.MediaCheck.ContentLength)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleListDrivesIncludesGoogleDriveOnlineAPIMode(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
|
||||
|
||||
@@ -273,12 +273,16 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
|
||||
}
|
||||
emit(CrawlProgress{})
|
||||
|
||||
if err := os.MkdirAll(c.cfg.Driver.CrawlDir(), 0o755); err != nil {
|
||||
crawlDir, err := filepath.Abs(c.cfg.Driver.CrawlDir())
|
||||
if err != nil {
|
||||
return result, fmt.Errorf("scriptcrawler: resolve crawl dir: %w", err)
|
||||
}
|
||||
if err := os.MkdirAll(crawlDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
runID := time.Now().UTC().Format("20060102T150405Z")
|
||||
seenPath := filepath.Join(c.cfg.Driver.CrawlDir(), "seen-"+runID+".txt")
|
||||
jobPath := filepath.Join(c.cfg.Driver.CrawlDir(), "job-"+runID+".json")
|
||||
seenPath := filepath.Join(crawlDir, "seen-"+runID+".txt")
|
||||
jobPath := filepath.Join(crawlDir, "job-"+runID+".json")
|
||||
result.SeenFile = seenPath
|
||||
result.JobFile = jobPath
|
||||
|
||||
@@ -412,6 +416,10 @@ func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath strin
|
||||
}
|
||||
cfg = json.RawMessage(raw)
|
||||
}
|
||||
outputDir, err := filepath.Abs(c.cfg.Driver.OutputDir())
|
||||
if err != nil {
|
||||
return fmt.Errorf("resolve output dir: %w", err)
|
||||
}
|
||||
job := Job{
|
||||
Protocol: "crawler.v1",
|
||||
Mode: "crawl",
|
||||
@@ -419,7 +427,7 @@ func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath strin
|
||||
CrawlerID: c.cfg.Driver.ID(),
|
||||
TargetNew: targetNew,
|
||||
SeenSourceIDsFile: seenPath,
|
||||
OutputDir: c.cfg.Driver.OutputDir(),
|
||||
OutputDir: outputDir,
|
||||
Config: cfg,
|
||||
Network: JobNetwork{ProxyURL: strings.TrimSpace(c.cfg.ProxyURL)},
|
||||
}
|
||||
|
||||
@@ -135,6 +135,58 @@ func TestCrawlerRunOnceUsesSourceKindNamespace(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawlerRunOncePassesAbsoluteJobPathsWhenWorkDirDiffers(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
tmp := t.TempDir()
|
||||
t.Chdir(tmp)
|
||||
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
if err := cat.Close(); err != nil {
|
||||
t.Fatalf("close catalog: %v", err)
|
||||
}
|
||||
})
|
||||
drv := New(Config{ID: "demo", RootDir: filepath.Join("data", "crawler")})
|
||||
if err := drv.Init(ctx); err != nil {
|
||||
t.Fatalf("driver init: %v", err)
|
||||
}
|
||||
scriptDir := filepath.Join(tmp, "scripts")
|
||||
if err := os.MkdirAll(scriptDir, 0o755); err != nil {
|
||||
t.Fatalf("mkdir script dir: %v", err)
|
||||
}
|
||||
dummyScript := filepath.Join(scriptDir, "helper-script")
|
||||
if err := os.WriteFile(dummyScript, []byte("helper"), 0o755); err != nil {
|
||||
t.Fatalf("write dummy script: %v", err)
|
||||
}
|
||||
wrapper := filepath.Join(tmp, "helper-wrapper.sh")
|
||||
wrapperScript := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0])
|
||||
if err := os.WriteFile(wrapper, []byte(wrapperScript), 0o755); err != nil {
|
||||
t.Fatalf("write helper wrapper: %v", err)
|
||||
}
|
||||
|
||||
t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1")
|
||||
t.Setenv("GO_WANT_SCRIPTCRAWLER_ASSERT_ABS", "1")
|
||||
c := NewCrawler(CrawlerConfig{
|
||||
Driver: drv,
|
||||
Catalog: cat,
|
||||
PythonPath: wrapper,
|
||||
ScriptPath: dummyScript,
|
||||
WorkDir: scriptDir,
|
||||
})
|
||||
res, err := c.RunOnce(ctx, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("run once: %v", err)
|
||||
}
|
||||
if res.NewVideos != 1 || res.Skipped != 0 || res.Failed != 0 {
|
||||
t.Fatalf("result = new:%d skipped:%d failed:%d, want 1/0/0", res.NewVideos, res.Skipped, res.Failed)
|
||||
}
|
||||
if !filepath.IsAbs(res.JobFile) || !filepath.IsAbs(res.SeenFile) {
|
||||
t.Fatalf("result paths should be absolute: job=%q seen=%q", res.JobFile, res.SeenFile)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawlerRunOnceImportsSimpleMediaURLWithoutSourceID(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
tmp := t.TempDir()
|
||||
@@ -241,6 +293,12 @@ func TestScriptCrawlerHelperProcess(t *testing.T) {
|
||||
fmt.Fprintln(os.Stderr, err)
|
||||
os.Exit(2)
|
||||
}
|
||||
if os.Getenv("GO_WANT_SCRIPTCRAWLER_ASSERT_ABS") == "1" {
|
||||
if !filepath.IsAbs(jobPath) || !filepath.IsAbs(job.SeenSourceIDsFile) || !filepath.IsAbs(job.OutputDir) {
|
||||
fmt.Fprintf(os.Stderr, "expected absolute paths, got job=%q seen=%q output=%q\n", jobPath, job.SeenSourceIDsFile, job.OutputDir)
|
||||
os.Exit(2)
|
||||
}
|
||||
}
|
||||
if os.Getenv("GO_WANT_SCRIPTCRAWLER_SIMPLE") == "1" {
|
||||
event := map[string]any{
|
||||
"title": "Simple Protocol Video",
|
||||
|
||||
@@ -0,0 +1,375 @@
|
||||
package scriptcrawler
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Dry-run support: a crawler script is test-run without writing anything to
// the catalog. A job.json is generated in a temporary directory, the script
// process is started, and it is stopped as soon as the first item event (or
// the first MaxItems events) has arrived. The video URL is then probed with a
// small request to confirm that the script can actually reach a video.
// This backs the "test script" button shown in the admin UI after an import.

// Tunables for a dry run.
const (
	// defaultDryRunTimeout is the hard limit for a whole dry run when the
	// caller does not supply one.
	defaultDryRunTimeout = 2 * time.Minute
	// dryRunLogTailLines is how many trailing stderr lines are kept for the
	// result log.
	dryRunLogTailLines = 60
	// dryRunMediaProbeLimit bounds the media URL probe request.
	dryRunMediaProbeLimit = 20 * time.Second
)
|
||||
|
||||
// DryRunConfig holds the inputs of a single DryRun call.
type DryRunConfig struct {
	// PythonPath is the interpreter used to launch the script; DryRun uses
	// "python3" when it is blank.
	PythonPath string
	// ScriptPath is the crawler script to execute. It is required.
	ScriptPath string
	// ProxyURL, when set, is written into the job file, exported to the
	// script through the proxy environment variables, and used for the media
	// probe.
	ProxyURL string
	// ConfigJSON is optional custom configuration passed through to the job
	// file; it must be valid JSON when non-blank.
	ConfigJSON string
	// MaxItems is the number of item events after which the script is
	// stopped. Defaults to 1.
	MaxItems int
	// Timeout is the hard limit for the whole dry run. Defaults to 2 minutes.
	Timeout time.Duration
	// SkipMediaProbe disables the reachability probe of the video URL
	// (intended for injection in unit tests).
	SkipMediaProbe bool
	// HTTPClient, when non-nil, replaces the client built for the media probe.
	HTTPClient *http.Client
}
|
||||
|
||||
// DryRunItem is the trimmed summary of one item event collected during a dry
// run, shaped for the JSON response.
type DryRunItem struct {
	// Title is the item's title.
	Title string `json:"title"`
	// SourceID is the source id as reported by the script, if any.
	SourceID string `json:"sourceId,omitempty"`
	// MediaURL is the video URL, if the script supplied one.
	MediaURL string `json:"mediaUrl,omitempty"`
	// MediaLocalFile is set when the script reported a local file for the
	// media instead of a URL.
	MediaLocalFile string `json:"mediaLocalFile,omitempty"`
	// ThumbnailURL is the thumbnail URL, if any.
	ThumbnailURL string `json:"thumbnailUrl,omitempty"`
	// DetailURL is the item's detail page URL, if any.
	DetailURL string `json:"detailUrl,omitempty"`
}
|
||||
|
||||
// DryRunMediaCheck reports the outcome of probing the first item's video URL.
type DryRunMediaCheck struct {
	// OK is true when the probe received a 200 or 206 response.
	OK bool `json:"ok"`
	// Status is the HTTP status code of the probe response.
	Status int `json:"status,omitempty"`
	// ContentType is the Content-Type header of the probe response.
	ContentType string `json:"contentType,omitempty"`
	// ContentLength is the media size in bytes when the response revealed it.
	ContentLength int64 `json:"contentLengthBytes,omitempty"`
	// Error describes why the probe failed; empty on success.
	Error string `json:"error,omitempty"`
}
|
||||
|
||||
type DryRunResult struct {
|
||||
OK bool `json:"ok"`
|
||||
Items []DryRunItem `json:"items"`
|
||||
MediaCheck *DryRunMediaCheck `json:"mediaCheck,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
Log []string `json:"log,omitempty"`
|
||||
DurationMs int64 `json:"durationMs"`
|
||||
}
|
||||
|
||||
func DryRun(ctx context.Context, cfg DryRunConfig) *DryRunResult {
|
||||
started := time.Now()
|
||||
result := &DryRunResult{Items: []DryRunItem{}}
|
||||
defer func() { result.DurationMs = time.Since(started).Milliseconds() }()
|
||||
|
||||
scriptPath := strings.TrimSpace(cfg.ScriptPath)
|
||||
if scriptPath == "" {
|
||||
result.Error = "脚本路径为空,请先导入脚本"
|
||||
return result
|
||||
}
|
||||
if _, err := os.Stat(scriptPath); err != nil {
|
||||
result.Error = fmt.Sprintf("脚本不存在: %v", err)
|
||||
return result
|
||||
}
|
||||
pythonPath := strings.TrimSpace(cfg.PythonPath)
|
||||
if pythonPath == "" {
|
||||
pythonPath = "python3"
|
||||
}
|
||||
maxItems := cfg.MaxItems
|
||||
if maxItems <= 0 {
|
||||
maxItems = 1
|
||||
}
|
||||
timeout := cfg.Timeout
|
||||
if timeout <= 0 {
|
||||
timeout = defaultDryRunTimeout
|
||||
}
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "crawler-dryrun-")
|
||||
if err != nil {
|
||||
result.Error = fmt.Sprintf("创建临时目录失败: %v", err)
|
||||
return result
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
outputDir := filepath.Join(tmpDir, "output")
|
||||
if err := os.MkdirAll(outputDir, 0o755); err != nil {
|
||||
result.Error = fmt.Sprintf("创建输出目录失败: %v", err)
|
||||
return result
|
||||
}
|
||||
seenPath := filepath.Join(tmpDir, "seen.txt")
|
||||
if err := os.WriteFile(seenPath, nil, 0o644); err != nil {
|
||||
result.Error = fmt.Sprintf("写入 seen 文件失败: %v", err)
|
||||
return result
|
||||
}
|
||||
|
||||
configJSON := json.RawMessage([]byte("{}"))
|
||||
if raw := strings.TrimSpace(cfg.ConfigJSON); raw != "" {
|
||||
if !json.Valid([]byte(raw)) {
|
||||
result.Error = "自定义配置必须是合法 JSON"
|
||||
return result
|
||||
}
|
||||
configJSON = json.RawMessage(raw)
|
||||
}
|
||||
job := Job{
|
||||
Protocol: "crawler.v1",
|
||||
Mode: "crawl",
|
||||
RunID: "dryrun-" + started.UTC().Format("20060102T150405Z"),
|
||||
CrawlerID: "dryrun",
|
||||
TargetNew: maxItems,
|
||||
SeenSourceIDsFile: seenPath,
|
||||
OutputDir: outputDir,
|
||||
Config: configJSON,
|
||||
Network: JobNetwork{ProxyURL: strings.TrimSpace(cfg.ProxyURL)},
|
||||
}
|
||||
jobPath := filepath.Join(tmpDir, "job.json")
|
||||
jobData, err := json.MarshalIndent(job, "", " ")
|
||||
if err != nil {
|
||||
result.Error = fmt.Sprintf("生成 job 文件失败: %v", err)
|
||||
return result
|
||||
}
|
||||
if err := os.WriteFile(jobPath, jobData, 0o600); err != nil {
|
||||
result.Error = fmt.Sprintf("写入 job 文件失败: %v", err)
|
||||
return result
|
||||
}
|
||||
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(runCtx, pythonPath, scriptPath, "--job", jobPath)
|
||||
cmd.Dir = filepath.Dir(scriptPath)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
return killDryRunProcess(cmd)
|
||||
}
|
||||
// 超时或提前 kill 后,脚本派生的子进程可能仍持有 stdout/stderr 管道;
|
||||
// WaitDelay 强制在宽限期后关闭管道,避免读取端永久阻塞。
|
||||
cmd.WaitDelay = 3 * time.Second
|
||||
if proxyURL := strings.TrimSpace(cfg.ProxyURL); proxyURL != "" {
|
||||
cmd.Env = append(os.Environ(),
|
||||
"HTTP_PROXY="+proxyURL,
|
||||
"HTTPS_PROXY="+proxyURL,
|
||||
"http_proxy="+proxyURL,
|
||||
"https_proxy="+proxyURL,
|
||||
"NO_PROXY=",
|
||||
"no_proxy=",
|
||||
)
|
||||
}
|
||||
stdout, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
result.Error = fmt.Sprintf("启动脚本失败: %v", err)
|
||||
return result
|
||||
}
|
||||
stderr, err := cmd.StderrPipe()
|
||||
if err != nil {
|
||||
_ = stdout.Close()
|
||||
result.Error = fmt.Sprintf("启动脚本失败: %v", err)
|
||||
return result
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
_ = stdout.Close()
|
||||
_ = stderr.Close()
|
||||
result.Error = fmt.Sprintf("启动脚本失败: %v", err)
|
||||
return result
|
||||
}
|
||||
|
||||
// stderr 是脚本日志,保留尾部若干行用于排错回显。
|
||||
var logMu sync.Mutex
|
||||
logTail := make([]string, 0, dryRunLogTailLines)
|
||||
stderrDone := make(chan struct{})
|
||||
go func() {
|
||||
defer close(stderrDone)
|
||||
scanner := bufio.NewScanner(stderr)
|
||||
scanner.Buffer(make([]byte, 64*1024), 1024*1024)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
logMu.Lock()
|
||||
if len(logTail) >= dryRunLogTailLines {
|
||||
logTail = logTail[1:]
|
||||
}
|
||||
logTail = append(logTail, line)
|
||||
logMu.Unlock()
|
||||
}
|
||||
}()
|
||||
|
||||
items := []DryRunItem{}
|
||||
var firstMediaHeaders map[string]string
|
||||
parseFailures := 0
|
||||
scanner := bufio.NewScanner(stdout)
|
||||
scanner.Buffer(make([]byte, 64*1024), 4*1024*1024)
|
||||
for scanner.Scan() {
|
||||
if runCtx.Err() != nil {
|
||||
break
|
||||
}
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
var event Event
|
||||
if err := json.Unmarshal([]byte(line), &event); err != nil {
|
||||
parseFailures++
|
||||
continue
|
||||
}
|
||||
eventType := strings.ToLower(strings.TrimSpace(event.Type))
|
||||
item := event.normalizedItem()
|
||||
if eventType == "" && item.hasPayload() {
|
||||
eventType = "item"
|
||||
}
|
||||
if eventType != "item" {
|
||||
continue
|
||||
}
|
||||
normalized, _, err := normalizeItemForImport(item)
|
||||
if err != nil {
|
||||
result.Error = fmt.Sprintf("item 字段不完整: %v", err)
|
||||
continue
|
||||
}
|
||||
mediaURL := strings.TrimSpace(normalized.Media.URL)
|
||||
if len(items) == 0 {
|
||||
firstMediaHeaders = normalized.Media.Headers
|
||||
}
|
||||
items = append(items, DryRunItem{
|
||||
Title: strings.TrimSpace(normalized.Title),
|
||||
SourceID: strings.TrimSpace(item.SourceID),
|
||||
MediaURL: mediaURL,
|
||||
MediaLocalFile: strings.TrimSpace(normalized.Media.LocalFile),
|
||||
ThumbnailURL: strings.TrimSpace(normalized.Thumbnail.URL),
|
||||
DetailURL: strings.TrimSpace(normalized.DetailURL),
|
||||
})
|
||||
if len(items) >= maxItems {
|
||||
break
|
||||
}
|
||||
}
|
||||
// 拿够了就停掉脚本,避免它继续翻页。
|
||||
_ = killDryRunProcess(cmd)
|
||||
_ = cmd.Wait()
|
||||
<-stderrDone
|
||||
|
||||
logMu.Lock()
|
||||
result.Log = append([]string{}, logTail...)
|
||||
logMu.Unlock()
|
||||
result.Items = items
|
||||
|
||||
if len(items) == 0 {
|
||||
if result.Error == "" {
|
||||
switch {
|
||||
case runCtx.Err() != nil && ctx.Err() == nil:
|
||||
result.Error = fmt.Sprintf("测试超时(%s),脚本没有输出任何视频", timeout)
|
||||
case parseFailures > 0:
|
||||
result.Error = "脚本 stdout 不是合法的 crawler.v1 JSON Lines(日志应输出到 stderr)"
|
||||
default:
|
||||
result.Error = "脚本退出但没有输出任何视频"
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
result.Error = ""
|
||||
|
||||
first := items[0]
|
||||
switch {
|
||||
case cfg.SkipMediaProbe:
|
||||
result.OK = true
|
||||
case first.MediaLocalFile != "":
|
||||
// 脚本自己下载到 output_dir 的模式:试跑用的是临时目录,
|
||||
// 文件已随目录清理,能输出合法 local_file 即视为通过。
|
||||
result.OK = true
|
||||
default:
|
||||
check := probeMediaURL(ctx, cfg, first, firstMediaHeaders)
|
||||
result.MediaCheck = check
|
||||
result.OK = check.OK
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// killDryRunProcess sends SIGKILL to the process group led by cmd's process
// (a negative pid addresses the group). A nil command, a command that was
// never started, and a group that no longer exists all count as success. If
// the group signal fails for any other reason, it falls back to killing the
// single process.
func killDryRunProcess(cmd *exec.Cmd) error {
	if cmd == nil {
		return nil
	}
	proc := cmd.Process
	if proc == nil {
		return nil
	}
	switch killErr := syscall.Kill(-proc.Pid, syscall.SIGKILL); killErr {
	case nil, syscall.ESRCH:
		return nil
	default:
		return proc.Kill()
	}
}
|
||||
|
||||
// probeMediaURL 对视频直链发一个 Range: bytes=0-0 的小请求,
|
||||
// 验证直链可达(带上脚本给的防盗链 headers 和代理)。
|
||||
func probeMediaURL(ctx context.Context, cfg DryRunConfig, item DryRunItem, mediaHeaders map[string]string) *DryRunMediaCheck {
|
||||
check := &DryRunMediaCheck{}
|
||||
if item.MediaURL == "" {
|
||||
check.Error = "item 没有视频直链"
|
||||
return check
|
||||
}
|
||||
|
||||
client := cfg.HTTPClient
|
||||
if client == nil {
|
||||
transport := &http.Transport{
|
||||
Proxy: http.ProxyFromEnvironment,
|
||||
ResponseHeaderTimeout: dryRunMediaProbeLimit,
|
||||
}
|
||||
if err := configureExplicitProxy(transport, cfg.ProxyURL); err != nil {
|
||||
check.Error = fmt.Sprintf("代理配置无效: %v", err)
|
||||
return check
|
||||
}
|
||||
client = &http.Client{Transport: transport}
|
||||
}
|
||||
|
||||
probeCtx, cancel := context.WithTimeout(ctx, dryRunMediaProbeLimit)
|
||||
defer cancel()
|
||||
req, err := http.NewRequestWithContext(probeCtx, http.MethodGet, item.MediaURL, nil)
|
||||
if err != nil {
|
||||
check.Error = fmt.Sprintf("视频直链无效: %v", err)
|
||||
return check
|
||||
}
|
||||
req.Header.Set("User-Agent", defaultUserAgent)
|
||||
req.Header.Set("Range", "bytes=0-0")
|
||||
if item.DetailURL != "" {
|
||||
req.Header.Set("Referer", item.DetailURL)
|
||||
}
|
||||
for k, v := range mediaHeaders {
|
||||
k = strings.TrimSpace(k)
|
||||
if k == "" {
|
||||
continue
|
||||
}
|
||||
req.Header.Set(k, v)
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
check.Error = fmt.Sprintf("视频直链请求失败: %v", err)
|
||||
return check
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
check.Status = resp.StatusCode
|
||||
check.ContentType = resp.Header.Get("Content-Type")
|
||||
if cr := resp.Header.Get("Content-Range"); cr != "" {
|
||||
// Content-Range: bytes 0-0/12345 → 取总大小
|
||||
if idx := strings.LastIndex(cr, "/"); idx >= 0 {
|
||||
var total int64
|
||||
if _, err := fmt.Sscanf(cr[idx+1:], "%d", &total); err == nil {
|
||||
check.ContentLength = total
|
||||
}
|
||||
}
|
||||
}
|
||||
if check.ContentLength == 0 && resp.StatusCode == http.StatusOK {
|
||||
check.ContentLength = resp.ContentLength
|
||||
}
|
||||
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusPartialContent {
|
||||
check.Error = fmt.Sprintf("视频直链返回 HTTP %d", resp.StatusCode)
|
||||
return check
|
||||
}
|
||||
check.OK = true
|
||||
return check
|
||||
}
|
||||
@@ -0,0 +1,153 @@
|
||||
package scriptcrawler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// writeDryRunScript creates an executable shell script named crawler.sh in a
// fresh per-test temp directory. The given body is placed right after a
// "#!/bin/sh" line. It returns the script's path and fails the test if the
// file cannot be written.
func writeDryRunScript(t *testing.T, body string) string {
	t.Helper()
	scriptFile := filepath.Join(t.TempDir(), "crawler.sh")
	contents := []byte("#!/bin/sh\n" + body)
	err := os.WriteFile(scriptFile, contents, 0o755)
	if err != nil {
		t.Fatalf("write script: %v", err)
	}
	return scriptFile
}
|
||||
|
||||
func TestDryRunCollectsFirstItem(t *testing.T) {
|
||||
script := writeDryRunScript(t, `
|
||||
echo '[log] fetching list page' >&2
|
||||
echo '{"type":"item","item":{"title":"Test Video","media_url":"https://cdn.example.test/v.mp4","source_id":"123","thumbnail_url":"https://cdn.example.test/t.jpg"}}'
|
||||
echo '{"type":"done","stats":{"emitted":1}}'
|
||||
`)
|
||||
result := DryRun(context.Background(), DryRunConfig{
|
||||
PythonPath: "/bin/sh",
|
||||
ScriptPath: script,
|
||||
SkipMediaProbe: true,
|
||||
})
|
||||
if !result.OK {
|
||||
t.Fatalf("ok = false, error = %q, log = %v", result.Error, result.Log)
|
||||
}
|
||||
if len(result.Items) != 1 {
|
||||
t.Fatalf("items = %d, want 1", len(result.Items))
|
||||
}
|
||||
item := result.Items[0]
|
||||
if item.Title != "Test Video" || item.MediaURL != "https://cdn.example.test/v.mp4" || item.SourceID != "123" {
|
||||
t.Fatalf("item = %+v", item)
|
||||
}
|
||||
if len(result.Log) == 0 || !strings.Contains(result.Log[0], "fetching list page") {
|
||||
t.Fatalf("log tail = %v, want stderr captured", result.Log)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDryRunProbesMediaURL(t *testing.T) {
|
||||
var gotRange, gotReferer string
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
gotRange = r.Header.Get("Range")
|
||||
gotReferer = r.Header.Get("Referer")
|
||||
w.Header().Set("Content-Type", "video/mp4")
|
||||
w.Header().Set("Content-Range", "bytes 0-0/4096")
|
||||
w.WriteHeader(http.StatusPartialContent)
|
||||
_, _ = w.Write([]byte("x"))
|
||||
}))
|
||||
t.Cleanup(srv.Close)
|
||||
|
||||
script := writeDryRunScript(t, fmt.Sprintf(
|
||||
`echo '{"type":"item","title":"Probe Video","media_url":"%s/v.mp4","detail_url":"https://example.test/view"}'`,
|
||||
srv.URL,
|
||||
))
|
||||
result := DryRun(context.Background(), DryRunConfig{
|
||||
PythonPath: "/bin/sh",
|
||||
ScriptPath: script,
|
||||
})
|
||||
if !result.OK {
|
||||
t.Fatalf("ok = false, error = %q, mediaCheck = %+v", result.Error, result.MediaCheck)
|
||||
}
|
||||
if result.MediaCheck == nil || !result.MediaCheck.OK {
|
||||
t.Fatalf("mediaCheck = %+v, want ok", result.MediaCheck)
|
||||
}
|
||||
if result.MediaCheck.Status != http.StatusPartialContent || result.MediaCheck.ContentLength != 4096 {
|
||||
t.Fatalf("mediaCheck = %+v, want 206 with total 4096", result.MediaCheck)
|
||||
}
|
||||
if gotRange != "bytes=0-0" || gotReferer != "https://example.test/view" {
|
||||
t.Fatalf("probe headers range=%q referer=%q", gotRange, gotReferer)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDryRunReportsBrokenMediaURL(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "forbidden", http.StatusForbidden)
|
||||
}))
|
||||
t.Cleanup(srv.Close)
|
||||
|
||||
script := writeDryRunScript(t, fmt.Sprintf(
|
||||
`echo '{"type":"item","title":"Dead Link","media_url":"%s/v.mp4"}'`,
|
||||
srv.URL,
|
||||
))
|
||||
result := DryRun(context.Background(), DryRunConfig{
|
||||
PythonPath: "/bin/sh",
|
||||
ScriptPath: script,
|
||||
})
|
||||
if result.OK {
|
||||
t.Fatal("ok = true, want false for HTTP 403 media url")
|
||||
}
|
||||
if result.MediaCheck == nil || result.MediaCheck.OK || result.MediaCheck.Status != http.StatusForbidden {
|
||||
t.Fatalf("mediaCheck = %+v, want failed 403", result.MediaCheck)
|
||||
}
|
||||
if len(result.Items) != 1 {
|
||||
t.Fatalf("items = %d, want item still returned for debugging", len(result.Items))
|
||||
}
|
||||
}
|
||||
|
||||
func TestDryRunRejectsNonJSONStdout(t *testing.T) {
|
||||
script := writeDryRunScript(t, `echo 'plain text progress output'`)
|
||||
result := DryRun(context.Background(), DryRunConfig{
|
||||
PythonPath: "/bin/sh",
|
||||
ScriptPath: script,
|
||||
SkipMediaProbe: true,
|
||||
})
|
||||
if result.OK {
|
||||
t.Fatal("ok = true, want false for non-JSON stdout")
|
||||
}
|
||||
if !strings.Contains(result.Error, "JSON Lines") {
|
||||
t.Fatalf("error = %q, want JSON Lines hint", result.Error)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDryRunTimesOut(t *testing.T) {
|
||||
script := writeDryRunScript(t, `sleep 30`)
|
||||
start := time.Now()
|
||||
result := DryRun(context.Background(), DryRunConfig{
|
||||
PythonPath: "/bin/sh",
|
||||
ScriptPath: script,
|
||||
Timeout: 2 * time.Second,
|
||||
SkipMediaProbe: true,
|
||||
})
|
||||
if result.OK {
|
||||
t.Fatal("ok = true, want false on timeout")
|
||||
}
|
||||
if !strings.Contains(result.Error, "超时") {
|
||||
t.Fatalf("error = %q, want timeout message", result.Error)
|
||||
}
|
||||
if elapsed := time.Since(start); elapsed > 10*time.Second {
|
||||
t.Fatalf("dry run took %s, script was not killed", elapsed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDryRunMissingScript(t *testing.T) {
|
||||
result := DryRun(context.Background(), DryRunConfig{
|
||||
PythonPath: "/bin/sh",
|
||||
ScriptPath: filepath.Join(t.TempDir(), "missing.py"),
|
||||
})
|
||||
if result.OK || result.Error == "" {
|
||||
t.Fatalf("result = %+v, want error for missing script", result)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,117 @@
|
||||
package scriptcrawler
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// maxCrawlerNameRunes is the longest crawler name, counted in runes rather
// than bytes, that ExtractMetadata accepts.
const maxCrawlerNameRunes = 80

// Metadata is the static information read from a crawler script's source.
type Metadata struct {
	// Name is the value of the script's CRAWLER_NAME assignment with
	// surrounding whitespace removed.
	Name string `json:"name"`
}
|
||||
|
||||
func ReadMetadata(scriptPath string) (Metadata, error) {
|
||||
scriptPath = strings.TrimSpace(scriptPath)
|
||||
if scriptPath == "" {
|
||||
return Metadata{}, errors.New("脚本路径为空")
|
||||
}
|
||||
if filepath.Ext(scriptPath) != ".py" {
|
||||
return Metadata{}, errors.New("目前只支持 .py 爬虫脚本")
|
||||
}
|
||||
data, err := os.ReadFile(scriptPath)
|
||||
if err != nil {
|
||||
return Metadata{}, fmt.Errorf("读取脚本失败: %w", err)
|
||||
}
|
||||
return ExtractMetadata(string(data))
|
||||
}
|
||||
|
||||
func ExtractMetadata(source string) (Metadata, error) {
|
||||
for _, line := range strings.Split(source, "\n") {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if trimmed == "" || strings.HasPrefix(trimmed, "#") {
|
||||
continue
|
||||
}
|
||||
if !strings.HasPrefix(trimmed, "CRAWLER_NAME") {
|
||||
continue
|
||||
}
|
||||
left, right, ok := strings.Cut(trimmed, "=")
|
||||
if !ok || strings.TrimSpace(left) != "CRAWLER_NAME" {
|
||||
continue
|
||||
}
|
||||
name, ok := parsePythonStringLiteral(right)
|
||||
if !ok {
|
||||
return Metadata{}, errors.New(`CRAWLER_NAME 必须是字符串字面量,例如 CRAWLER_NAME = "示例爬虫"`)
|
||||
}
|
||||
name = strings.TrimSpace(name)
|
||||
if name == "" {
|
||||
return Metadata{}, errors.New("CRAWLER_NAME 不能为空")
|
||||
}
|
||||
if len([]rune(name)) > maxCrawlerNameRunes {
|
||||
return Metadata{}, fmt.Errorf("CRAWLER_NAME 不能超过 %d 个字符", maxCrawlerNameRunes)
|
||||
}
|
||||
return Metadata{Name: name}, nil
|
||||
}
|
||||
return Metadata{}, errors.New(`脚本必须声明 CRAWLER_NAME,例如 CRAWLER_NAME = "示例爬虫"`)
|
||||
}
|
||||
|
||||
// parsePythonStringLiteral decodes a single-line Python string literal found
// at the start of raw (surrounding whitespace is ignored) and reports whether
// raw was such a literal.
//
// Supported: the r/u/b prefixes in either case, single- and triple-quoted
// forms with ' or ", and the escape sequences \n \r \t \a \b \f \v \\ \' \",
// octal \ooo, \xHH, \uXXXX and \UXXXXXXXX. Unrecognised escapes keep their
// backslash, as Python does. In raw strings a backslash and the character
// after it are copied verbatim.
//
// The literal may be followed only by whitespace, a '#' comment, or a ';'
// statement separator. Anything else (for example `"a" + suffix`) means the
// value is an expression rather than a literal, and false is returned.
// Unterminated literals, malformed hex escapes and \N{...} (which needs the
// Unicode name database) are also rejected.
func parsePythonStringLiteral(raw string) (string, bool) {
	s := strings.TrimSpace(raw)

	// Consume the string prefix. Python does not allow whitespace between
	// the prefix and the opening quote, so none is skipped here.
	rawString := false
	prefixLen := 0
	for prefixLen < len(s) {
		c := s[prefixLen]
		if c == 'r' || c == 'R' {
			rawString = true
		} else if c != 'u' && c != 'U' && c != 'b' && c != 'B' {
			break
		}
		prefixLen++
	}
	s = s[prefixLen:]
	if len(s) < 2 || (s[0] != '"' && s[0] != '\'') {
		return "", false
	}

	// The closing delimiter is three quotes for a triple-quoted literal and
	// a single quote character otherwise.
	delim := s[:1]
	if triple := strings.Repeat(delim, 3); strings.HasPrefix(s, triple) {
		delim = triple
	}
	body := s[len(delim):]

	var b strings.Builder
	for i := 0; i < len(body); {
		if strings.HasPrefix(body[i:], delim) {
			if !isPythonLiteralTail(body[i+len(delim):]) {
				return "", false
			}
			return b.String(), true
		}
		if body[i] != '\\' {
			b.WriteByte(body[i])
			i++
			continue
		}
		if i+1 >= len(body) {
			// Dangling backslash: the literal continues on another line.
			return "", false
		}
		if rawString {
			b.WriteByte('\\')
			b.WriteByte(body[i+1])
			i += 2
			continue
		}
		consumed, ok := writePythonEscape(&b, body[i+1:])
		if !ok {
			return "", false
		}
		i += 1 + consumed
	}
	return "", false
}

// isPythonLiteralTail reports whether the text following a closing quote is
// empty, a comment, or a statement separator.
func isPythonLiteralTail(tail string) bool {
	tail = strings.TrimSpace(tail)
	return tail == "" || tail[0] == '#' || tail[0] == ';'
}

// writePythonEscape decodes the escape sequence whose body (the text after
// the backslash) starts at rest[0], appends the result to b, and returns how
// many bytes of rest it consumed. rest must not be empty.
func writePythonEscape(b *strings.Builder, rest string) (int, bool) {
	switch c := rest[0]; c {
	case 'n':
		b.WriteByte('\n')
	case 'r':
		b.WriteByte('\r')
	case 't':
		b.WriteByte('\t')
	case 'a':
		b.WriteByte('\a')
	case 'b':
		b.WriteByte('\b')
	case 'f':
		b.WriteByte('\f')
	case 'v':
		b.WriteByte('\v')
	case '\\', '\'', '"':
		b.WriteByte(c)
	case 'x':
		return writePythonHexEscape(b, rest, 2)
	case 'u':
		return writePythonHexEscape(b, rest, 4)
	case 'U':
		return writePythonHexEscape(b, rest, 8)
	case 'N':
		// \N{NAME} cannot be resolved without the Unicode name table.
		return 0, false
	case '0', '1', '2', '3', '4', '5', '6', '7':
		var value rune
		n := 0
		for n < 3 && n < len(rest) && rest[n] >= '0' && rest[n] <= '7' {
			value = value*8 + rune(rest[n]-'0')
			n++
		}
		b.WriteRune(value)
		return n, true
	default:
		// Python keeps the backslash of an unrecognised escape.
		b.WriteByte('\\')
		b.WriteByte(c)
	}
	return 1, true
}

// writePythonHexEscape decodes exactly digits hexadecimal characters that
// follow the escape letter at rest[0] and appends the resulting code point.
func writePythonHexEscape(b *strings.Builder, rest string, digits int) (int, bool) {
	if len(rest) < 1+digits {
		return 0, false
	}
	var value rune
	for _, c := range []byte(rest[1 : 1+digits]) {
		var d byte
		switch {
		case c >= '0' && c <= '9':
			d = c - '0'
		case c >= 'a' && c <= 'f':
			d = c - 'a' + 10
		case c >= 'A' && c <= 'F':
			d = c - 'A' + 10
		default:
			return 0, false
		}
		value = value<<4 | rune(d)
	}
	// WriteRune substitutes U+FFFD for values that are not valid code points.
	b.WriteRune(value)
	return 1 + digits, true
}
|
||||
@@ -0,0 +1,39 @@
|
||||
package scriptcrawler
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestExtractMetadataReadsCrawlerName(t *testing.T) {
|
||||
meta, err := ExtractMetadata(`
|
||||
# comment
|
||||
CRAWLER_NAME = "示例爬虫"
|
||||
`)
|
||||
if err != nil {
|
||||
t.Fatalf("extract metadata: %v", err)
|
||||
}
|
||||
if meta.Name != "示例爬虫" {
|
||||
t.Fatalf("name = %q", meta.Name)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractMetadataRejectsMissingCrawlerName(t *testing.T) {
|
||||
_, err := ExtractMetadata(`print("hello")`)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "CRAWLER_NAME") {
|
||||
t.Fatalf("error = %v, want CRAWLER_NAME guidance", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractMetadataRejectsEmptyCrawlerName(t *testing.T) {
|
||||
_, err := ExtractMetadata(`CRAWLER_NAME = " "`)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "不能为空") {
|
||||
t.Fatalf("error = %v, want empty-name error", err)
|
||||
}
|
||||
}
|
||||
@@ -134,9 +134,9 @@ apt_install() {
|
||||
python3 python3-requests python3-bs4 python3-lxml python3-socks
|
||||
}
|
||||
|
||||
verify_spider91_python_deps() {
|
||||
command -v python3 >/dev/null 2>&1 || die "python3 is required for 91Spider"
|
||||
python3 - <<'PY' || die "missing Python modules for 91Spider: requests, bs4, lxml, socks"
|
||||
verify_crawler_python_deps() {
|
||||
command -v python3 >/dev/null 2>&1 || die "python3 is required for crawler scripts"
|
||||
python3 - <<'PY' || die "missing Python modules for crawler scripts: requests, bs4, lxml, socks"
|
||||
import importlib.util
|
||||
import sys
|
||||
|
||||
@@ -200,7 +200,7 @@ install_dependencies() {
|
||||
install_go
|
||||
command -v ffmpeg >/dev/null 2>&1 || die "ffmpeg is required"
|
||||
command -v ffprobe >/dev/null 2>&1 || die "ffprobe is required"
|
||||
verify_spider91_python_deps
|
||||
verify_crawler_python_deps
|
||||
}
|
||||
|
||||
ensure_ownership() {
|
||||
|
||||
@@ -6,6 +6,14 @@ task status and cancellation.
|
||||
|
||||
## Invocation
|
||||
|
||||
Every script must declare a static crawler name near the top of the Python file.
|
||||
The admin page reads this value when importing the script; users do not type the
|
||||
crawler name manually.
|
||||
|
||||
```python
|
||||
CRAWLER_NAME = "Example Crawler"
|
||||
```
|
||||
|
||||
The backend runs:
|
||||
|
||||
```bash
|
||||
@@ -35,12 +43,12 @@ python3 /path/to/crawler.py --job /path/to/job.json
|
||||
## Importing Scripts
|
||||
|
||||
Crawler scripts are configured from the admin crawler page. A script can be
|
||||
entered as an existing server path, uploaded as a local file, or imported from
|
||||
an HTTP(S) URL.
|
||||
uploaded as a local file or imported from an HTTP(S) URL.
|
||||
|
||||
Imported scripts are copied into `crawler-scripts/` next to the configured local
|
||||
preview data directory. The import API currently accepts Python files only
|
||||
(`.py`) and rejects empty files or files larger than 2 MiB.
|
||||
(`.py`) and rejects empty files, files larger than 2 MiB, or scripts without
|
||||
`CRAWLER_NAME`.
|
||||
|
||||
## Output
|
||||
|
||||
|
||||
@@ -3,6 +3,8 @@ import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
CRAWLER_NAME = "Demo Crawler"
|
||||
|
||||
|
||||
def load_seen(path):
|
||||
try:
|
||||
|
||||
+1
-1
@@ -128,7 +128,7 @@ verify_runtime_deps() {
|
||||
command -v "$cmd" >/dev/null 2>&1 || die "missing command: $cmd"
|
||||
done
|
||||
|
||||
python3 - <<'PY' || die "missing Python modules for 91Spider: requests, bs4, lxml, socks"
|
||||
python3 - <<'PY' || die "missing Python modules for crawler scripts: requests, bs4, lxml, socks"
|
||||
import importlib.util
|
||||
import sys
|
||||
|
||||
|
||||
@@ -63,8 +63,6 @@ build_package() {
|
||||
cp "$ROOT_DIR/backend/config.example.yaml" "$work/config.example.yaml"
|
||||
cp "$ROOT_DIR/install.sh" "$work/install.sh"
|
||||
cp -R "$ROOT_DIR/dist" "$work/dist"
|
||||
mkdir -p "$work/91VideoSpider"
|
||||
cp "$ROOT_DIR/91VideoSpider/spider_91porn.py" "$work/91VideoSpider/spider_91porn.py"
|
||||
|
||||
cat >"$work/README.txt" <<EOF
|
||||
$APP_NAME $VERSION
|
||||
|
||||
+345
-96
@@ -1,30 +1,41 @@
|
||||
import { useEffect, useMemo, useState } from "react";
|
||||
import { ArrowLeft, CircleStop, Download, Link as LinkIcon, Plus, Save, Trash2, Upload } from "lucide-react";
|
||||
import { useEffect, useMemo, useState, type ReactNode } from "react";
|
||||
import {
|
||||
Activity,
|
||||
ArrowLeft,
|
||||
ChevronRight,
|
||||
CircleStop,
|
||||
Clock,
|
||||
Download,
|
||||
FileCode2,
|
||||
Gauge,
|
||||
Link as LinkIcon,
|
||||
Plus,
|
||||
RefreshCw,
|
||||
Save,
|
||||
Settings2,
|
||||
TestTube,
|
||||
Trash2,
|
||||
Upload,
|
||||
} from "lucide-react";
|
||||
import * as api from "./api";
|
||||
import { useToast } from "./ToastContext";
|
||||
import { driveKindAbbr, generationStateClass, generationStateLabel } from "./drive/constants";
|
||||
import { generationStateClass, generationStateLabel } from "./drive/constants";
|
||||
import { SpiderIcon } from "./icons/SpiderIcon";
|
||||
|
||||
type CrawlerForm = {
|
||||
id: string;
|
||||
name: string;
|
||||
builtin: string;
|
||||
scriptPath: string;
|
||||
pythonPath: string;
|
||||
targetNew: string;
|
||||
proxy: string;
|
||||
configJson: string;
|
||||
};
|
||||
|
||||
const emptyForm: CrawlerForm = {
|
||||
id: "",
|
||||
name: "",
|
||||
builtin: "",
|
||||
scriptPath: "",
|
||||
pythonPath: "python3",
|
||||
targetNew: "10",
|
||||
proxy: "",
|
||||
configJson: "",
|
||||
};
|
||||
|
||||
export function CrawlersPage() {
|
||||
@@ -37,6 +48,8 @@ export function CrawlersPage() {
|
||||
const [stoppingId, setStoppingId] = useState("");
|
||||
const [scriptURL, setScriptURL] = useState("");
|
||||
const [importingScript, setImportingScript] = useState(false);
|
||||
const [testingScript, setTestingScript] = useState(false);
|
||||
const [testResult, setTestResult] = useState<api.CrawlerDryRunResult | null>(null);
|
||||
const [mode, setMode] = useState<"list" | "detail">("list");
|
||||
const { show } = useToast();
|
||||
|
||||
@@ -44,6 +57,15 @@ export function CrawlersPage() {
|
||||
() => list.find((item) => item.id === selectedId) ?? null,
|
||||
[list, selectedId]
|
||||
);
|
||||
const stats = useMemo(() => {
|
||||
const running = list.filter((item) => item.scanGenerationStatus?.state === "scanning").length;
|
||||
return {
|
||||
total: list.length,
|
||||
ready: list.filter((item) => item.status === "ok").length,
|
||||
running,
|
||||
error: list.filter((item) => item.status === "error").length,
|
||||
};
|
||||
}, [list]);
|
||||
|
||||
async function refresh() {
|
||||
setLoading(true);
|
||||
@@ -64,15 +86,13 @@ export function CrawlersPage() {
|
||||
function selectCrawler(crawler: api.AdminCrawler) {
|
||||
setSelectedId(crawler.id);
|
||||
setMode("detail");
|
||||
setTestResult(null);
|
||||
setForm({
|
||||
id: crawler.id,
|
||||
name: crawler.name,
|
||||
builtin: crawler.builtin ?? "",
|
||||
scriptPath: crawler.scriptPath ?? "",
|
||||
pythonPath: crawler.pythonPath || "python3",
|
||||
targetNew: crawler.targetNew || (crawler.builtin === "spider91" || crawler.kind === "spider91" ? "15" : "10"),
|
||||
targetNew: crawler.targetNew || "10",
|
||||
proxy: crawler.proxy ?? "",
|
||||
configJson: crawler.configJson ?? "",
|
||||
});
|
||||
}
|
||||
|
||||
@@ -80,20 +100,7 @@ export function CrawlersPage() {
|
||||
setSelectedId("");
|
||||
setForm(emptyForm);
|
||||
setScriptURL("");
|
||||
setMode("detail");
|
||||
}
|
||||
|
||||
function createSpider91() {
|
||||
setSelectedId("");
|
||||
setForm({
|
||||
...emptyForm,
|
||||
id: "spider91",
|
||||
name: "91 爬虫",
|
||||
builtin: "spider91",
|
||||
scriptPath: "",
|
||||
targetNew: "15",
|
||||
});
|
||||
setScriptURL("");
|
||||
setTestResult(null);
|
||||
setMode("detail");
|
||||
}
|
||||
|
||||
@@ -101,6 +108,7 @@ export function CrawlersPage() {
|
||||
setSelectedId("");
|
||||
setForm(emptyForm);
|
||||
setScriptURL("");
|
||||
setTestResult(null);
|
||||
setMode("list");
|
||||
}
|
||||
|
||||
@@ -110,33 +118,24 @@ export function CrawlersPage() {
|
||||
|
||||
async function save() {
|
||||
const id = form.id.trim();
|
||||
const name = form.name.trim();
|
||||
if (!id || !name) {
|
||||
show("请填写爬虫 ID 和名称", "error");
|
||||
return;
|
||||
}
|
||||
if (!form.builtin && !form.scriptPath.trim()) {
|
||||
if (!form.scriptPath.trim()) {
|
||||
show("请先导入爬虫脚本", "error");
|
||||
return;
|
||||
}
|
||||
setSaving(true);
|
||||
try {
|
||||
const resp = await api.upsertCrawler({
|
||||
id,
|
||||
name,
|
||||
builtin: form.builtin,
|
||||
id: id || undefined,
|
||||
scriptPath: form.scriptPath.trim(),
|
||||
pythonPath: form.pythonPath.trim(),
|
||||
targetNew: form.targetNew.trim(),
|
||||
proxy: form.proxy.trim(),
|
||||
configJson: form.configJson.trim(),
|
||||
});
|
||||
if (resp.warning) {
|
||||
show(`已保存,但初始化失败:${resp.warning}`, "error");
|
||||
} else {
|
||||
show("已保存", "success");
|
||||
}
|
||||
setSelectedId(id);
|
||||
setSelectedId(resp.id || id);
|
||||
await refresh();
|
||||
setMode("list");
|
||||
} catch (e) {
|
||||
@@ -152,6 +151,8 @@ export function CrawlersPage() {
|
||||
try {
|
||||
const resp = await api.importCrawlerScriptFile(file);
|
||||
set("scriptPath", resp.scriptPath);
|
||||
set("name", resp.name);
|
||||
setTestResult(null);
|
||||
show("脚本已导入", "success");
|
||||
} catch (e) {
|
||||
show(e instanceof Error ? e.message : "导入失败", "error");
|
||||
@@ -170,7 +171,9 @@ export function CrawlersPage() {
|
||||
try {
|
||||
const resp = await api.importCrawlerScriptURL(url);
|
||||
set("scriptPath", resp.scriptPath);
|
||||
set("name", resp.name);
|
||||
setScriptURL("");
|
||||
setTestResult(null);
|
||||
show("脚本已导入", "success");
|
||||
} catch (e) {
|
||||
show(e instanceof Error ? e.message : "导入失败", "error");
|
||||
@@ -179,6 +182,32 @@ export function CrawlersPage() {
|
||||
}
|
||||
}
|
||||
|
||||
// Dry-runs the imported script with the current proxy value. The result is
// stored for the inline result panel and summarised in a toast; the
// "testing" flag is cleared whether the request succeeds or fails.
async function testScript() {
  const scriptPath = form.scriptPath.trim();
  if (scriptPath === "") {
    show("请先导入爬虫脚本", "error");
    return;
  }

  setTestingScript(true);
  setTestResult(null);
  try {
    const outcome = await api.testCrawlerScript({ scriptPath, proxy: form.proxy.trim() });
    setTestResult(outcome);
    if (!outcome.ok) {
      show(crawlerTestFailure(outcome) || "测试失败", "error");
      return;
    }
    show("测试通过", "success");
  } catch (e) {
    const message = e instanceof Error ? e.message : "测试失败";
    show(message, "error");
  } finally {
    setTestingScript(false);
  }
}
|
||||
|
||||
async function run(crawler: api.AdminCrawler) {
|
||||
setRunningId(crawler.id);
|
||||
try {
|
||||
@@ -210,10 +239,16 @@ export function CrawlersPage() {
|
||||
}
|
||||
|
||||
async function remove(crawler: api.AdminCrawler) {
|
||||
if (!window.confirm(`删除爬虫 ${crawler.name} 并清理它导入的视频?`)) return;
|
||||
if (!window.confirm(`删除爬虫 ${crawler.name} 的脚本和配置?已爬取的视频会保留。`)) return;
|
||||
try {
|
||||
const resp = await api.deleteCrawler(crawler.id);
|
||||
show(`已删除,并清理 ${resp.deletedVideos ?? 0} 个视频`, "success");
|
||||
if (resp.warning) {
|
||||
show(`已删除爬虫配置,但脚本文件清理失败:${resp.warning}`, "error");
|
||||
} else if (resp.deletedScript) {
|
||||
show("已删除爬虫配置和脚本文件,已爬取视频保留", "success");
|
||||
} else {
|
||||
show("已删除爬虫配置,已爬取视频保留", "success");
|
||||
}
|
||||
setSelectedId("");
|
||||
setForm(emptyForm);
|
||||
setMode("list");
|
||||
@@ -243,63 +278,74 @@ export function CrawlersPage() {
|
||||
</header>
|
||||
|
||||
{mode === "list" ? (
|
||||
<div className="admin-crawler-console">
|
||||
<div className="admin-crawler-overview">
|
||||
<CrawlerMetric label="已配置" value={stats.total} icon={<SpiderIcon size={16} />} />
|
||||
<CrawlerMetric label="已就绪" value={stats.ready} icon={<Activity size={16} />} tone="ok" />
|
||||
<CrawlerMetric label="抓取中" value={stats.running} icon={<RefreshCw size={16} />} tone="info" />
|
||||
<CrawlerMetric label="错误" value={stats.error} icon={<CircleStop size={16} />} tone="error" />
|
||||
</div>
|
||||
|
||||
<div className="admin-card admin-crawler-list">
|
||||
<div className="admin-crawler-list__head">
|
||||
<header className="admin-card__title">
|
||||
<SpiderIcon size={16} /> 已配置爬虫
|
||||
</header>
|
||||
{loading ? (
|
||||
<div className="admin-loading">加载中...</div>
|
||||
) : list.length === 0 ? (
|
||||
<div className="admin-empty">暂无爬虫</div>
|
||||
) : (
|
||||
<div className="admin-drive-teasers">
|
||||
{list.map((crawler) => (
|
||||
<button
|
||||
key={crawler.id}
|
||||
type="button"
|
||||
className={`admin-drive-teaser ${crawler.id === selectedId ? "is-active" : ""}`}
|
||||
onClick={() => selectCrawler(crawler)}
|
||||
>
|
||||
<span className="admin-drive-teaser__name">
|
||||
<span className="admin-drive-card__brand-icon" data-kind={crawler.builtin || crawler.kind}>
|
||||
{crawler.builtin === "spider91" ? "91" : driveKindAbbr(crawler.kind)}
|
||||
</span>
|
||||
{crawler.name}
|
||||
</span>
|
||||
<span className={`admin-status is-${crawler.status === "ok" ? "ok" : crawler.status === "error" ? "error" : "pending"}`}>
|
||||
{crawler.status === "ok" ? "已就绪" : crawler.status === "error" ? "错误" : "未连接"}
|
||||
</span>
|
||||
<button className="admin-btn" type="button" onClick={refresh} disabled={loading}>
|
||||
<RefreshCw size={13} className={loading ? "admin-spin" : undefined} /> 刷新
|
||||
</button>
|
||||
</div>
|
||||
{loading ? (
|
||||
<div className="admin-loading-state">
|
||||
<RefreshCw size={18} className="admin-spin" />
|
||||
<span>加载中...</span>
|
||||
</div>
|
||||
) : list.length === 0 ? (
|
||||
<div className="admin-crawler-empty">
|
||||
<SpiderIcon size={28} />
|
||||
<strong>暂无爬虫</strong>
|
||||
<button className="admin-btn is-primary" type="button" onClick={createCustom}>
|
||||
<Plus size={13} /> 添加爬虫
|
||||
</button>
|
||||
</div>
|
||||
) : (
|
||||
<div className="admin-crawler-table">
|
||||
{list.map((crawler) => (
|
||||
<CrawlerRow
|
||||
key={crawler.id}
|
||||
crawler={crawler}
|
||||
active={crawler.id === selectedId}
|
||||
running={runningId === crawler.id}
|
||||
stopping={stoppingId === crawler.id}
|
||||
onSelect={() => selectCrawler(crawler)}
|
||||
onRun={() => run(crawler)}
|
||||
onStop={() => stop(crawler)}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
) : (
|
||||
<div className="admin-crawler-detail">
|
||||
<div className="admin-card">
|
||||
<header className="admin-card__title">
|
||||
<SpiderIcon size={16} /> {selected ? "爬虫配置" : "添加爬虫"}
|
||||
</header>
|
||||
<div className="admin-crawler-editor">
|
||||
<div className="admin-crawler-editor__main">
|
||||
<div className="admin-crawler-section">
|
||||
<div className="admin-crawler-section__head">
|
||||
<span className="admin-crawler-section__icon"><Settings2 size={15} /></span>
|
||||
<span className="admin-crawler-section__title">基础信息</span>
|
||||
</div>
|
||||
<div className="admin-crawler-script-name">
|
||||
<span>脚本名称</span>
|
||||
<strong>{form.name || "导入脚本后自动读取"}</strong>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="admin-crawler-section">
|
||||
<div className="admin-crawler-section__head">
|
||||
<span className="admin-crawler-section__icon"><FileCode2 size={15} /></span>
|
||||
<span className="admin-crawler-section__title">脚本导入与测试</span>
|
||||
</div>
|
||||
<div className="admin-form">
|
||||
{!selected && (
|
||||
<div className="admin-crawler-presets">
|
||||
<button className={`admin-btn ${form.builtin === "" ? "is-primary" : ""}`} type="button" onClick={createCustom}>
|
||||
<Plus size={13} /> 自定义脚本
|
||||
</button>
|
||||
<button className={`admin-btn ${form.builtin === "spider91" ? "is-primary" : ""}`} type="button" onClick={createSpider91}>
|
||||
<SpiderIcon size={13} /> 内置 91
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
<div className="admin-form__row">
|
||||
<label htmlFor="crawler-id">爬虫 ID *</label>
|
||||
<input id="crawler-id" value={form.id} onChange={(e) => set("id", e.target.value)} disabled={!!selected} />
|
||||
</div>
|
||||
<div className="admin-form__row">
|
||||
<label htmlFor="crawler-name">名称 *</label>
|
||||
<input id="crawler-name" value={form.name} onChange={(e) => set("name", e.target.value)} />
|
||||
</div>
|
||||
{!form.builtin && (
|
||||
<div className="admin-form__row">
|
||||
<label htmlFor="crawler-script-url">导入脚本</label>
|
||||
<div className="admin-crawler-import">
|
||||
@@ -327,19 +373,59 @@ export function CrawlersPage() {
|
||||
<button className="admin-btn" type="button" onClick={importScriptURL} disabled={importingScript}>
|
||||
<LinkIcon size={13} /> {importingScript ? "导入中..." : "链接导入"}
|
||||
</button>
|
||||
<button
|
||||
className="admin-btn"
|
||||
type="button"
|
||||
onClick={testScript}
|
||||
disabled={!form.scriptPath || importingScript || testingScript}
|
||||
>
|
||||
<TestTube size={13} /> {testingScript ? "测试中..." : "测试脚本"}
|
||||
</button>
|
||||
</div>
|
||||
{form.scriptPath && <div className="admin-form__help">脚本已导入</div>}
|
||||
{testResult && <CrawlerTestResult result={testResult} />}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="admin-crawler-section">
|
||||
<div className="admin-crawler-section__head">
|
||||
<span className="admin-crawler-section__icon"><Gauge size={15} /></span>
|
||||
<span className="admin-crawler-section__title">运行参数</span>
|
||||
</div>
|
||||
<div className="admin-crawler-params">
|
||||
<div className="admin-form__row">
|
||||
<label htmlFor="crawler-target">每次补充新视频数</label>
|
||||
<input id="crawler-target" value={form.targetNew} onChange={(e) => set("targetNew", e.target.value)} placeholder="10" />
|
||||
</div>
|
||||
<div className="admin-form__row">
|
||||
<label htmlFor="crawler-proxy">代理地址</label>
|
||||
<input id="crawler-proxy" value={form.proxy} onChange={(e) => set("proxy", e.target.value)} placeholder="http://127.0.0.1:7890" />
|
||||
<input
|
||||
id="crawler-proxy"
|
||||
value={form.proxy}
|
||||
onChange={(e) => {
|
||||
set("proxy", e.target.value);
|
||||
setTestResult(null);
|
||||
}}
|
||||
placeholder="http://127.0.0.1:7890"
|
||||
/>
|
||||
</div>
|
||||
<div className="admin-detail-actions">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<aside className="admin-crawler-editor__side">
|
||||
<div className="admin-crawler-action-panel">
|
||||
<div className="admin-crawler-action-panel__head">
|
||||
<span className="admin-crawler-action-panel__mark">
|
||||
<SpiderIcon size={18} />
|
||||
</span>
|
||||
<div>
|
||||
<strong>{selected ? "爬虫配置" : "添加爬虫"}</strong>
|
||||
<span>{selected ? crawlerStatusLabel(selected) : "未保存"}</span>
|
||||
</div>
|
||||
</div>
|
||||
<div className="admin-crawler-action-panel__buttons">
|
||||
<button className="admin-btn is-primary" onClick={save} disabled={saving}>
|
||||
<Save size={13} /> {saving ? "保存中..." : "保存"}
|
||||
</button>
|
||||
@@ -358,14 +444,14 @@ export function CrawlersPage() {
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{selected && (
|
||||
<div className="admin-card admin-crawler-status">
|
||||
<header className="admin-card__title">
|
||||
<Download size={16} /> 状态
|
||||
</header>
|
||||
<div className="admin-gen-columns">
|
||||
<div className="admin-crawler-side-panel">
|
||||
<div className="admin-crawler-section__head">
|
||||
<span className="admin-crawler-section__icon"><Activity size={15} /></span>
|
||||
<span className="admin-crawler-section__title">任务状态</span>
|
||||
</div>
|
||||
<div className="admin-crawler-status-grid">
|
||||
<CrawlerStatus label="抓取" status={selected.scanGenerationStatus} />
|
||||
<CrawlerStatus label="封面" status={selected.thumbnailGenerationStatus} />
|
||||
<CrawlerStatus label="预览视频" status={selected.previewGenerationStatus} />
|
||||
@@ -374,12 +460,89 @@ export function CrawlersPage() {
|
||||
{selected.lastError && <div className="admin-detail-error">{selected.lastError}</div>}
|
||||
</div>
|
||||
)}
|
||||
</aside>
|
||||
</div>
|
||||
)}
|
||||
</section>
|
||||
);
|
||||
}
|
||||
|
||||
// One tile of the overview strip: an icon, a label and a count. When `tone`
// is set, a matching `is-<tone>` modifier class is added.
function CrawlerMetric(props: { label: string; value: number; icon: ReactNode; tone?: "ok" | "info" | "error" }) {
  const { label, value, icon, tone } = props;
  const toneClass = tone ? `is-${tone}` : "";

  return (
    <div className={`admin-crawler-metric ${toneClass}`}>
      <span className="admin-crawler-metric__icon">{icon}</span>
      <span>{label}</span>
      <strong>{value}</strong>
    </div>
  );
}
|
||||
|
||||
// One row of the crawler table: a clickable summary, the four generation
// state chips, target/last-crawl metadata, and the manage/run/stop buttons.
function CrawlerRow(props: {
  crawler: api.AdminCrawler;
  active: boolean;
  running: boolean;
  stopping: boolean;
  onSelect: () => void;
  onRun: () => void;
  onStop: () => void;
}) {
  const { crawler, active, running, stopping, onSelect, onRun, onStop } = props;

  const rowClass = `admin-crawler-row ${active ? "is-active" : ""}`;

  // Any status other than "ok" or "error" is shown with the pending style.
  let statusTone = "pending";
  if (crawler.status === "ok") {
    statusTone = "ok";
  } else if (crawler.status === "error") {
    statusTone = "error";
  }

  const chips = [
    { label: "抓取", status: crawler.scanGenerationStatus },
    { label: "封面", status: crawler.thumbnailGenerationStatus },
    { label: "预览", status: crawler.previewGenerationStatus },
    { label: "指纹", status: crawler.fingerprintGenerationStatus },
  ];

  return (
    <div className={rowClass}>
      <button type="button" className="admin-crawler-row__main" onClick={onSelect}>
        <span className="admin-crawler-row__brand">
          <SpiderIcon size={16} />
        </span>
        <span className="admin-crawler-row__title-wrap">
          <strong>{crawler.name}</strong>
          <span>{crawler.scriptPath ? "脚本已导入" : "未导入脚本"}</span>
        </span>
        <span className={`admin-status is-${statusTone}`}>{crawlerStatusLabel(crawler)}</span>
        <ChevronRight size={16} className="admin-crawler-row__chevron" />
      </button>
      <div className="admin-crawler-row__states">
        {chips.map((chip) => (
          <CrawlerStateChip key={chip.label} label={chip.label} status={chip.status} />
        ))}
      </div>
      <div className="admin-crawler-row__meta">
        <span><Gauge size={12} /> {crawler.targetNew || "10"} 条</span>
        <span><Clock size={12} /> {formatLastCrawl(crawler.lastCrawlAt)}</span>
      </div>
      <div className="admin-crawler-row__actions">
        <button className="admin-btn" type="button" onClick={onSelect}>
          <Settings2 size={13} /> 管理
        </button>
        <button className="admin-btn" type="button" onClick={onRun} disabled={running}>
          <Download size={13} /> {running ? "触发中..." : "立即抓取"}
        </button>
        <button className="admin-btn is-stop" type="button" onClick={onStop} disabled={stopping}>
          <CircleStop size={13} /> {stopping ? "停止中..." : "停止"}
        </button>
      </div>
    </div>
  );
}
|
||||
|
||||
// Compact "<label> · <state>" chip. A missing status is treated as "idle",
// and the crawl chip shows "抓取中" while its state is "scanning".
function CrawlerStateChip({ label, status }: { label: string; status?: api.DriveGenerationStatus }) {
  const state = status?.state || "idle";
  const chipClass = `admin-crawler-state-chip is-${generationStateClass(state)}`;
  const isCrawling = label === "抓取" && state === "scanning";
  const stateText = isCrawling ? "抓取中" : generationStateLabel(state);

  return (
    <span className={chipClass}>
      {label} · {stateText}
    </span>
  );
}
|
||||
|
||||
function CrawlerStatus({ label, status }: { label: string; status?: api.DriveGenerationStatus }) {
|
||||
const state = status?.state || "idle";
|
||||
const labelText = label === "抓取" && state === "scanning" ? "抓取中" : generationStateLabel(state);
|
||||
@@ -400,3 +563,89 @@ function CrawlerStatus({ label, status }: { label: string; status?: api.DriveGen
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Renders the outcome of a crawler script dry run: pass/fail status, number of
 * scraped items, elapsed time, the failure message (if any), the fields of the
 * first scraped item, the media link check, and the collapsible script log.
 */
function CrawlerTestResult({ result }: { result: api.CrawlerDryRunResult }) {
  // `items` is typed as an array, but a null or missing value in the JSON payload
  // would make `items[0]` / `items.length` throw and hide the failure message.
  // NOTE(review): whether the server can actually send null here is not visible
  // from this file; the fallback is harmless either way.
  const items = result.items ?? [];
  const logLines = result.log ?? [];
  const item = items[0];
  const failure = crawlerTestFailure(result);
  const media = result.mediaCheck;
  const statusText = result.ok ? "测试通过" : "测试失败";

  // Runs shorter than half a second used to round down to a misleading "0 秒".
  const seconds = Math.round(result.durationMs / 1000);
  const durationText = seconds > 0 ? `${seconds} 秒` : "<1 秒";

  return (
    <div className={`admin-crawler-test-result ${result.ok ? "is-ok" : "is-error"}`}>
      <div className="admin-crawler-test-result__head">
        <span className={`admin-status is-${result.ok ? "ok" : "error"}`}>{statusText}</span>
        <span>抓取到 {items.length} 条视频</span>
        {result.durationMs > 0 && <span>{durationText}</span>}
      </div>

      {failure && <div className="admin-crawler-test-result__error">{failure}</div>}

      {/* Only the first scraped item is previewed. */}
      {item && (
        <div className="admin-crawler-test-result__grid">
          <CrawlerTestField label="视频名" value={item.title} />
          <CrawlerTestField label="唯一标识" value={item.sourceId} />
          <CrawlerTestField label="视频直链" value={item.mediaUrl || item.mediaLocalFile} />
          <CrawlerTestField label="封面图" value={item.thumbnailUrl} />
          <CrawlerTestField label="详情页" value={item.detailUrl} />
        </div>
      )}

      {media && (
        <div className="admin-crawler-test-result__media">
          <span>直链校验</span>
          <strong>
            {media.ok ? "可访问" : "不可访问"}
            {media.status ? ` · HTTP ${media.status}` : ""}
            {media.contentType ? ` · ${media.contentType}` : ""}
            {media.contentLengthBytes ? ` · ${formatBytes(media.contentLengthBytes)}` : ""}
          </strong>
        </div>
      )}

      {logLines.length > 0 && (
        <details className="admin-crawler-test-result__log">
          <summary>脚本日志</summary>
          <pre>{logLines.join("\n")}</pre>
        </details>
      )}
    </div>
  );
}
|
||||
|
||||
/**
 * One "label / value" line of the dry-run preview.
 *
 * Renders nothing when the value is undefined or an empty string, so absent
 * optional fields do not leave blank rows.
 */
function CrawlerTestField({ label, value }: { label: string; value?: string | number }) {
  const isBlank = value === undefined || value === "";

  return isBlank ? null : (
    <div className="admin-crawler-test-result__field">
      <span>{label}</span>
      <strong>{value}</strong>
    </div>
  );
}
|
||||
|
||||
/**
 * Picks the message to show for a failed dry run.
 *
 * The run-level error wins; otherwise the media check error is used; an empty
 * string means there is nothing to report.
 */
function crawlerTestFailure(result: api.CrawlerDryRunResult) {
  if (result.error) {
    return result.error;
  }
  const mediaError = result.mediaCheck?.error;
  return mediaError ? mediaError : "";
}
|
||||
|
||||
/**
 * Maps a crawler's `status` string to its display label.
 *
 * "ok" and "error" have dedicated labels; any other value is shown as "未连接".
 */
function crawlerStatusLabel(crawler: api.AdminCrawler) {
  switch (crawler.status) {
    case "ok":
      return "已就绪";
    case "error":
      return "错误";
    default:
      return "未连接";
  }
}
|
||||
|
||||
/**
 * Formats the last crawl time for the crawler list.
 *
 * `ts` is multiplied by 1000 before building the Date, i.e. it is handled as a
 * timestamp in seconds. A missing or zero value yields "未抓取". Otherwise the
 * result is the zh-CN month/day hour:minute rendering in the viewer's local
 * time zone.
 */
function formatLastCrawl(ts?: number) {
  if (ts) {
    const when = new Date(ts * 1000);
    return when.toLocaleString("zh-CN", {
      month: "2-digit",
      day: "2-digit",
      hour: "2-digit",
      minute: "2-digit",
    });
  }
  return "未抓取";
}
|
||||
|
||||
/**
 * Formats a byte count as a short human-readable size using 1024-based units.
 *
 * Returns "" for non-finite, zero, or negative input so callers can simply omit
 * the size. Values below 1024 are printed as-is in bytes; larger values use one
 * decimal place in KB, MB, or GB (GB is the largest unit).
 */
function formatBytes(bytes: number) {
  if (!Number.isFinite(bytes) || bytes <= 0) return "";
  if (bytes < 1024) return `${bytes} B`;

  const units = ["KB", "MB", "GB"];
  let value = bytes / 1024;
  let unit = 0;
  // Decide on the *rounded* value: picking the unit before rounding made counts
  // just under a boundary (e.g. 1048575) print "1024.0 KB" instead of "1.0 MB".
  while (unit < units.length - 1 && Number(value.toFixed(1)) >= 1024) {
    value /= 1024;
    unit += 1;
  }
  return `${value.toFixed(1)} ${units[unit]}`;
}
|
||||
|
||||
+37
-11
@@ -195,14 +195,11 @@ export type AdminCrawler = {
|
||||
id: string;
|
||||
name: string;
|
||||
kind: "scriptcrawler" | "spider91";
|
||||
builtin?: string;
|
||||
status: string;
|
||||
lastError?: string;
|
||||
scriptPath: string;
|
||||
pythonPath?: string;
|
||||
proxy?: string;
|
||||
targetNew?: string;
|
||||
configJson?: string;
|
||||
lastCrawlAt?: number;
|
||||
scanGenerationStatus?: DriveGenerationStatus;
|
||||
thumbnailGenerationStatus?: DriveGenerationStatus;
|
||||
@@ -220,18 +217,41 @@ export type AdminCrawler = {
|
||||
};
|
||||
|
||||
export type UpsertCrawlerInput = {
|
||||
id: string;
|
||||
name: string;
|
||||
builtin?: string;
|
||||
id?: string;
|
||||
scriptPath: string;
|
||||
pythonPath?: string;
|
||||
proxy?: string;
|
||||
targetNew?: string;
|
||||
configJson?: string;
|
||||
};
|
||||
|
||||
/** Response returned after importing a crawler script (by file upload or by URL). */
export type ImportCrawlerScriptResult = {
  /** Location of the imported script as reported by the server. */
  scriptPath: string;
  /** Crawler name reported for the script. NOTE(review): presumably the script's CRAWLER_NAME; confirm against the server. */
  name: string;
};
|
||||
|
||||
/** One video entry returned by a crawler script dry run. Only `title` is required. */
export type CrawlerDryRunItem = {
  /** Video title. */
  title: string;
  /** Identifier of the video at its source. */
  sourceId?: string;
  /** Media link of the video; the UI prefers this over `mediaLocalFile`. */
  mediaUrl?: string;
  /** Local media file, shown when `mediaUrl` is empty. */
  mediaLocalFile?: string;
  /** Cover image link. */
  thumbnailUrl?: string;
  /** Link to the video's detail page. */
  detailUrl?: string;
};
|
||||
|
||||
/** Result of checking whether a dry-run item's media link is reachable. */
export type CrawlerDryRunMediaCheck = {
  /** True when the link was reachable. */
  ok: boolean;
  /** HTTP status of the check, when one was received. */
  status?: number;
  /** Content type reported for the media, if any. */
  contentType?: string;
  /** Media size in bytes, if reported. */
  contentLengthBytes?: number;
  /** Failure description when the check did not succeed. */
  error?: string;
};
|
||||
|
||||
/** Response of the crawler script test endpoint ("/crawlers/test-script"). */
export type CrawlerDryRunResult = {
  /** Overall verdict of the test run. */
  ok: boolean;
  /** Videos scraped during the run. */
  items: CrawlerDryRunItem[];
  /** Media link check, when one was performed. */
  mediaCheck?: CrawlerDryRunMediaCheck;
  /** Run-level failure message. */
  error?: string;
  /** Script log output, one entry per line. */
  log?: string[];
  /** Elapsed time of the run in milliseconds. */
  durationMs: number;
};
|
||||
|
||||
export function listCrawlers() {
|
||||
@@ -239,7 +259,7 @@ export function listCrawlers() {
|
||||
}
|
||||
|
||||
export function upsertCrawler(body: UpsertCrawlerInput) {
|
||||
return request<{ ok: boolean; warning?: string }>("/crawlers", {
|
||||
return request<{ ok: boolean; id: string; warning?: string }>("/crawlers", {
|
||||
method: "POST",
|
||||
body: JSON.stringify(body),
|
||||
});
|
||||
@@ -261,6 +281,13 @@ export function importCrawlerScriptURL(url: string) {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Asks the server to dry-run a crawler script.
 *
 * POSTs the JSON-encoded `body` (script path plus optional proxy) to
 * "/crawlers/test-script" and resolves with the dry-run result.
 */
export function testCrawlerScript(body: { scriptPath: string; proxy?: string }) {
  const payload = JSON.stringify(body);
  const init = { method: "POST", body: payload };
  return request<CrawlerDryRunResult>("/crawlers/test-script", init);
}
|
||||
|
||||
export function runCrawler(id: string) {
|
||||
return request<{ ok: boolean; accepted: boolean; message?: string; status?: NightlyJobStatus }>(
|
||||
`/crawlers/${encodeURIComponent(id)}/run`,
|
||||
@@ -276,9 +303,8 @@ export function stopCrawlerTasks(id: string) {
|
||||
}
|
||||
|
||||
export function deleteCrawler(id: string) {
|
||||
return request<{ ok: boolean; deletedVideos: number }>(`/crawlers/${encodeURIComponent(id)}`, {
|
||||
return request<{ ok: boolean; deletedVideos: number; deletedScript?: boolean; warning?: string }>(`/crawlers/${encodeURIComponent(id)}`, {
|
||||
method: "DELETE",
|
||||
body: JSON.stringify({ deleteVideos: true }),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -163,7 +163,7 @@ export function credentialHelp(kind: Kind, isEdit: boolean): string {
|
||||
case "localstorage":
|
||||
return `填写服务器可访问的本地目录绝对路径,例如 /mnt/videos。系统会扫描该目录及子目录中的视频文件和 .strm 文件;.strm 可指向 HTTP/HTTPS 直链,或指向本地存储根目录内的真实视频路径。Docker 部署时请填写容器内路径。${note}`;
|
||||
case "spider91":
|
||||
return "91Spider 不再支持通过网盘添加或编辑。请到后台爬虫管理页面添加内置 91 或自定义爬虫脚本。";
|
||||
return "91Spider 不再支持通过网盘添加或编辑。请到后台爬虫管理页面添加爬虫脚本。";
|
||||
default:
|
||||
return "";
|
||||
}
|
||||
|
||||
+617
-1
@@ -335,6 +335,541 @@
|
||||
margin-bottom: var(--space-3);
|
||||
}
|
||||
|
||||
/* =========================================================
|
||||
* Crawler Management
|
||||
* ========================================================= */
|
||||
.admin-crawler-console {
|
||||
display: grid;
|
||||
gap: var(--space-4);
|
||||
}
|
||||
|
||||
.admin-crawler-overview {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(4, minmax(0, 1fr));
|
||||
gap: var(--space-3);
|
||||
}
|
||||
|
||||
.admin-crawler-metric {
|
||||
display: grid;
|
||||
grid-template-columns: 38px minmax(0, 1fr);
|
||||
grid-template-areas:
|
||||
"icon label"
|
||||
"icon value";
|
||||
align-items: center;
|
||||
min-height: 76px;
|
||||
padding: var(--space-4);
|
||||
border: 1px solid var(--border-subtle);
|
||||
border-radius: var(--radius-sm);
|
||||
background: var(--bg-surface);
|
||||
box-shadow: var(--shadow-sm);
|
||||
}
|
||||
|
||||
.admin-crawler-metric__icon {
|
||||
grid-area: icon;
|
||||
width: 34px;
|
||||
height: 34px;
|
||||
display: grid;
|
||||
place-items: center;
|
||||
border-radius: var(--radius-xs);
|
||||
color: var(--accent);
|
||||
background: var(--accent-soft);
|
||||
}
|
||||
|
||||
.admin-crawler-metric span:not(.admin-crawler-metric__icon) {
|
||||
grid-area: label;
|
||||
color: var(--text-faint);
|
||||
font-size: var(--font-xs);
|
||||
font-weight: var(--weight-medium);
|
||||
}
|
||||
|
||||
.admin-crawler-metric strong {
|
||||
grid-area: value;
|
||||
color: var(--text-strong);
|
||||
font-size: var(--font-2xl);
|
||||
font-weight: var(--weight-bold);
|
||||
line-height: 1.1;
|
||||
font-variant-numeric: tabular-nums;
|
||||
}
|
||||
|
||||
.admin-crawler-metric.is-ok .admin-crawler-metric__icon {
|
||||
color: var(--success);
|
||||
background: var(--success-soft);
|
||||
}
|
||||
|
||||
.admin-crawler-metric.is-info .admin-crawler-metric__icon {
|
||||
color: var(--info);
|
||||
background: var(--info-soft);
|
||||
}
|
||||
|
||||
.admin-crawler-metric.is-error .admin-crawler-metric__icon {
|
||||
color: var(--danger);
|
||||
background: var(--danger-soft);
|
||||
}
|
||||
|
||||
.admin-crawler-list {
|
||||
padding: 0;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.admin-crawler-list__head {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: var(--space-3);
|
||||
padding: var(--space-4) var(--space-5);
|
||||
border-bottom: 1px solid var(--border-subtle);
|
||||
}
|
||||
|
||||
.admin-crawler-list__head .admin-card__title {
|
||||
margin-bottom: 0;
|
||||
}
|
||||
|
||||
.admin-spin {
|
||||
animation: admin-update-spin 0.9s linear infinite;
|
||||
transform-box: fill-box;
|
||||
transform-origin: center;
|
||||
will-change: transform;
|
||||
}
|
||||
|
||||
.admin-crawler-empty {
|
||||
min-height: 280px;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: var(--space-3);
|
||||
padding: var(--space-7) var(--space-4);
|
||||
color: var(--text-faint);
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.admin-crawler-empty svg {
|
||||
color: var(--accent);
|
||||
}
|
||||
|
||||
.admin-crawler-empty strong {
|
||||
color: var(--text-strong);
|
||||
font-size: var(--font-lg);
|
||||
}
|
||||
|
||||
.admin-crawler-table {
|
||||
display: grid;
|
||||
}
|
||||
|
||||
.admin-crawler-row {
|
||||
display: grid;
|
||||
grid-template-columns: minmax(260px, 1.25fr) minmax(260px, 1fr) minmax(150px, 0.55fr) auto;
|
||||
align-items: center;
|
||||
gap: var(--space-3);
|
||||
padding: var(--space-3) var(--space-5);
|
||||
border-bottom: 1px solid var(--border-subtle);
|
||||
background: transparent;
|
||||
transition: background var(--transition-fast), border-color var(--transition-fast);
|
||||
}
|
||||
|
||||
.admin-crawler-row:last-child {
|
||||
border-bottom: 0;
|
||||
}
|
||||
|
||||
.admin-crawler-row:hover,
|
||||
.admin-crawler-row.is-active {
|
||||
background: rgba(255, 255, 255, 0.025);
|
||||
}
|
||||
|
||||
.admin-crawler-row.is-active {
|
||||
box-shadow: inset 3px 0 0 var(--accent);
|
||||
}
|
||||
|
||||
.admin-crawler-row__main {
|
||||
appearance: none;
|
||||
width: 100%;
|
||||
min-width: 0;
|
||||
display: grid;
|
||||
grid-template-columns: 38px minmax(0, 1fr) auto 18px;
|
||||
align-items: center;
|
||||
gap: var(--space-3);
|
||||
padding: 0;
|
||||
border: 0;
|
||||
background: transparent;
|
||||
color: inherit;
|
||||
font: inherit;
|
||||
text-align: left;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.admin-crawler-row__main:focus-visible {
|
||||
outline: 2px solid var(--accent);
|
||||
outline-offset: 4px;
|
||||
border-radius: var(--radius-sm);
|
||||
}
|
||||
|
||||
.admin-crawler-row__brand {
|
||||
width: 38px;
|
||||
height: 38px;
|
||||
display: grid;
|
||||
place-items: center;
|
||||
border-radius: var(--radius-xs);
|
||||
color: var(--accent);
|
||||
background: var(--accent-soft);
|
||||
border: 1px solid rgba(255, 138, 60, 0.2);
|
||||
}
|
||||
|
||||
.admin-crawler-row__title-wrap {
|
||||
min-width: 0;
|
||||
display: grid;
|
||||
gap: 3px;
|
||||
}
|
||||
|
||||
.admin-crawler-row__title-wrap strong {
|
||||
min-width: 0;
|
||||
color: var(--text-strong);
|
||||
font-size: var(--font-md);
|
||||
font-weight: var(--weight-semibold);
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.admin-crawler-row__title-wrap span {
|
||||
color: var(--text-faint);
|
||||
font-size: var(--font-xs);
|
||||
}
|
||||
|
||||
.admin-crawler-row__chevron {
|
||||
color: var(--text-faint);
|
||||
transition: transform var(--transition-fast), color var(--transition-fast);
|
||||
}
|
||||
|
||||
.admin-crawler-row__main:hover .admin-crawler-row__chevron {
|
||||
color: var(--text-strong);
|
||||
transform: translateX(2px);
|
||||
}
|
||||
|
||||
.admin-crawler-row__states {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.admin-crawler-state-chip {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
min-height: 24px;
|
||||
padding: 3px 8px;
|
||||
border-radius: var(--radius-pill);
|
||||
border: 1px solid transparent;
|
||||
font-size: var(--font-xs);
|
||||
font-weight: var(--weight-medium);
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.admin-crawler-state-chip.is-idle {
|
||||
color: var(--success);
|
||||
background: var(--success-soft);
|
||||
}
|
||||
|
||||
.admin-crawler-state-chip.is-generating {
|
||||
color: var(--info);
|
||||
background: var(--info-soft);
|
||||
}
|
||||
|
||||
.admin-crawler-state-chip.is-cooling {
|
||||
color: var(--warning);
|
||||
background: var(--warning-soft);
|
||||
}
|
||||
|
||||
.admin-crawler-state-chip.is-queued {
|
||||
color: var(--text-muted);
|
||||
background: rgba(255, 255, 255, 0.06);
|
||||
border-color: var(--border-subtle);
|
||||
}
|
||||
|
||||
.admin-crawler-row__meta {
|
||||
display: grid;
|
||||
gap: 6px;
|
||||
color: var(--text-muted);
|
||||
font-size: var(--font-xs);
|
||||
}
|
||||
|
||||
.admin-crawler-row__meta span {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
min-width: 0;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.admin-crawler-row__actions {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: flex-end;
|
||||
flex-wrap: wrap;
|
||||
gap: var(--space-2);
|
||||
}
|
||||
|
||||
.admin-crawler-editor {
|
||||
display: grid;
|
||||
grid-template-columns: minmax(0, 1fr) 320px;
|
||||
gap: var(--space-5);
|
||||
align-items: start;
|
||||
}
|
||||
|
||||
.admin-crawler-editor__main {
|
||||
display: grid;
|
||||
gap: var(--space-4);
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.admin-crawler-editor__side {
|
||||
display: grid;
|
||||
gap: var(--space-4);
|
||||
position: sticky;
|
||||
top: var(--space-5);
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.admin-crawler-section,
|
||||
.admin-crawler-action-panel,
|
||||
.admin-crawler-side-panel {
|
||||
border: 1px solid var(--border-subtle);
|
||||
border-radius: var(--radius-sm);
|
||||
background: var(--bg-surface);
|
||||
box-shadow: var(--shadow-sm);
|
||||
}
|
||||
|
||||
.admin-crawler-section {
|
||||
padding: var(--space-5);
|
||||
}
|
||||
|
||||
.admin-crawler-section__head {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: var(--space-2);
|
||||
margin-bottom: var(--space-4);
|
||||
}
|
||||
|
||||
.admin-crawler-section__icon {
|
||||
width: 28px;
|
||||
height: 28px;
|
||||
display: grid;
|
||||
place-items: center;
|
||||
border-radius: var(--radius-xs);
|
||||
color: var(--accent);
|
||||
background: var(--accent-soft);
|
||||
flex: 0 0 auto;
|
||||
}
|
||||
|
||||
.admin-crawler-section__title {
|
||||
color: var(--text-strong);
|
||||
font-size: var(--font-md);
|
||||
font-weight: var(--weight-semibold);
|
||||
}
|
||||
|
||||
.admin-crawler-section .admin-form {
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
.admin-crawler-script-name {
|
||||
display: grid;
|
||||
grid-template-columns: 92px minmax(0, 1fr);
|
||||
gap: var(--space-3);
|
||||
align-items: center;
|
||||
min-height: 42px;
|
||||
padding: var(--space-3);
|
||||
border: 1px solid var(--border-subtle);
|
||||
border-radius: var(--radius-sm);
|
||||
background: var(--bg-sunken);
|
||||
}
|
||||
|
||||
.admin-crawler-script-name span {
|
||||
color: var(--text-faint);
|
||||
font-size: var(--font-xs);
|
||||
font-weight: var(--weight-medium);
|
||||
}
|
||||
|
||||
.admin-crawler-script-name strong {
|
||||
min-width: 0;
|
||||
color: var(--text-strong);
|
||||
font-size: var(--font-sm);
|
||||
font-weight: var(--weight-semibold);
|
||||
overflow-wrap: anywhere;
|
||||
}
|
||||
|
||||
.admin-crawler-params {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(2, minmax(0, 1fr));
|
||||
gap: var(--space-4);
|
||||
}
|
||||
|
||||
.admin-crawler-action-panel {
|
||||
padding: var(--space-4);
|
||||
}
|
||||
|
||||
.admin-crawler-action-panel__head {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: var(--space-3);
|
||||
padding-bottom: var(--space-4);
|
||||
border-bottom: 1px solid var(--border-subtle);
|
||||
}
|
||||
|
||||
.admin-crawler-action-panel__mark {
|
||||
width: 42px;
|
||||
height: 42px;
|
||||
display: grid;
|
||||
place-items: center;
|
||||
border-radius: var(--radius-xs);
|
||||
color: var(--text-on-accent);
|
||||
background: var(--accent);
|
||||
box-shadow: var(--shadow-sm);
|
||||
}
|
||||
|
||||
.admin-crawler-action-panel__head > div {
|
||||
display: grid;
|
||||
gap: 3px;
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.admin-crawler-action-panel__head strong {
|
||||
color: var(--text-strong);
|
||||
font-size: var(--font-md);
|
||||
font-weight: var(--weight-semibold);
|
||||
}
|
||||
|
||||
.admin-crawler-action-panel__head > div span {
|
||||
color: var(--text-faint);
|
||||
font-size: var(--font-xs);
|
||||
}
|
||||
|
||||
.admin-crawler-action-panel__buttons {
|
||||
display: grid;
|
||||
gap: var(--space-2);
|
||||
padding-top: var(--space-4);
|
||||
}
|
||||
|
||||
.admin-crawler-action-panel__buttons .admin-btn {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.admin-crawler-side-panel {
|
||||
padding: var(--space-4);
|
||||
}
|
||||
|
||||
.admin-crawler-side-panel .admin-crawler-section__head {
|
||||
margin-bottom: var(--space-3);
|
||||
}
|
||||
|
||||
.admin-crawler-status-grid {
|
||||
display: grid;
|
||||
gap: var(--space-3);
|
||||
}
|
||||
|
||||
.admin-crawler-status-grid .admin-gen-col {
|
||||
background: var(--bg-sunken);
|
||||
}
|
||||
|
||||
.admin-btn[aria-disabled="true"] {
|
||||
opacity: 0.45;
|
||||
cursor: not-allowed;
|
||||
pointer-events: none;
|
||||
}
|
||||
|
||||
@media (max-width: 1180px) {
|
||||
.admin-crawler-row {
|
||||
grid-template-columns: minmax(260px, 1fr) minmax(220px, 0.9fr);
|
||||
}
|
||||
|
||||
.admin-crawler-row__meta,
|
||||
.admin-crawler-row__actions {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.admin-crawler-row__meta {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: var(--space-3);
|
||||
}
|
||||
|
||||
.admin-crawler-row__actions {
|
||||
justify-content: flex-start;
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 1024px) {
|
||||
.admin-crawler-editor {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.admin-crawler-editor__side {
|
||||
position: static;
|
||||
}
|
||||
|
||||
.admin-crawler-action-panel__buttons {
|
||||
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 760px) {
|
||||
.admin-crawler-overview {
|
||||
grid-template-columns: repeat(2, minmax(0, 1fr));
|
||||
}
|
||||
|
||||
.admin-crawler-list__head {
|
||||
align-items: stretch;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.admin-crawler-list__head .admin-btn {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.admin-crawler-row {
|
||||
grid-template-columns: 1fr;
|
||||
padding: var(--space-4);
|
||||
}
|
||||
|
||||
.admin-crawler-row__main {
|
||||
grid-template-columns: 38px minmax(0, 1fr) auto;
|
||||
}
|
||||
|
||||
.admin-crawler-row__main .admin-status {
|
||||
justify-self: start;
|
||||
grid-column: 2 / 3;
|
||||
}
|
||||
|
||||
.admin-crawler-row__chevron {
|
||||
grid-column: 3 / 4;
|
||||
grid-row: 1 / 3;
|
||||
}
|
||||
|
||||
.admin-crawler-params {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.admin-crawler-script-name {
|
||||
grid-template-columns: 1fr;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.admin-crawler-row__actions .admin-btn {
|
||||
flex: 1 1 120px;
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 520px) {
|
||||
.admin-crawler-overview {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.admin-crawler-section,
|
||||
.admin-crawler-action-panel,
|
||||
.admin-crawler-side-panel {
|
||||
padding: var(--space-4);
|
||||
}
|
||||
}
|
||||
|
||||
/* ----- Storage summary ----- */
|
||||
.admin-storage-summary {
|
||||
display: grid;
|
||||
@@ -475,7 +1010,7 @@
|
||||
|
||||
.admin-crawler-import {
|
||||
display: grid;
|
||||
grid-template-columns: auto minmax(180px, 1fr) auto;
|
||||
grid-template-columns: auto minmax(180px, 1fr) auto auto;
|
||||
gap: var(--space-2);
|
||||
align-items: center;
|
||||
}
|
||||
@@ -494,6 +1029,87 @@
|
||||
line-height: var(--line-relaxed);
|
||||
}
|
||||
|
||||
.admin-crawler-test-result {
|
||||
display: grid;
|
||||
gap: var(--space-3);
|
||||
margin-top: var(--space-3);
|
||||
padding: var(--space-3);
|
||||
border: 1px solid var(--border-subtle);
|
||||
border-radius: var(--radius-sm);
|
||||
background: var(--bg-elevated);
|
||||
}
|
||||
|
||||
.admin-crawler-test-result.is-ok {
|
||||
border-color: var(--success);
|
||||
}
|
||||
|
||||
.admin-crawler-test-result.is-error {
|
||||
border-color: var(--danger);
|
||||
}
|
||||
|
||||
.admin-crawler-test-result__head {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: var(--space-2);
|
||||
flex-wrap: wrap;
|
||||
font-size: var(--font-xs);
|
||||
color: var(--text-muted);
|
||||
}
|
||||
|
||||
.admin-crawler-test-result__error {
|
||||
padding: var(--space-2) var(--space-3);
|
||||
border-radius: var(--radius-sm);
|
||||
background: var(--danger-soft);
|
||||
color: var(--danger);
|
||||
font-size: var(--font-sm);
|
||||
line-height: var(--line-relaxed);
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.admin-crawler-test-result__grid {
|
||||
display: grid;
|
||||
gap: var(--space-2);
|
||||
}
|
||||
|
||||
.admin-crawler-test-result__field,
|
||||
.admin-crawler-test-result__media {
|
||||
display: grid;
|
||||
grid-template-columns: 82px minmax(0, 1fr);
|
||||
gap: var(--space-2);
|
||||
align-items: baseline;
|
||||
font-size: var(--font-xs);
|
||||
}
|
||||
|
||||
.admin-crawler-test-result__field span,
|
||||
.admin-crawler-test-result__media span {
|
||||
color: var(--text-faint);
|
||||
}
|
||||
|
||||
.admin-crawler-test-result__field strong,
|
||||
.admin-crawler-test-result__media strong {
|
||||
color: var(--text-strong);
|
||||
font-weight: var(--weight-medium);
|
||||
min-width: 0;
|
||||
overflow-wrap: anywhere;
|
||||
}
|
||||
|
||||
.admin-crawler-test-result__log {
|
||||
font-size: var(--font-xs);
|
||||
color: var(--text-muted);
|
||||
}
|
||||
|
||||
.admin-crawler-test-result__log summary {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.admin-crawler-test-result__log pre {
|
||||
margin: var(--space-2) 0 0;
|
||||
max-height: 180px;
|
||||
overflow: auto;
|
||||
white-space: pre-wrap;
|
||||
color: var(--text-muted);
|
||||
}
|
||||
|
||||
.admin-p123-qr {
|
||||
display: grid;
|
||||
gap: var(--space-3);
|
||||
|
||||
@@ -221,18 +221,29 @@ test("crawler management is a separate admin section", () => {
|
||||
assert.match(crawlerPageSource, /api\.deleteCrawler/);
|
||||
assert.match(crawlerPageSource, /api\.importCrawlerScriptFile/);
|
||||
assert.match(crawlerPageSource, /api\.importCrawlerScriptURL/);
|
||||
assert.match(crawlerPageSource, /api\.testCrawlerScript/);
|
||||
assert.match(crawlerPageSource, /type="file"/);
|
||||
assert.match(crawlerPageSource, /链接导入/);
|
||||
assert.match(crawlerPageSource, /测试脚本/);
|
||||
assert.match(crawlerPageSource, /测试通过/);
|
||||
assert.doesNotMatch(crawlerPageSource, /新建脚本/);
|
||||
assert.doesNotMatch(crawlerPageSource, /爬虫 ID/);
|
||||
assert.doesNotMatch(crawlerPageSource, /crawler-id/);
|
||||
assert.doesNotMatch(crawlerPageSource, /crawler-name/);
|
||||
assert.doesNotMatch(crawlerPageSource, /脚本路径/);
|
||||
assert.doesNotMatch(crawlerPageSource, /Python 解释器/);
|
||||
assert.doesNotMatch(crawlerPageSource, /自定义配置 JSON/);
|
||||
assert.doesNotMatch(crawlerPageSource, /Bot/);
|
||||
assert.match(crawlerPageSource, /builtin:\s*"spider91"/);
|
||||
// 项目不再内置任何爬虫:不允许出现内置 91 预设
|
||||
assert.doesNotMatch(crawlerPageSource, /builtin/);
|
||||
assert.doesNotMatch(crawlerPageSource, /内置 91/);
|
||||
assert.match(apiSource, /type AdminCrawler/);
|
||||
assert.match(apiSource, /"\/crawlers"/);
|
||||
assert.match(apiSource, /"\/crawlers\/import-file"/);
|
||||
assert.match(apiSource, /"\/crawlers\/import-url"/);
|
||||
assert.match(apiSource, /"\/crawlers\/test-script"/);
|
||||
assert.match(apiSource, /type CrawlerDryRunResult/);
|
||||
assert.match(apiSource, /id\?: string/);
|
||||
assert.match(apiSource, /new FormData\(\)/);
|
||||
assert.doesNotMatch(driveFormSource, /scriptcrawler/);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user