feat(crawler): simplify script crawler workflow

Redesign crawler management around imported Python scripts instead of built-in crawler storage. Crawler scripts now declare CRAWLER_NAME, imports validate metadata, crawler IDs are generated internally, and deleted crawler scripts are detached without deleting already imported videos.

Add backend support for file and URL script imports, dry-run testing, metadata parsing, safer job paths, original filename preservation, and crawler listing that ignores detached script records. Remove the legacy built-in Spider91 script path flow and hidden Python/config JSON fields from the crawler API.

Rework the admin crawler page into an independent crawler console with script import, dry-run testing, status metrics, spider iconography, and simplified controls. Update docs, examples, installer checks, Docker/release packaging, and tests for the new protocol.
This commit is contained in:
nianzhibai
2026-06-10 14:22:47 +08:00
parent ec5a01b6aa
commit c1355385e1
24 changed files with 2355 additions and 274 deletions
+1
View File
@@ -154,6 +154,7 @@ OUTPUT_FILE = "91porn_videos.json"
MAX_PAGES = None # 设置为 None 爬取所有页,或设置整数如 5 只爬前5页
RESUME = True # 是否跳过输出文件中已存在的 viewkey (断点续爬)
MAX_EMPTY_PAGES = 2 # 连续空页数达到此值时停止爬取
CRAWLER_NAME = "91Porn"
CRAWLER_PROTOCOL = "crawler.v1"
# ===================================================
-1
View File
@@ -48,7 +48,6 @@ WORKDIR /opt/video-site-91
COPY --from=backend /out/server ./server
COPY --from=frontend /app/dist ./dist
COPY backend/config.example.yaml ./config.example.yaml
COPY 91VideoSpider/ ./91VideoSpider/
COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
ARG VERSION=dev
+1 -1
View File
@@ -23,7 +23,7 @@
- **多后端支持** — 兼容 115 云盘、PikPak 云盘、123云盘、OneDrive、Google Drive 和本地存储
- **低带宽播放** — 115 云盘、PikPak 云盘、123云盘、OneDrive 都支持302模式,在线播放视频时,不占用服务器带宽,播放体验不受服务器带宽影响;Google Drive 不支持302模式,走服务器中转,观看体验会受服务器带宽影响
- **封面 & 预览片段** — 自动为每个视频生成封面图和预览片段,首页快速选片
- **91 爬虫** — 内置爬虫,支持抓取 91 本月最热视频
- **脚本爬虫** — 不内置任何爬虫,支持在后台导入自定义爬虫脚本(上传 `.py` 文件 / 链接导入 / 服务器路径),按统一协议抓取视频
- **双主题** — 黑黄经典主题 / 粉白清新主题,随时切换
- **短视频模式** — 一键切换抖音风格,沉浸刷片
- **低资源占用** — 2C2G 服务器稳定运行,主要性能消耗就是封面图和预览视频的生成
+1 -1
View File
@@ -84,7 +84,7 @@ go run ./cmd/server 后端 9192
爬虫现在是独立后台栏目 `/admin/crawlers`,不再作为“网盘/存储类型”配置。脚本负责发现视频,后端负责去重、下载、入库、封面、预览视频和视频指纹。
脚本协议见 [docs/crawler-protocol.md](../docs/crawler-protocol.md)。后台支持上传 `.py` 文件或通过 HTTP(S) 脚本链接导入,导入后的脚本会保存到数据目录旁的 `crawler-scripts/`内置 91 爬虫也支持同一套 `crawler.v1` job 协议;后台“内置 91”会自动使用仓库里的 `91VideoSpider/spider_91porn.py`
脚本协议见 [docs/crawler-protocol.md](../docs/crawler-protocol.md)。后台支持上传 `.py` 文件或通过 HTTP(S) 脚本链接导入,导入后的脚本会保存到数据目录旁的 `crawler-scripts/`。脚本必须声明 `CRAWLER_NAME`,后台会自动读取它作为爬虫名称。项目不内置任何爬虫脚本,所有爬虫都由用户自行导入。
## 添加一个盘
+1 -31
View File
@@ -239,9 +239,6 @@ func main() {
SetSpider91UploadDriveID: func(id string) error {
return app.SetSpider91UploadDriveID(ctx, id)
},
DefaultSpider91ScriptPath: func() string {
return app.defaultSpider91ScriptPath()
},
OnRunNightlyJob: func() bool {
if app.nightlyRunner != nil {
return app.nightlyRunner.TriggerNow()
@@ -881,30 +878,6 @@ func (a *App) commonThumbsDir() string {
return filepath.Join(a.cfg.Storage.LocalPreviewDir, "thumbs")
}
// defaultSpider91ScriptPath guesses the location of the repository's
// spider_91porn.py crawler script and returns the absolute path of the first
// candidate that exists on disk.
//
// Candidates are probed in this order:
//  1. <grandparent of the local preview dir>/91VideoSpider/spider_91porn.py
//  2. ../91VideoSpider/spider_91porn.py (repository root when cwd is backend/)
//  3. 91VideoSpider/spider_91porn.py (cwd is already the repository root)
//
// It returns "" when no candidate exists; callers are then expected to ask the
// user for an explicit script_path (that handling is not visible here).
func (a *App) defaultSpider91ScriptPath() string {
	const (
		spiderDir  = "91VideoSpider"
		spiderFile = "spider_91porn.py"
	)
	previewGrandparent := filepath.Dir(filepath.Dir(a.cfg.Storage.LocalPreviewDir))
	// An empty base is dropped by filepath.Join, yielding the cwd-relative path.
	for _, base := range []string{previewGrandparent, "..", ""} {
		resolved, err := filepath.Abs(filepath.Join(base, spiderDir, spiderFile))
		if err != nil {
			// Cannot absolutize this candidate; try the next one.
			continue
		}
		if _, statErr := os.Stat(resolved); statErr != nil {
			continue
		}
		return resolved
	}
	return ""
}
// attachScriptCrawler 创建通用脚本爬虫 runner,并注册到 a.scriptCrawlers。
func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) {
pythonPath := strings.TrimSpace(d.Credentials["python_path"])
@@ -913,9 +886,6 @@ func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) {
}
scriptPath := strings.TrimSpace(d.Credentials["script_path"])
sourceKind := scriptCrawlerSourceKindForDrive(d)
if scriptPath == "" && sourceKind == spider91.Kind {
scriptPath = a.defaultSpider91ScriptPath()
}
proxyURL := strings.TrimSpace(d.Credentials["proxy"])
configJSON := strings.TrimSpace(d.Credentials["config_json"])
workDir := ""
@@ -2442,7 +2412,7 @@ func (a *App) listSpider91DriveIDs(ctx context.Context) []string {
}
out := make([]string, 0, len(all))
for _, d := range all {
if d != nil && d.Kind == scriptcrawler.Kind {
if d != nil && d.Kind == scriptcrawler.Kind && strings.TrimSpace(d.Credentials["script_path"]) != "" {
out = append(out, d.ID)
}
}
+2 -1
View File
@@ -609,7 +609,8 @@ func TestNightlyTargetsComeFromCatalogBeforeDriveAttach(t *testing.T) {
{ID: "115", Kind: "p115", Name: "115", RootID: "0", TeaserEnabled: true},
{ID: "pikpak", Kind: "pikpak", Name: "PikPak", RootID: "0", TeaserEnabled: true},
{ID: "91-legacy", Kind: "spider91", Name: "91 Legacy", RootID: "0", TeaserEnabled: true},
{ID: "91-crawler", Kind: scriptcrawler.Kind, Name: "91 Spider", RootID: "/", TeaserEnabled: true},
{ID: "91-crawler", Kind: scriptcrawler.Kind, Name: "91 Spider", RootID: "/", Credentials: map[string]string{"script_path": "/tmp/crawler.py"}, TeaserEnabled: true},
{ID: "91-crawler-deleted", Kind: scriptcrawler.Kind, Name: "Deleted Spider", RootID: "/", Credentials: map[string]string{}, TeaserEnabled: true},
} {
if err := cat.UpsertDrive(ctx, d); err != nil {
t.Fatalf("seed drive %s: %v", d.ID, err)
+205 -69
View File
@@ -68,9 +68,6 @@ type AdminServer struct {
// Spider91 → 115/123/PikPak/OneDrive 上传目标 drive ID 读写
GetSpider91UploadDriveID func() string
SetSpider91UploadDriveID func(driveID string) error
// DefaultSpider91ScriptPath returns the built-in Spider91 crawler script
// path for the independent crawler management UI.
DefaultSpider91ScriptPath func() string
// OnRunNightlyJob 触发一次完整的凌晨流水线(Phase1 扫盘 + Phase2 91 爬虫 +
// Phase3 迁移)。立即返回 —— 实际任务在后台跑,admin 在日志或下次状态查询里
// 看进度。若流水线正在跑或已排队,Runner 会拒绝重复触发。
@@ -163,6 +160,7 @@ func (a *AdminServer) Register(r chi.Router) {
r.Post("/crawlers", a.handleUpsertCrawler)
r.Post("/crawlers/import-file", a.handleImportCrawlerScriptFile)
r.Post("/crawlers/import-url", a.handleImportCrawlerScriptURL)
r.Post("/crawlers/test-script", a.handleTestCrawlerScript)
r.Delete("/crawlers/{id}", a.handleDeleteCrawler)
r.Post("/crawlers/{id}/run", a.handleRunCrawler)
r.Post("/crawlers/{id}/tasks/stop", a.handleStopCrawlerTasks)
@@ -441,11 +439,6 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) {
// LastCrawlAt 是 spider91 上次成功爬取的 unix 秒(来自 credentials.last_crawl_at)。
// 其它 kind 留 0;前端用它显示"上次抓取: N 小时前"。
Spider91Proxy string `json:"spider91Proxy,omitempty"`
ScriptCrawlerPythonPath string `json:"scriptCrawlerPythonPath,omitempty"`
ScriptCrawlerScriptPath string `json:"scriptCrawlerScriptPath,omitempty"`
ScriptCrawlerProxy string `json:"scriptCrawlerProxy,omitempty"`
ScriptCrawlerTargetNew string `json:"scriptCrawlerTargetNew,omitempty"`
ScriptCrawlerConfigJSON string `json:"scriptCrawlerConfigJson,omitempty"`
LastCrawlAt int64 `json:"lastCrawlAt,omitempty"`
GoogleDriveUseOnlineAPI *bool `json:"googleDriveUseOnlineAPI,omitempty"`
ScanGenerationStatus GenerationStatus `json:"scanGenerationStatus"`
@@ -513,11 +506,6 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) {
TeaserEnabled: d.TeaserEnabled,
SkipDirIDs: append([]string{}, d.SkipDirIDs...),
Spider91Proxy: spider91ProxyForDrive(d),
ScriptCrawlerPythonPath: scriptCrawlerCred(d, "python_path"),
ScriptCrawlerScriptPath: scriptCrawlerCred(d, "script_path"),
ScriptCrawlerProxy: scriptCrawlerCred(d, "proxy"),
ScriptCrawlerTargetNew: scriptCrawlerCred(d, "target_new"),
ScriptCrawlerConfigJSON: scriptCrawlerCred(d, "config_json"),
LastCrawlAt: lastCrawlAt,
GoogleDriveUseOnlineAPI: googleDriveUseOnlineAPIForDrive(d),
ScanGenerationStatus: generation.Scan,
@@ -637,14 +625,11 @@ type crawlerDTO struct {
ID string `json:"id"`
Name string `json:"name"`
Kind string `json:"kind"`
Builtin string `json:"builtin,omitempty"`
Status string `json:"status"`
LastError string `json:"lastError,omitempty"`
ScriptPath string `json:"scriptPath"`
PythonPath string `json:"pythonPath,omitempty"`
Proxy string `json:"proxy,omitempty"`
TargetNew string `json:"targetNew,omitempty"`
ConfigJSON string `json:"configJson,omitempty"`
LastCrawlAt int64 `json:"lastCrawlAt,omitempty"`
ScanGenerationStatus GenerationStatus `json:"scanGenerationStatus"`
ThumbnailGenerationStatus GenerationStatus `json:"thumbnailGenerationStatus"`
@@ -663,13 +648,9 @@ type crawlerDTO struct {
type upsertCrawlerReq struct {
ID string `json:"id"`
Name string `json:"name"`
Builtin string `json:"builtin"`
ScriptPath string `json:"scriptPath"`
PythonPath string `json:"pythonPath"`
Proxy string `json:"proxy"`
TargetNew string `json:"targetNew"`
ConfigJSON string `json:"configJson"`
}
func (a *AdminServer) handleListCrawlers(w http.ResponseWriter, r *http.Request) {
@@ -700,7 +681,7 @@ func (a *AdminServer) handleListCrawlers(w http.ResponseWriter, r *http.Request)
out := []crawlerDTO{}
for _, d := range all {
if d == nil || !isCrawlerDriveKind(d.Kind) {
if d == nil || !isConfiguredCrawlerDrive(d) {
continue
}
out = append(out, a.crawlerDTOForDrive(d, teaserCounts[d.ID], thumbnailCounts[d.ID], fingerprintCounts[d.ID], generationStatuses[d.ID]))
@@ -729,16 +710,13 @@ func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, teaser catalog.DriveT
}
return crawlerDTO{
ID: d.ID,
Name: d.Name,
Name: crawlerNameForDrive(d),
Kind: d.Kind,
Builtin: crawlerBuiltinForDrive(d),
Status: d.Status,
LastError: d.LastError,
ScriptPath: strings.TrimSpace(d.Credentials["script_path"]),
PythonPath: strings.TrimSpace(d.Credentials["python_path"]),
Proxy: strings.TrimSpace(d.Credentials["proxy"]),
TargetNew: strings.TrimSpace(d.Credentials["target_new"]),
ConfigJSON: strings.TrimSpace(d.Credentials["config_json"]),
LastCrawlAt: lastCrawlAt,
ScanGenerationStatus: generation.Scan,
ThumbnailGenerationStatus: generation.Thumbnail,
@@ -756,11 +734,16 @@ func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, teaser catalog.DriveT
}
}
func crawlerBuiltinForDrive(d *catalog.Drive) string {
func crawlerNameForDrive(d *catalog.Drive) string {
if d == nil {
return ""
}
return strings.TrimSpace(d.Credentials["builtin"])
if d.Credentials != nil {
if meta, err := scriptcrawler.ReadMetadata(strings.TrimSpace(d.Credentials["script_path"])); err == nil {
return meta.Name
}
}
return strings.TrimSpace(d.Name)
}
func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request) {
@@ -770,32 +753,21 @@ func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request
return
}
id := strings.TrimSpace(body.ID)
name := strings.TrimSpace(body.Name)
if id == "" || name == "" {
http.Error(w, "id and name are required", http.StatusBadRequest)
return
}
existing, _ := a.Catalog.GetDrive(r.Context(), id)
creds := map[string]string{}
var existing *catalog.Drive
if id != "" {
existing, _ = a.Catalog.GetDrive(r.Context(), id)
}
if existing != nil {
for k, v := range existing.Credentials {
creds[k] = v
}
}
builtin := strings.TrimSpace(body.Builtin)
if builtin != "" {
creds["builtin"] = builtin
}
scriptPath := strings.TrimSpace(body.ScriptPath)
if scriptPath == "" && builtin == "spider91" && a.DefaultSpider91ScriptPath != nil {
scriptPath = strings.TrimSpace(a.DefaultSpider91ScriptPath())
}
incoming := map[string]string{
"script_path": scriptPath,
"python_path": strings.TrimSpace(body.PythonPath),
"proxy": strings.TrimSpace(body.Proxy),
"target_new": strings.TrimSpace(body.TargetNew),
"config_json": strings.TrimSpace(body.ConfigJSON),
}
for k, v := range incoming {
creds[k] = v
@@ -805,8 +777,19 @@ func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
if builtin != "" {
merged["builtin"] = builtin
meta, err := scriptcrawler.ReadMetadata(merged["script_path"])
if err != nil {
http.Error(w, "脚本元信息无效:"+err.Error(), http.StatusBadRequest)
return
}
name := meta.Name
if id == "" {
generatedID, err := a.generateCrawlerID(r.Context(), name)
if err != nil {
writeErr(w, http.StatusInternalServerError, err)
return
}
id = generatedID
}
d := &catalog.Drive{
ID: id,
@@ -826,11 +809,55 @@ func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request
}
if a.OnDriveSaved != nil {
if err := a.OnDriveSaved(id); err != nil {
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "warning": err.Error()})
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "id": id, "warning": err.Error()})
return
}
}
writeJSON(w, http.StatusOK, map[string]any{"ok": true})
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "id": id})
}
// generateCrawlerID builds a drive ID for a new crawler from its display name.
//
// The ID has the form "crawler-<slug>" (or just "crawler" when the name yields
// an empty slug). If that ID is already taken, "-2", "-3", ... is appended
// until a free one is found. IDs held by crawler-kind drives that no longer
// have a script_path are NOT counted as taken, so such an ID can be handed out
// again.
//
// It returns an error only when listing the catalog's drives fails.
func (a *AdminServer) generateCrawlerID(ctx context.Context, name string) (string, error) {
	drives, err := a.Catalog.ListDrives(ctx)
	if err != nil {
		return "", err
	}

	taken := map[string]bool{}
	for _, drive := range drives {
		if drive == nil {
			continue
		}
		// A crawler record without a script path is treated as detached.
		detached := isCrawlerDriveKind(drive.Kind) &&
			strings.TrimSpace(drive.Credentials["script_path"]) == ""
		if !detached {
			taken[drive.ID] = true
		}
	}

	prefix := "crawler"
	if slug := crawlerIDSlug(name); slug != "" {
		prefix = prefix + "-" + slug
	}

	id := prefix
	for n := 2; taken[id]; n++ {
		id = fmt.Sprintf("%s-%d", prefix, n)
	}
	return id, nil
}
// crawlerIDSlug turns an arbitrary crawler name into an ID-safe slug.
//
// The input is lower-cased; ASCII letters and digits are kept, and every run
// of other characters between two kept characters collapses into a single
// "-". Leading and trailing separators are dropped, so a name without any
// ASCII letter or digit yields "".
func crawlerIDSlug(raw string) string {
	var out strings.Builder
	// pendingDash records that a separator run was seen after the last kept
	// character; the dash is emitted only once another kept character
	// follows, so the slug never starts or ends with "-".
	pendingDash := false
	for _, ch := range strings.ToLower(raw) {
		isAlnum := (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
		switch {
		case !isAlnum:
			if out.Len() > 0 {
				pendingDash = true
			}
		case pendingDash:
			out.WriteByte('-')
			pendingDash = false
			out.WriteRune(ch)
		default:
			out.WriteRune(ch)
		}
	}
	return out.String()
}
type importCrawlerScriptURLReq struct {
@@ -838,6 +865,36 @@ type importCrawlerScriptURLReq struct {
FileName string `json:"fileName"`
}
// testCrawlerScriptReq is the JSON request body decoded by
// handleTestCrawlerScript.
type testCrawlerScriptReq struct {
// ScriptPath is the path of the crawler script to dry-run; the handler
// rejects the request when it is blank.
ScriptPath string `json:"scriptPath"`
// Proxy is an optional proxy URL; the handler normalizes it before use.
Proxy string `json:"proxy"`
}
// handleTestCrawlerScript dry-runs a crawler script so the user can confirm
// it works before saving it. Nothing is written to the catalog by this
// handler.
//
// The request body is a testCrawlerScriptReq. A malformed body, a blank
// scriptPath, or a proxy rejected by normalizeCrawlerProxyURL yields 400.
// Otherwise the result of scriptcrawler.DryRun is returned as JSON with 200.
//
// NOTE(review): the dry run is expected to stop at the first discovered video
// after probing that its direct link is reachable; that logic lives in
// scriptcrawler.DryRun and is not visible here — confirm.
func (a *AdminServer) handleTestCrawlerScript(w http.ResponseWriter, r *http.Request) {
	var req testCrawlerScriptReq
	if decodeErr := json.NewDecoder(r.Body).Decode(&req); decodeErr != nil {
		writeErr(w, http.StatusBadRequest, decodeErr)
		return
	}

	script := strings.TrimSpace(req.ScriptPath)
	if script == "" {
		http.Error(w, "请先导入爬虫脚本", http.StatusBadRequest)
		return
	}

	proxy, proxyErr := normalizeCrawlerProxyURL(req.Proxy, "脚本爬虫")
	if proxyErr != nil {
		http.Error(w, proxyErr.Error(), http.StatusBadRequest)
		return
	}

	cfg := scriptcrawler.DryRunConfig{
		ScriptPath: script,
		ProxyURL:   proxy,
	}
	writeJSON(w, http.StatusOK, scriptcrawler.DryRun(r.Context(), cfg))
}
func (a *AdminServer) handleImportCrawlerScriptFile(w http.ResponseWriter, r *http.Request) {
r.Body = http.MaxBytesReader(w, r.Body, maxCrawlerScriptBytes+1024*1024)
if err := r.ParseMultipartForm(maxCrawlerScriptBytes + 1024*1024); err != nil {
@@ -860,7 +917,13 @@ func (a *AdminServer) handleImportCrawlerScriptFile(w http.ResponseWriter, r *ht
writeErr(w, http.StatusBadRequest, err)
return
}
writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath})
meta, err := scriptcrawler.ReadMetadata(scriptPath)
if err != nil {
_ = os.Remove(scriptPath)
writeErr(w, http.StatusBadRequest, fmt.Errorf("脚本元信息无效: %w", err))
return
}
writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath, "name": meta.Name})
}
func (a *AdminServer) handleImportCrawlerScriptURL(w http.ResponseWriter, r *http.Request) {
@@ -917,7 +980,13 @@ func (a *AdminServer) handleImportCrawlerScriptURL(w http.ResponseWriter, r *htt
writeErr(w, http.StatusBadRequest, err)
return
}
writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath})
meta, err := scriptcrawler.ReadMetadata(scriptPath)
if err != nil {
_ = os.Remove(scriptPath)
writeErr(w, http.StatusBadRequest, fmt.Errorf("脚本元信息无效: %w", err))
return
}
writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath, "name": meta.Name})
}
func (a *AdminServer) saveCrawlerScript(ctx context.Context, name string, r io.Reader, maxBytes int64) (string, error) {
@@ -935,7 +1004,7 @@ func (a *AdminServer) saveCrawlerScript(ctx context.Context, name string, r io.R
if err := os.MkdirAll(root, 0o755); err != nil {
return "", err
}
dst := filepath.Join(root, time.Now().UTC().Format("20060102T150405.000000000Z")+"-"+fileName)
dst := filepath.Join(root, fileName)
dstAbs, err := filepath.Abs(dst)
if err != nil {
return "", err
@@ -1015,6 +1084,11 @@ func safeCrawlerScriptFileName(raw string) (string, error) {
func (a *AdminServer) handleRunCrawler(w http.ResponseWriter, r *http.Request) {
id := chi.URLParam(r, "id")
d, err := a.Catalog.GetDrive(r.Context(), id)
if err != nil || d == nil || !isCrawlerDriveKind(d.Kind) || d.Credentials == nil || strings.TrimSpace(d.Credentials["script_path"]) == "" {
http.Error(w, "crawler not found", http.StatusNotFound)
return
}
status := a.nightlyJobStatus()
if status.Running || status.Queued {
writeJSON(w, http.StatusAccepted, map[string]any{
@@ -1041,13 +1115,86 @@ func (a *AdminServer) handleStopCrawlerTasks(w http.ResponseWriter, r *http.Requ
}
func (a *AdminServer) handleDeleteCrawler(w http.ResponseWriter, r *http.Request) {
a.handleDeleteDrive(w, r)
id := chi.URLParam(r, "id")
d, err := a.Catalog.GetDrive(r.Context(), id)
if err != nil {
writeErr(w, http.StatusNotFound, err)
return
}
if !isCrawlerDriveKind(d.Kind) {
http.Error(w, "crawler not found", http.StatusNotFound)
return
}
if a.OnStopDriveTasks != nil {
a.OnStopDriveTasks(id)
}
deletedScript, scriptErr := a.removeImportedCrawlerScript(d)
if d.Credentials == nil {
d.Credentials = map[string]string{}
}
delete(d.Credentials, "script_path")
delete(d.Credentials, "proxy")
delete(d.Credentials, "target_new")
delete(d.Credentials, "builtin")
delete(d.Credentials, "python_path")
delete(d.Credentials, "config_json")
d.Status = "disconnected"
d.LastError = ""
if err := a.Catalog.UpsertDrive(r.Context(), d); err != nil {
writeErr(w, http.StatusInternalServerError, err)
return
}
resp := map[string]any{
"ok": true,
"deletedVideos": 0,
"deletedScript": deletedScript,
}
if scriptErr != nil {
resp["warning"] = scriptErr.Error()
}
writeJSON(w, http.StatusOK, resp)
}
// isCrawlerDriveKind reports whether kind is the script-crawler drive kind
// (scriptcrawler.Kind).
func isCrawlerDriveKind(kind string) bool {
	switch kind {
	case scriptcrawler.Kind:
		return true
	default:
		return false
	}
}
// isConfiguredCrawlerDrive reports whether d is a script-crawler drive that
// still has a non-blank script_path credential. A nil drive, a drive of
// another kind, or a crawler drive without a script path all return false.
func isConfiguredCrawlerDrive(d *catalog.Drive) bool {
	if d == nil || !isCrawlerDriveKind(d.Kind) || d.Credentials == nil {
		return false
	}
	scriptPath := strings.TrimSpace(d.Credentials["script_path"])
	return scriptPath != ""
}
// removeImportedCrawlerScript deletes the script file referenced by d's
// script_path credential, but only when that file lies strictly inside the
// directory returned by crawlerScriptImportDir. Scripts located anywhere else
// are left untouched.
//
// It returns (true, nil) when a file was actually removed. It returns
// (false, nil) when there is nothing to remove: nil drive, no credentials,
// blank path, path outside the import directory, or file already missing.
// Any other failure is returned as (false, err).
func (a *AdminServer) removeImportedCrawlerScript(d *catalog.Drive) (bool, error) {
	if d == nil || d.Credentials == nil {
		return false, nil
	}
	rawPath := strings.TrimSpace(d.Credentials["script_path"])
	if rawPath == "" {
		return false, nil
	}

	target, err := filepath.Abs(rawPath)
	if err != nil {
		return false, err
	}
	importDir, err := a.crawlerScriptImportDir()
	if err != nil {
		return false, err
	}

	// The target must be a proper descendant of the import directory; the
	// directory itself never qualifies.
	insideImportDir := target != importDir &&
		strings.HasPrefix(target, importDir+string(os.PathSeparator))
	if !insideImportDir {
		return false, nil
	}

	switch rmErr := os.Remove(target); {
	case rmErr == nil:
		return true, nil
	case errors.Is(rmErr, os.ErrNotExist):
		// Already gone: not an error, but nothing was deleted either.
		return false, nil
	default:
		return false, rmErr
	}
}
func spider91ProxyForDrive(d *catalog.Drive) string {
if d == nil || d.Kind != "spider91" || d.Credentials == nil {
return ""
@@ -1055,13 +1202,6 @@ func spider91ProxyForDrive(d *catalog.Drive) string {
return strings.TrimSpace(d.Credentials["proxy"])
}
// scriptCrawlerCred returns the whitespace-trimmed credential stored under key
// for a script-crawler drive. It returns "" when d is nil, is not of kind
// scriptcrawler.Kind, or has no credentials map.
func scriptCrawlerCred(d *catalog.Drive, key string) string {
	if d != nil && d.Kind == scriptcrawler.Kind && d.Credentials != nil {
		return strings.TrimSpace(d.Credentials[key])
	}
	return ""
}
func googleDriveUseOnlineAPIForDrive(d *catalog.Drive) *bool {
if d == nil || d.Kind != "googledrive" {
return nil
@@ -1165,23 +1305,16 @@ func mergeScriptCrawlerCredentials(existing *catalog.Drive, incoming map[string]
return nil, fmt.Errorf("脚本爬虫 target_new 必须是正整数")
}
merged[key] = strconv.Itoa(n)
case "config_json":
case "script_path":
if value == "" {
delete(merged, key)
continue
}
if !json.Valid([]byte(value)) {
return nil, fmt.Errorf("脚本爬虫自定义配置必须是合法 JSON")
}
merged[key] = value
case "python_path", "script_path":
if value == "" {
if existing == nil || key == "script_path" {
if existing == nil {
delete(merged, key)
}
continue
}
merged[key] = value
case "builtin", "python_path", "config_json":
delete(merged, key)
default:
if value == "" {
delete(merged, key)
@@ -1190,9 +1323,12 @@ func mergeScriptCrawlerCredentials(existing *catalog.Drive, incoming map[string]
}
}
}
if strings.TrimSpace(merged["script_path"]) == "" && !strings.EqualFold(strings.TrimSpace(merged["builtin"]), "spider91") {
if strings.TrimSpace(merged["script_path"]) == "" {
return nil, fmt.Errorf("脚本爬虫必须填写 script_path")
}
delete(merged, "builtin")
delete(merged, "python_path")
delete(merged, "config_json")
return merged, nil
}
+341 -27
View File
@@ -5,10 +5,12 @@ import (
"context"
"database/sql"
"encoding/json"
"errors"
"mime/multipart"
"net/http"
"net/http/httptest"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
@@ -19,6 +21,7 @@ import (
"github.com/video-site/backend/internal/auth"
"github.com/video-site/backend/internal/catalog"
"github.com/video-site/backend/internal/drives/scriptcrawler"
)
func TestHandleLoginReturnsForbiddenForBannedIP(t *testing.T) {
@@ -843,7 +846,8 @@ func TestHandleDeleteDriveRequiresCleanupConfirmation(t *testing.T) {
func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
ctx := context.Background()
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
tmp := t.TempDir()
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
if err != nil {
t.Fatalf("open catalog: %v", err)
}
@@ -852,6 +856,10 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
t.Fatalf("close catalog: %v", err)
}
})
scriptPath := filepath.Join(tmp, "spider_91porn.py")
if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"91Porn\"\n"), 0o644); err != nil {
t.Fatalf("write crawler script: %v", err)
}
for _, d := range []*catalog.Drive{
{
@@ -862,7 +870,7 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
Credentials: map[string]string{
"last_crawl_at": "1800000000",
"proxy": " http://127.0.0.1:7890 ",
"script_path": "/opt/video-site-91/91VideoSpider/spider_91porn.py",
"script_path": scriptPath,
},
Status: "ok",
},
@@ -875,7 +883,7 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
"builtin": "spider91",
"last_crawl_at": "1800000000",
"proxy": " http://127.0.0.1:7890 ",
"script_path": "/opt/video-site-91/91VideoSpider/spider_91porn.py",
"script_path": scriptPath,
},
Status: "ok",
},
@@ -889,6 +897,14 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
},
Status: "ok",
},
{
ID: "crawler-script-deleted",
Kind: "scriptcrawler",
Name: "Deleted Script",
RootID: "/",
Credentials: map[string]string{},
Status: "disconnected",
},
} {
if err := cat.UpsertDrive(ctx, d); err != nil {
t.Fatalf("seed drive %s: %v", d.ID, err)
@@ -905,8 +921,8 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
var got []struct {
ID string `json:"id"`
Name string `json:"name"`
Kind string `json:"kind"`
Builtin string `json:"builtin"`
Proxy string `json:"proxy"`
LastCrawlAt int64 `json:"lastCrawlAt"`
}
@@ -914,24 +930,30 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
t.Fatalf("decode: %v", err)
}
byID := map[string]struct {
Name string
Kind string
Builtin string
Proxy string
LastCrawlAt int64
}{}
for _, d := range got {
byID[d.ID] = struct {
Name string
Kind string
Builtin string
Proxy string
LastCrawlAt int64
}{Kind: d.Kind, Builtin: d.Builtin, Proxy: d.Proxy, LastCrawlAt: d.LastCrawlAt}
}{Name: d.Name, Kind: d.Kind, Proxy: d.Proxy, LastCrawlAt: d.LastCrawlAt}
}
if _, ok := byID["spider91-main"]; ok {
t.Fatal("legacy spider91 drive should not be returned by crawler list")
}
if byID["crawler-spider91"].Kind != "scriptcrawler" || byID["crawler-spider91"].Builtin != "spider91" {
t.Fatalf("crawler kind/builtin = %q/%q, want scriptcrawler/spider91", byID["crawler-spider91"].Kind, byID["crawler-spider91"].Builtin)
if _, ok := byID["crawler-script-deleted"]; ok {
t.Fatal("crawler without script_path should not be returned by crawler list")
}
if byID["crawler-spider91"].Kind != "scriptcrawler" {
t.Fatalf("crawler kind = %q, want scriptcrawler", byID["crawler-spider91"].Kind)
}
if byID["crawler-spider91"].Name != "91Porn" {
t.Fatalf("crawler name = %q, want script metadata name", byID["crawler-spider91"].Name)
}
if byID["crawler-spider91"].Proxy != "http://127.0.0.1:7890" {
t.Fatalf("crawler proxy = %q, want trimmed proxy", byID["crawler-spider91"].Proxy)
@@ -967,9 +989,10 @@ func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
}
}
func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T) {
func TestHandleUpsertCrawlerRequiresScriptPath(t *testing.T) {
ctx := context.Background()
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
tmp := t.TempDir()
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
if err != nil {
t.Fatalf("open catalog: %v", err)
}
@@ -979,21 +1002,35 @@ func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T)
}
})
srv := &AdminServer{Catalog: cat}
scriptPath := filepath.Join(tmp, "custom.py")
if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"91 Spider\"\n"), 0o644); err != nil {
t.Fatalf("write crawler script: %v", err)
}
// 不再内置任何爬虫:没有脚本路径的保存请求必须被拒绝,
// 旧的 builtin 字段也不再有"免脚本"特权。
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", strings.NewReader(`{
"id": "spider91-main",
"name": "91 Spider",
"builtin": "spider91",
"scriptPath": "",
"pythonPath": "python3",
"targetNew": "15"
}`))
rr := httptest.NewRecorder()
(&AdminServer{
Catalog: cat,
DefaultSpider91ScriptPath: func() string {
return ""
},
}).handleUpsertCrawler(rr, req)
srv.handleUpsertCrawler(rr, req)
if rr.Code != http.StatusBadRequest {
t.Fatalf("status = %d, body = %s, want 400", rr.Code, rr.Body.String())
}
// 带脚本路径时正常保存,且请求中的 builtin 字段被忽略,不会写入凭证。
req = httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", strings.NewReader(`{
"id": "spider91-main",
"builtin": "spider91",
"scriptPath": "`+scriptPath+`",
"targetNew": "15"
}`))
rr = httptest.NewRecorder()
srv.handleUpsertCrawler(rr, req)
if rr.Code != http.StatusOK {
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
}
@@ -1002,23 +1039,85 @@ func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T)
if err != nil {
t.Fatalf("get crawler drive: %v", err)
}
if got.Kind != "scriptcrawler" || got.Credentials["builtin"] != "spider91" {
t.Fatalf("kind/builtin = %q/%q, want scriptcrawler/spider91", got.Kind, got.Credentials["builtin"])
if got.Kind != "scriptcrawler" || got.Credentials["builtin"] != "" {
t.Fatalf("kind/builtin = %q/%q, want scriptcrawler with no builtin credential", got.Kind, got.Credentials["builtin"])
}
if got.Credentials["script_path"] != "" {
t.Fatalf("script_path = %q, want empty when default is unavailable", got.Credentials["script_path"])
if got.Credentials["python_path"] != "" || got.Credentials["config_json"] != "" {
t.Fatalf("legacy hidden credentials should not be saved: %+v", got.Credentials)
}
if got.Name != "91 Spider" {
t.Fatalf("name = %q, want script metadata name", got.Name)
}
if got.Credentials["script_path"] != scriptPath {
t.Fatalf("script_path = %q, want %q", got.Credentials["script_path"], scriptPath)
}
}
// TestHandleUpsertCrawlerGeneratesIDFromScriptName checks that an upsert
// request without an "id" gets an ID derived from the script's CRAWLER_NAME,
// and that a numeric suffix is appended when the plain ID is already taken.
func TestHandleUpsertCrawlerGeneratesIDFromScriptName(t *testing.T) {
ctx := context.Background()
tmp := t.TempDir()
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
if err != nil {
t.Fatalf("open catalog: %v", err)
}
t.Cleanup(func() {
if err := cat.Close(); err != nil {
t.Fatalf("close catalog: %v", err)
}
})
// Seed a configured crawler that already owns "crawler-my-spider", the ID
// the new script's name would otherwise map to.
if err := cat.UpsertDrive(ctx, &catalog.Drive{
ID: "crawler-my-spider",
Kind: scriptcrawler.Kind,
Name: "Existing",
RootID: "/",
Credentials: map[string]string{"script_path": "/opt/crawlers/existing.py"},
}); err != nil {
t.Fatalf("seed crawler: %v", err)
}
// The script on disk only needs to declare CRAWLER_NAME for this test.
scriptPath := filepath.Join(tmp, "custom.py")
if err := os.WriteFile(scriptPath, []byte("CRAWLER_NAME = \"My Spider\"\n"), 0o644); err != nil {
t.Fatalf("write crawler script: %v", err)
}
// No "id" field in the body: the handler has to generate one.
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", strings.NewReader(`{
"scriptPath": "`+scriptPath+`",
"targetNew": "15"
}`))
rr := httptest.NewRecorder()
(&AdminServer{Catalog: cat}).handleUpsertCrawler(rr, req)
if rr.Code != http.StatusOK {
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
}
var resp struct {
OK bool `json:"ok"`
ID string `json:"id"`
}
if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil {
t.Fatalf("decode response: %v", err)
}
// "crawler-my-spider" is taken by the seeded drive, so "-2" is expected.
if !resp.OK || resp.ID != "crawler-my-spider-2" {
t.Fatalf("response = %+v, want generated suffix id", resp)
}
// The stored drive must carry the name read from the script metadata.
got, err := cat.GetDrive(ctx, resp.ID)
if err != nil {
t.Fatalf("get generated crawler: %v", err)
}
if got.Name != "My Spider" || got.Kind != scriptcrawler.Kind {
t.Fatalf("generated crawler = %+v", got)
}
}
func TestHandleImportCrawlerScriptFile(t *testing.T) {
tmp := t.TempDir()
script := "CRAWLER_NAME = \"Demo Crawler\"\nprint('crawler')\n"
var body bytes.Buffer
mw := multipart.NewWriter(&body)
part, err := mw.CreateFormFile("file", "../demo crawler.py")
if err != nil {
t.Fatalf("create form file: %v", err)
}
if _, err := part.Write([]byte("print('crawler')\n")); err != nil {
if _, err := part.Write([]byte(script)); err != nil {
t.Fatalf("write part: %v", err)
}
if err := mw.Close(); err != nil {
@@ -1034,6 +1133,7 @@ func TestHandleImportCrawlerScriptFile(t *testing.T) {
}
var got struct {
ScriptPath string `json:"scriptPath"`
Name string `json:"name"`
}
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
t.Fatalf("decode: %v", err)
@@ -1045,15 +1145,48 @@ func TestHandleImportCrawlerScriptFile(t *testing.T) {
if filepath.Ext(got.ScriptPath) != ".py" {
t.Fatalf("script path = %q, want .py", got.ScriptPath)
}
if filepath.Base(got.ScriptPath) != "demo_crawler.py" {
t.Fatalf("script filename = %q, want original sanitized filename", filepath.Base(got.ScriptPath))
}
data, err := os.ReadFile(got.ScriptPath)
if err != nil {
t.Fatalf("read imported script: %v", err)
}
if string(data) != "print('crawler')\n" {
if got.Name != "Demo Crawler" {
t.Fatalf("name = %q, want script metadata name", got.Name)
}
if string(data) != script {
t.Fatalf("script content = %q", string(data))
}
}
// TestHandleImportCrawlerScriptFileRejectsMissingName checks that uploading a
// .py script without a CRAWLER_NAME declaration is rejected with 400 and an
// error body that mentions CRAWLER_NAME.
func TestHandleImportCrawlerScriptFileRejectsMissingName(t *testing.T) {
tmp := t.TempDir()
var body bytes.Buffer
mw := multipart.NewWriter(&body)
part, err := mw.CreateFormFile("file", "crawler.py")
if err != nil {
t.Fatalf("create form file: %v", err)
}
// Script content deliberately omits the CRAWLER_NAME assignment.
if _, err := part.Write([]byte("print('crawler')\n")); err != nil {
t.Fatalf("write part: %v", err)
}
if err := mw.Close(); err != nil {
t.Fatalf("close multipart: %v", err)
}
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-file", &body)
req.Header.Set("Content-Type", mw.FormDataContentType())
rr := httptest.NewRecorder()
(&AdminServer{LocalPreviewDir: filepath.Join(tmp, "previews")}).handleImportCrawlerScriptFile(rr, req)
if rr.Code != http.StatusBadRequest {
t.Fatalf("status = %d, want 400; body = %s", rr.Code, rr.Body.String())
}
// The error message must point the user at the missing declaration.
if !strings.Contains(rr.Body.String(), "CRAWLER_NAME") {
t.Fatalf("body = %s, want CRAWLER_NAME error", rr.Body.String())
}
}
func TestHandleImportCrawlerScriptFileRejectsNonPython(t *testing.T) {
tmp := t.TempDir()
var body bytes.Buffer
@@ -1088,7 +1221,7 @@ func TestHandleImportCrawlerScriptURL(t *testing.T) {
http.NotFound(w, r)
return
}
_, _ = w.Write([]byte("# crawler from url\n"))
_, _ = w.Write([]byte("CRAWLER_NAME = \"URL Crawler\"\n# crawler from url\n"))
}))
defer upstream.Close()
@@ -1102,6 +1235,7 @@ func TestHandleImportCrawlerScriptURL(t *testing.T) {
}
var got struct {
ScriptPath string `json:"scriptPath"`
Name string `json:"name"`
}
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
t.Fatalf("decode: %v", err)
@@ -1114,11 +1248,116 @@ func TestHandleImportCrawlerScriptURL(t *testing.T) {
if err != nil {
t.Fatalf("read imported script: %v", err)
}
if string(data) != "# crawler from url\n" {
if got.Name != "URL Crawler" {
t.Fatalf("name = %q, want script metadata name", got.Name)
}
if filepath.Base(got.ScriptPath) != "crawler.py" {
t.Fatalf("script filename = %q, want original filename", filepath.Base(got.ScriptPath))
}
if string(data) != "CRAWLER_NAME = \"URL Crawler\"\n# crawler from url\n" {
t.Fatalf("script content = %q", string(data))
}
}
// TestHandleDeleteCrawlerRemovesImportedScript seeds a script crawler with one
// imported video, deletes the crawler through the HTTP handler, and verifies
// that the script file is removed, running tasks are stopped, the crawler
// credentials are cleared, and the drive record and its video are kept.
func TestHandleDeleteCrawlerRemovesImportedScript(t *testing.T) {
	const crawlerID = "crawler-delete-me"

	ctx := context.Background()
	root := t.TempDir()

	store, err := catalog.Open(filepath.Join(root, "catalog.db"))
	if err != nil {
		t.Fatalf("open catalog: %v", err)
	}
	t.Cleanup(func() {
		if closeErr := store.Close(); closeErr != nil {
			t.Fatalf("close catalog: %v", closeErr)
		}
	})

	// Place a script on disk where the delete handler is expected to find it.
	scriptsDir := filepath.Join(root, "crawler-scripts")
	if err := os.MkdirAll(scriptsDir, 0o755); err != nil {
		t.Fatalf("mkdir script dir: %v", err)
	}
	scriptFile := filepath.Join(scriptsDir, "crawler.py")
	if err := os.WriteFile(scriptFile, []byte("CRAWLER_NAME = \"Delete Me\"\n"), 0o644); err != nil {
		t.Fatalf("write script: %v", err)
	}

	seededDrive := &catalog.Drive{
		ID:     crawlerID,
		Kind:   scriptcrawler.Kind,
		Name:   "Delete Me",
		RootID: "/",
		Credentials: map[string]string{
			"script_path": scriptFile,
			"proxy":       "http://127.0.0.1:7890",
			"target_new":  "10",
		},
	}
	if err := store.UpsertDrive(ctx, seededDrive); err != nil {
		t.Fatalf("seed crawler: %v", err)
	}

	seededAt := time.Now()
	seededVideo := &catalog.Video{
		ID:          "video-from-crawler",
		DriveID:     crawlerID,
		FileID:      "video.mp4",
		Title:       "Keep Me",
		PublishedAt: seededAt,
		CreatedAt:   seededAt,
		UpdatedAt:   seededAt,
	}
	if err := store.UpsertVideo(ctx, seededVideo); err != nil {
		t.Fatalf("seed video: %v", err)
	}

	// Route the request the way chi would, so the handler can read the "id" parameter.
	routeParams := chi.NewRouteContext()
	routeParams.URLParams.Add("id", crawlerID)
	request := httptest.NewRequest(http.MethodDelete, "/admin/api/crawlers/"+crawlerID, nil)
	request = request.WithContext(context.WithValue(request.Context(), chi.RouteCtxKey, routeParams))
	recorder := httptest.NewRecorder()

	stopHookHit := false
	server := &AdminServer{
		Catalog:         store,
		LocalPreviewDir: filepath.Join(root, "previews"),
		OnDriveDeleteCleanup: func(context.Context, string) (int, error) {
			t.Fatal("crawler delete must not delete imported videos")
			return 0, nil
		},
		OnStopDriveTasks: func(driveID string) bool {
			stopHookHit = driveID == crawlerID
			return true
		},
	}
	server.handleDeleteCrawler(recorder, request)

	if recorder.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %s", recorder.Code, recorder.Body.String())
	}
	if _, statErr := os.Stat(scriptFile); !errors.Is(statErr, os.ErrNotExist) {
		t.Fatalf("script stat error = %v, want not exist", statErr)
	}
	if !stopHookHit {
		t.Fatal("stop hook was not called")
	}

	drive, err := store.GetDrive(ctx, crawlerID)
	if err != nil {
		t.Fatalf("crawler drive should remain for existing videos: %v", err)
	}
	for _, key := range []string{"script_path", "proxy", "target_new"} {
		if drive.Credentials[key] != "" {
			t.Fatalf("crawler credentials were not cleared: %+v", drive.Credentials)
		}
	}
	if _, err := store.GetVideo(ctx, "video-from-crawler"); err != nil {
		t.Fatalf("imported video should remain: %v", err)
	}

	var got struct {
		OK            bool `json:"ok"`
		DeletedVideos int  `json:"deletedVideos"`
		DeletedScript bool `json:"deletedScript"`
	}
	if err := json.NewDecoder(recorder.Body).Decode(&got); err != nil {
		t.Fatalf("decode: %v", err)
	}
	if asExpected := got.OK && got.DeletedVideos == 0 && got.DeletedScript; !asExpected {
		t.Fatalf("response = %#v", got)
	}
}
func TestHandleImportCrawlerScriptURLRejectsNonPython(t *testing.T) {
tmp := t.TempDir()
upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -1143,6 +1382,81 @@ func TestHandleImportCrawlerScriptURLRejectsNonPython(t *testing.T) {
}
}
// TestHandleTestCrawlerScriptRunsImportedScript runs the dry-run handler against
// a real Python script that prints one item pointing at a local media server,
// then checks the returned item fields and the media probe result.
// The test is skipped when python3 is not installed.
func TestHandleTestCrawlerScriptRunsImportedScript(t *testing.T) {
	if _, lookErr := exec.LookPath("python3"); lookErr != nil {
		t.Skip("python3 is required for crawler script dry-run")
	}

	// The media server answers the one-byte range probe with 206 and a total size.
	mediaServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.URL.Path != "/video.mp4":
			http.NotFound(w, r)
		case r.Header.Get("Range") == "bytes=0-0":
			w.Header().Set("Content-Type", "video/mp4")
			w.Header().Set("Content-Range", "bytes 0-0/2048")
			w.WriteHeader(http.StatusPartialContent)
			_, _ = w.Write([]byte{0})
		default:
			w.Header().Set("Content-Type", "video/mp4")
			_, _ = w.Write([]byte("video"))
		}
	}))
	defer mediaServer.Close()

	videoURL := mediaServer.URL + "/video.mp4"
	scriptFile := filepath.Join(t.TempDir(), "crawler.py")
	scriptSource := `import json
print(json.dumps({"title": "Dry Run Video", "source_id": "dry-1", "media_url": "` + videoURL + `", "thumbnail_url": "` + mediaServer.URL + `/thumb.jpg", "detail_url": "` + mediaServer.URL + `/detail"}))
`
	if err := os.WriteFile(scriptFile, []byte(scriptSource), 0o755); err != nil {
		t.Fatalf("write script: %v", err)
	}

	payload, err := json.Marshal(map[string]string{"scriptPath": scriptFile})
	if err != nil {
		t.Fatalf("marshal request: %v", err)
	}
	request := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/test-script", bytes.NewReader(payload))
	recorder := httptest.NewRecorder()
	server := &AdminServer{}
	server.handleTestCrawlerScript(recorder, request)

	if recorder.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %s", recorder.Code, recorder.Body.String())
	}

	var got struct {
		OK    bool `json:"ok"`
		Items []struct {
			Title    string `json:"title"`
			SourceID string `json:"sourceId"`
			MediaURL string `json:"mediaUrl"`
		} `json:"items"`
		MediaCheck *struct {
			OK            bool   `json:"ok"`
			Status        int    `json:"status"`
			ContentType   string `json:"contentType"`
			ContentLength int64  `json:"contentLengthBytes"`
		} `json:"mediaCheck"`
	}
	if err := json.NewDecoder(recorder.Body).Decode(&got); err != nil {
		t.Fatalf("decode: %v", err)
	}
	if !got.OK {
		t.Fatalf("ok = false, body = %s", recorder.Body.String())
	}

	if len(got.Items) != 1 {
		t.Fatalf("items = %#v", got.Items)
	}
	first := got.Items[0]
	if first.Title != "Dry Run Video" || first.SourceID != "dry-1" {
		t.Fatalf("items = %#v", got.Items)
	}
	if first.MediaURL != videoURL {
		t.Fatalf("mediaUrl = %q", first.MediaURL)
	}

	probe := got.MediaCheck
	if probe == nil || !probe.OK || probe.Status != http.StatusPartialContent {
		t.Fatalf("mediaCheck = %#v", got.MediaCheck)
	}
	if probe.ContentLength != 2048 {
		t.Fatalf("contentLength = %d, want 2048", probe.ContentLength)
	}
}
func TestHandleListDrivesIncludesGoogleDriveOnlineAPIMode(t *testing.T) {
ctx := context.Background()
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
@@ -273,12 +273,16 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
}
emit(CrawlProgress{})
if err := os.MkdirAll(c.cfg.Driver.CrawlDir(), 0o755); err != nil {
crawlDir, err := filepath.Abs(c.cfg.Driver.CrawlDir())
if err != nil {
return result, fmt.Errorf("scriptcrawler: resolve crawl dir: %w", err)
}
if err := os.MkdirAll(crawlDir, 0o755); err != nil {
return result, err
}
runID := time.Now().UTC().Format("20060102T150405Z")
seenPath := filepath.Join(c.cfg.Driver.CrawlDir(), "seen-"+runID+".txt")
jobPath := filepath.Join(c.cfg.Driver.CrawlDir(), "job-"+runID+".json")
seenPath := filepath.Join(crawlDir, "seen-"+runID+".txt")
jobPath := filepath.Join(crawlDir, "job-"+runID+".json")
result.SeenFile = seenPath
result.JobFile = jobPath
@@ -412,6 +416,10 @@ func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath strin
}
cfg = json.RawMessage(raw)
}
outputDir, err := filepath.Abs(c.cfg.Driver.OutputDir())
if err != nil {
return fmt.Errorf("resolve output dir: %w", err)
}
job := Job{
Protocol: "crawler.v1",
Mode: "crawl",
@@ -419,7 +427,7 @@ func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath strin
CrawlerID: c.cfg.Driver.ID(),
TargetNew: targetNew,
SeenSourceIDsFile: seenPath,
OutputDir: c.cfg.Driver.OutputDir(),
OutputDir: outputDir,
Config: cfg,
Network: JobNetwork{ProxyURL: strings.TrimSpace(c.cfg.ProxyURL)},
}
@@ -135,6 +135,58 @@ func TestCrawlerRunOnceUsesSourceKindNamespace(t *testing.T) {
}
}
// TestCrawlerRunOncePassesAbsoluteJobPathsWhenWorkDirDiffers configures the
// driver with a relative root directory while the script runs in a different
// working directory, and checks that the job, seen-file and output paths handed
// to the script are absolute. The helper process performs the in-script
// assertion when GO_WANT_SCRIPTCRAWLER_ASSERT_ABS is set.
func TestCrawlerRunOncePassesAbsoluteJobPathsWhenWorkDirDiffers(t *testing.T) {
	ctx := context.Background()
	root := t.TempDir()
	t.Chdir(root)

	store, err := catalog.Open(filepath.Join(root, "catalog.db"))
	if err != nil {
		t.Fatalf("open catalog: %v", err)
	}
	t.Cleanup(func() {
		if closeErr := store.Close(); closeErr != nil {
			t.Fatalf("close catalog: %v", closeErr)
		}
	})

	// RootDir is deliberately relative; it only resolves against the chdir'd root.
	driver := New(Config{ID: "demo", RootDir: filepath.Join("data", "crawler")})
	if err := driver.Init(ctx); err != nil {
		t.Fatalf("driver init: %v", err)
	}

	workDir := filepath.Join(root, "scripts")
	if err := os.MkdirAll(workDir, 0o755); err != nil {
		t.Fatalf("mkdir script dir: %v", err)
	}

	writeExecutable := func(path, content, what string) {
		t.Helper()
		if writeErr := os.WriteFile(path, []byte(content), 0o755); writeErr != nil {
			t.Fatalf("write %s: %v", what, writeErr)
		}
	}
	placeholderScript := filepath.Join(workDir, "helper-script")
	writeExecutable(placeholderScript, "helper", "dummy script")

	// The "interpreter" is a shell wrapper that re-executes this test binary as the helper process.
	interpreter := filepath.Join(root, "helper-wrapper.sh")
	wrapperSource := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0])
	writeExecutable(interpreter, wrapperSource, "helper wrapper")

	for _, key := range []string{"GO_WANT_SCRIPTCRAWLER_HELPER", "GO_WANT_SCRIPTCRAWLER_ASSERT_ABS"} {
		t.Setenv(key, "1")
	}

	crawler := NewCrawler(CrawlerConfig{
		Driver:     driver,
		Catalog:    store,
		PythonPath: interpreter,
		ScriptPath: placeholderScript,
		WorkDir:    workDir,
	})
	outcome, err := crawler.RunOnce(ctx, 1)
	if err != nil {
		t.Fatalf("run once: %v", err)
	}
	if outcome.NewVideos != 1 || outcome.Skipped != 0 || outcome.Failed != 0 {
		t.Fatalf("result = new:%d skipped:%d failed:%d, want 1/0/0", outcome.NewVideos, outcome.Skipped, outcome.Failed)
	}
	if bothAbs := filepath.IsAbs(outcome.JobFile) && filepath.IsAbs(outcome.SeenFile); !bothAbs {
		t.Fatalf("result paths should be absolute: job=%q seen=%q", outcome.JobFile, outcome.SeenFile)
	}
}
func TestCrawlerRunOnceImportsSimpleMediaURLWithoutSourceID(t *testing.T) {
ctx := context.Background()
tmp := t.TempDir()
@@ -241,6 +293,12 @@ func TestScriptCrawlerHelperProcess(t *testing.T) {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
}
if os.Getenv("GO_WANT_SCRIPTCRAWLER_ASSERT_ABS") == "1" {
if !filepath.IsAbs(jobPath) || !filepath.IsAbs(job.SeenSourceIDsFile) || !filepath.IsAbs(job.OutputDir) {
fmt.Fprintf(os.Stderr, "expected absolute paths, got job=%q seen=%q output=%q\n", jobPath, job.SeenSourceIDsFile, job.OutputDir)
os.Exit(2)
}
}
if os.Getenv("GO_WANT_SCRIPTCRAWLER_SIMPLE") == "1" {
event := map[string]any{
"title": "Simple Protocol Video",
@@ -0,0 +1,375 @@
package scriptcrawler
import (
"bufio"
"context"
"encoding/json"
"fmt"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"syscall"
"time"
)
// DryRun test-runs a crawler script without storing anything in the catalog:
// it generates a job.json inside a temporary directory, starts the script
// process, stops it as soon as the first item event (or the first MaxItems
// item events) has been received, and then probes the video URL with a small
// request to verify that the script can actually reach a video.
// It backs the "test script" button shown in the admin UI after a script import.
const (
// defaultDryRunTimeout bounds the script run when DryRunConfig.Timeout is not positive.
defaultDryRunTimeout = 2 * time.Minute
// dryRunLogTailLines is the maximum number of trailing stderr lines kept for the result log.
dryRunLogTailLines = 60
// dryRunMediaProbeLimit bounds the media URL probe request.
dryRunMediaProbeLimit = 20 * time.Second
)
// DryRunConfig holds the inputs for a single DryRun call.
type DryRunConfig struct {
// PythonPath is the interpreter used to start the script; DryRun uses "python3" when it is blank.
PythonPath string
// ScriptPath is the crawler script to run; DryRun reports an error when it is blank or does not exist.
ScriptPath string
// ProxyURL, when non-blank, is written into the job file, exported to the script
// through the HTTP(S)_PROXY environment variables, and passed to the media probe.
ProxyURL string
// ConfigJSON is optional custom configuration forwarded in the job file;
// it must be valid JSON and is replaced by "{}" when blank.
ConfigJSON string
// MaxItems is the number of item events to collect before the script is stopped; defaults to 1.
MaxItems int
// Timeout is the hard limit for running the script; defaults to 2 minutes.
// The media probe is bounded separately by dryRunMediaProbeLimit.
Timeout time.Duration
// SkipMediaProbe skips the media URL reachability probe (used for injection in unit tests).
SkipMediaProbe bool
// HTTPClient, when set, is used for the media probe instead of a client built from ProxyURL.
HTTPClient *http.Client
}
// DryRunItem is the trimmed summary of one item event emitted by the script
// during a dry run, as returned to the caller in DryRunResult.Items.
type DryRunItem struct {
// Title is the item title after normalization.
Title string `json:"title"`
// SourceID is the source identifier exactly as the script emitted it (trimmed).
SourceID string `json:"sourceId,omitempty"`
// MediaURL is the direct media URL; it is the target of the media probe.
MediaURL string `json:"mediaUrl,omitempty"`
// MediaLocalFile is set when the script reports a locally downloaded media file instead of a URL.
MediaLocalFile string `json:"mediaLocalFile,omitempty"`
// ThumbnailURL is the thumbnail URL reported by the script.
ThumbnailURL string `json:"thumbnailUrl,omitempty"`
// DetailURL is the detail page URL; the media probe sends it as the Referer header when present.
DetailURL string `json:"detailUrl,omitempty"`
}
// DryRunMediaCheck describes the outcome of probing the first item's media URL.
type DryRunMediaCheck struct {
// OK is true when the probe received HTTP 200 or 206.
OK bool `json:"ok"`
// Status is the HTTP status code of the probe response, when one was received.
Status int `json:"status,omitempty"`
// ContentType is the Content-Type header of the probe response.
ContentType string `json:"contentType,omitempty"`
// ContentLength is the total size taken from the Content-Range header, or from
// the response length of a plain 200 response when no total was parsed.
ContentLength int64 `json:"contentLengthBytes,omitempty"`
// Error is the human-readable failure reason; empty on success.
Error string `json:"error,omitempty"`
}
// DryRunResult is the complete outcome of a DryRun call.
type DryRunResult struct {
// OK is true when at least one item was collected and the media check
// passed, was skipped, or was not needed because the item uses a local file.
OK bool `json:"ok"`
// Items lists the collected items; it is an empty (non-nil) slice when nothing was collected.
Items []DryRunItem `json:"items"`
// MediaCheck is the media probe result; nil when no probe was performed.
MediaCheck *DryRunMediaCheck `json:"mediaCheck,omitempty"`
// Error is the human-readable failure reason when no item could be collected.
Error string `json:"error,omitempty"`
// Log holds the last non-empty stderr lines of the script (at most dryRunLogTailLines).
Log []string `json:"log,omitempty"`
// DurationMs is the wall-clock duration of the whole DryRun call in milliseconds.
DurationMs int64 `json:"durationMs"`
}
// DryRun test-runs the crawler script described by cfg without importing
// anything. It writes a job file into a temporary directory, starts the script
// in its own process group with the script's directory as working directory,
// collects item events from stdout until cfg.MaxItems items have arrived (or the
// script exits or the timeout fires), kills the script, and finally probes the
// first item's media URL unless the probe is skipped or the item uses a local
// file.
//
// The returned result is never nil. Failures are reported through
// DryRunResult.Error (and DryRunResult.MediaCheck for probe failures), never
// through a panic or a Go error value. DurationMs is always filled in.
func DryRun(ctx context.Context, cfg DryRunConfig) *DryRunResult {
	started := time.Now()
	result := &DryRunResult{Items: []DryRunItem{}}
	defer func() { result.DurationMs = time.Since(started).Milliseconds() }()

	scriptPath := strings.TrimSpace(cfg.ScriptPath)
	if scriptPath == "" {
		result.Error = "脚本路径为空,请先导入脚本"
		return result
	}
	if _, err := os.Stat(scriptPath); err != nil {
		result.Error = fmt.Sprintf("脚本不存在: %v", err)
		return result
	}
	// The child runs with cmd.Dir set to the script's directory, so a relative
	// script path would be resolved against that directory and point at the
	// wrong file. Resolve it against the current working directory first.
	absScriptPath, err := filepath.Abs(scriptPath)
	if err != nil {
		result.Error = fmt.Sprintf("解析脚本路径失败: %v", err)
		return result
	}
	scriptPath = absScriptPath

	pythonPath := strings.TrimSpace(cfg.PythonPath)
	if pythonPath == "" {
		pythonPath = "python3"
	}
	maxItems := cfg.MaxItems
	if maxItems <= 0 {
		maxItems = 1
	}
	timeout := cfg.Timeout
	if timeout <= 0 {
		timeout = defaultDryRunTimeout
	}

	// Everything the script may write lives in a throwaway directory.
	tmpDir, err := os.MkdirTemp("", "crawler-dryrun-")
	if err != nil {
		result.Error = fmt.Sprintf("创建临时目录失败: %v", err)
		return result
	}
	defer os.RemoveAll(tmpDir)
	outputDir := filepath.Join(tmpDir, "output")
	if err := os.MkdirAll(outputDir, 0o755); err != nil {
		result.Error = fmt.Sprintf("创建输出目录失败: %v", err)
		return result
	}
	// An empty seen file: nothing counts as already crawled during a dry run.
	seenPath := filepath.Join(tmpDir, "seen.txt")
	if err := os.WriteFile(seenPath, nil, 0o644); err != nil {
		result.Error = fmt.Sprintf("写入 seen 文件失败: %v", err)
		return result
	}

	configJSON := json.RawMessage([]byte("{}"))
	if raw := strings.TrimSpace(cfg.ConfigJSON); raw != "" {
		if !json.Valid([]byte(raw)) {
			result.Error = "自定义配置必须是合法 JSON"
			return result
		}
		configJSON = json.RawMessage(raw)
	}

	job := Job{
		Protocol:          "crawler.v1",
		Mode:              "crawl",
		RunID:             "dryrun-" + started.UTC().Format("20060102T150405Z"),
		CrawlerID:         "dryrun",
		TargetNew:         maxItems,
		SeenSourceIDsFile: seenPath,
		OutputDir:         outputDir,
		Config:            configJSON,
		Network:           JobNetwork{ProxyURL: strings.TrimSpace(cfg.ProxyURL)},
	}
	jobPath := filepath.Join(tmpDir, "job.json")
	jobData, err := json.MarshalIndent(job, "", "  ")
	if err != nil {
		result.Error = fmt.Sprintf("生成 job 文件失败: %v", err)
		return result
	}
	if err := os.WriteFile(jobPath, jobData, 0o600); err != nil {
		result.Error = fmt.Sprintf("写入 job 文件失败: %v", err)
		return result
	}

	runCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	cmd := exec.CommandContext(runCtx, pythonPath, scriptPath, "--job", jobPath)
	cmd.Dir = filepath.Dir(scriptPath)
	// Run the script in its own process group so the whole group can be killed.
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	cmd.Cancel = func() error {
		return killDryRunProcess(cmd)
	}
	// After a timeout or an early kill, child processes spawned by the script
	// may still hold the stdout/stderr pipes; WaitDelay forces the pipes closed
	// after a grace period so the readers cannot block forever.
	cmd.WaitDelay = 3 * time.Second
	if proxyURL := strings.TrimSpace(cfg.ProxyURL); proxyURL != "" {
		cmd.Env = append(os.Environ(),
			"HTTP_PROXY="+proxyURL,
			"HTTPS_PROXY="+proxyURL,
			"http_proxy="+proxyURL,
			"https_proxy="+proxyURL,
			"NO_PROXY=",
			"no_proxy=",
		)
	}

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		result.Error = fmt.Sprintf("启动脚本失败: %v", err)
		return result
	}
	stderr, err := cmd.StderrPipe()
	if err != nil {
		_ = stdout.Close()
		result.Error = fmt.Sprintf("启动脚本失败: %v", err)
		return result
	}
	if err := cmd.Start(); err != nil {
		_ = stdout.Close()
		_ = stderr.Close()
		result.Error = fmt.Sprintf("启动脚本失败: %v", err)
		return result
	}

	// stderr carries the script's log output; keep only the last few lines so
	// they can be echoed back for troubleshooting.
	var logMu sync.Mutex
	logTail := make([]string, 0, dryRunLogTailLines)
	stderrDone := make(chan struct{})
	go func() {
		defer close(stderrDone)
		scanner := bufio.NewScanner(stderr)
		scanner.Buffer(make([]byte, 64*1024), 1024*1024)
		for scanner.Scan() {
			line := strings.TrimSpace(scanner.Text())
			if line == "" {
				continue
			}
			logMu.Lock()
			if len(logTail) >= dryRunLogTailLines {
				logTail = logTail[1:]
			}
			logTail = append(logTail, line)
			logMu.Unlock()
		}
	}()

	// stdout carries crawler.v1 JSON Lines events; only item events matter here.
	items := []DryRunItem{}
	var firstMediaHeaders map[string]string
	parseFailures := 0
	scanner := bufio.NewScanner(stdout)
	scanner.Buffer(make([]byte, 64*1024), 4*1024*1024)
	for scanner.Scan() {
		if runCtx.Err() != nil {
			break
		}
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		var event Event
		if err := json.Unmarshal([]byte(line), &event); err != nil {
			parseFailures++
			continue
		}
		eventType := strings.ToLower(strings.TrimSpace(event.Type))
		item := event.normalizedItem()
		// An event without a type that still carries item data is treated as an item.
		if eventType == "" && item.hasPayload() {
			eventType = "item"
		}
		if eventType != "item" {
			continue
		}
		normalized, _, err := normalizeItemForImport(item)
		if err != nil {
			// Remember the problem, but keep reading: a later item may be valid.
			result.Error = fmt.Sprintf("item 字段不完整: %v", err)
			continue
		}
		mediaURL := strings.TrimSpace(normalized.Media.URL)
		if len(items) == 0 {
			firstMediaHeaders = normalized.Media.Headers
		}
		items = append(items, DryRunItem{
			Title:          strings.TrimSpace(normalized.Title),
			SourceID:       strings.TrimSpace(item.SourceID),
			MediaURL:       mediaURL,
			MediaLocalFile: strings.TrimSpace(normalized.Media.LocalFile),
			ThumbnailURL:   strings.TrimSpace(normalized.Thumbnail.URL),
			DetailURL:      strings.TrimSpace(normalized.DetailURL),
		})
		if len(items) >= maxItems {
			break
		}
	}
	// A read failure (for example a single line larger than the scanner buffer)
	// ends the loop just like EOF does; keep the error so it can be reported.
	scanErr := scanner.Err()

	// Enough has been collected: stop the script so it does not keep paging.
	_ = killDryRunProcess(cmd)
	_ = cmd.Wait()
	<-stderrDone

	logMu.Lock()
	result.Log = append([]string{}, logTail...)
	logMu.Unlock()
	result.Items = items

	if len(items) == 0 {
		// An error recorded for a malformed item takes precedence over the generic reasons.
		if result.Error == "" {
			switch {
			case runCtx.Err() != nil && ctx.Err() == nil:
				result.Error = fmt.Sprintf("测试超时(%s),脚本没有输出任何视频", timeout)
			case scanErr != nil:
				result.Error = fmt.Sprintf("读取脚本输出失败: %v", scanErr)
			case parseFailures > 0:
				result.Error = "脚本 stdout 不是合法的 crawler.v1 JSON Lines(日志应输出到 stderr)"
			default:
				result.Error = "脚本退出但没有输出任何视频"
			}
		}
		return result
	}
	// At least one valid item arrived, so earlier per-item errors no longer matter.
	result.Error = ""

	first := items[0]
	switch {
	case cfg.SkipMediaProbe:
		result.OK = true
	case first.MediaLocalFile != "":
		// The script downloads media into output_dir itself. The dry run uses a
		// temporary directory that is removed when DryRun returns, so emitting a
		// valid local_file is treated as success.
		result.OK = true
	default:
		check := probeMediaURL(ctx, cfg, first, firstMediaHeaders)
		result.MediaCheck = check
		result.OK = check.OK
	}
	return result
}
// killDryRunProcess sends SIGKILL to the process group led by cmd's process.
// It returns nil when cmd was never started or when the group no longer
// exists; if signalling the group fails for any other reason it falls back to
// killing just the process itself and returns that result.
func killDryRunProcess(cmd *exec.Cmd) error {
	if cmd == nil || cmd.Process == nil {
		return nil
	}
	// A negative pid addresses the whole process group.
	switch groupErr := syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL); groupErr {
	case nil, syscall.ESRCH:
		return nil
	default:
		return cmd.Process.Kill()
	}
}
// probeMediaURL sends a small "Range: bytes=0-0" GET request to the item's
// media URL to verify that it is reachable, including any anti-hotlinking
// headers supplied by the script and the configured proxy.
//
// ctx bounds the request together with dryRunMediaProbeLimit. cfg.HTTPClient is
// used when set; otherwise a temporary client is built and cfg.ProxyURL is
// applied to it via configureExplicitProxy. item.DetailURL, when present, is
// sent as Referer; entries in mediaHeaders are applied last and therefore
// override the default headers.
//
// The returned check is never nil. OK is true only for HTTP 200 or 206;
// ContentLength is filled from Content-Range (or from the length of a plain 200
// response) only when a positive size is known.
func probeMediaURL(ctx context.Context, cfg DryRunConfig, item DryRunItem, mediaHeaders map[string]string) *DryRunMediaCheck {
	check := &DryRunMediaCheck{}
	if item.MediaURL == "" {
		check.Error = "item 没有视频直链"
		return check
	}

	client := cfg.HTTPClient
	if client == nil {
		transport := &http.Transport{
			Proxy:                 http.ProxyFromEnvironment,
			ResponseHeaderTimeout: dryRunMediaProbeLimit,
		}
		// The transport is only used for this one probe; release any pooled
		// connections on return so they do not outlive the call.
		defer transport.CloseIdleConnections()
		if err := configureExplicitProxy(transport, cfg.ProxyURL); err != nil {
			check.Error = fmt.Sprintf("代理配置无效: %v", err)
			return check
		}
		client = &http.Client{Transport: transport}
	}

	probeCtx, cancel := context.WithTimeout(ctx, dryRunMediaProbeLimit)
	defer cancel()
	req, err := http.NewRequestWithContext(probeCtx, http.MethodGet, item.MediaURL, nil)
	if err != nil {
		check.Error = fmt.Sprintf("视频直链无效: %v", err)
		return check
	}
	req.Header.Set("User-Agent", defaultUserAgent)
	req.Header.Set("Range", "bytes=0-0")
	if item.DetailURL != "" {
		req.Header.Set("Referer", item.DetailURL)
	}
	for k, v := range mediaHeaders {
		k = strings.TrimSpace(k)
		if k == "" {
			continue
		}
		req.Header.Set(k, v)
	}

	resp, err := client.Do(req)
	if err != nil {
		check.Error = fmt.Sprintf("视频直链请求失败: %v", err)
		return check
	}
	defer resp.Body.Close()

	check.Status = resp.StatusCode
	check.ContentType = resp.Header.Get("Content-Type")
	if cr := resp.Header.Get("Content-Range"); cr != "" {
		// Content-Range: bytes 0-0/12345 -> take the total size after the slash.
		if idx := strings.LastIndex(cr, "/"); idx >= 0 {
			var total int64
			if _, err := fmt.Sscanf(cr[idx+1:], "%d", &total); err == nil && total > 0 {
				check.ContentLength = total
			}
		}
	}
	// A server that ignores the Range header answers 200 with the full length.
	// resp.ContentLength is -1 when the length is unknown; do not report that.
	if check.ContentLength == 0 && resp.StatusCode == http.StatusOK && resp.ContentLength > 0 {
		check.ContentLength = resp.ContentLength
	}
	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusPartialContent {
		check.Error = fmt.Sprintf("视频直链返回 HTTP %d", resp.StatusCode)
		return check
	}
	check.OK = true
	return check
}
@@ -0,0 +1,153 @@
package scriptcrawler
import (
"context"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
)
// writeDryRunScript writes body, prefixed with a "#!/bin/sh" line, to an
// executable file named crawler.sh inside a fresh per-test temporary directory
// and returns the file's path. The test fails immediately if the write fails.
func writeDryRunScript(t *testing.T, body string) string {
	t.Helper()
	scriptFile := filepath.Join(t.TempDir(), "crawler.sh")
	contents := []byte("#!/bin/sh\n" + body)
	if writeErr := os.WriteFile(scriptFile, contents, 0o755); writeErr != nil {
		t.Fatalf("write script: %v", writeErr)
	}
	return scriptFile
}
// TestDryRunCollectsFirstItem runs a shell script that logs to stderr and
// prints one wrapped item event, and checks that the item fields and the
// stderr log line are both reported.
func TestDryRunCollectsFirstItem(t *testing.T) {
	scriptPath := writeDryRunScript(t, `
echo '[log] fetching list page' >&2
echo '{"type":"item","item":{"title":"Test Video","media_url":"https://cdn.example.test/v.mp4","source_id":"123","thumbnail_url":"https://cdn.example.test/t.jpg"}}'
echo '{"type":"done","stats":{"emitted":1}}'
`)

	cfg := DryRunConfig{
		PythonPath:     "/bin/sh",
		ScriptPath:     scriptPath,
		SkipMediaProbe: true,
	}
	res := DryRun(context.Background(), cfg)

	if !res.OK {
		t.Fatalf("ok = false, error = %q, log = %v", res.Error, res.Log)
	}
	if count := len(res.Items); count != 1 {
		t.Fatalf("items = %d, want 1", count)
	}
	first := res.Items[0]
	asExpected := first.Title == "Test Video" &&
		first.MediaURL == "https://cdn.example.test/v.mp4" &&
		first.SourceID == "123"
	if !asExpected {
		t.Fatalf("item = %+v", first)
	}
	if len(res.Log) == 0 || !strings.Contains(res.Log[0], "fetching list page") {
		t.Fatalf("log tail = %v, want stderr captured", res.Log)
	}
}
func TestDryRunProbesMediaURL(t *testing.T) {
var gotRange, gotReferer string
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
gotRange = r.Header.Get("Range")
gotReferer = r.Header.Get("Referer")
w.Header().Set("Content-Type", "video/mp4")
w.Header().Set("Content-Range", "bytes 0-0/4096")
w.WriteHeader(http.StatusPartialContent)
_, _ = w.Write([]byte("x"))
}))
t.Cleanup(srv.Close)
script := writeDryRunScript(t, fmt.Sprintf(
`echo '{"type":"item","title":"Probe Video","media_url":"%s/v.mp4","detail_url":"https://example.test/view"}'`,
srv.URL,
))
result := DryRun(context.Background(), DryRunConfig{
PythonPath: "/bin/sh",
ScriptPath: script,
})
if !result.OK {
t.Fatalf("ok = false, error = %q, mediaCheck = %+v", result.Error, result.MediaCheck)
}
if result.MediaCheck == nil || !result.MediaCheck.OK {
t.Fatalf("mediaCheck = %+v, want ok", result.MediaCheck)
}
if result.MediaCheck.Status != http.StatusPartialContent || result.MediaCheck.ContentLength != 4096 {
t.Fatalf("mediaCheck = %+v, want 206 with total 4096", result.MediaCheck)
}
if gotRange != "bytes=0-0" || gotReferer != "https://example.test/view" {
t.Fatalf("probe headers range=%q referer=%q", gotRange, gotReferer)
}
}
func TestDryRunReportsBrokenMediaURL(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "forbidden", http.StatusForbidden)
}))
t.Cleanup(srv.Close)
script := writeDryRunScript(t, fmt.Sprintf(
`echo '{"type":"item","title":"Dead Link","media_url":"%s/v.mp4"}'`,
srv.URL,
))
result := DryRun(context.Background(), DryRunConfig{
PythonPath: "/bin/sh",
ScriptPath: script,
})
if result.OK {
t.Fatal("ok = true, want false for HTTP 403 media url")
}
if result.MediaCheck == nil || result.MediaCheck.OK || result.MediaCheck.Status != http.StatusForbidden {
t.Fatalf("mediaCheck = %+v, want failed 403", result.MediaCheck)
}
if len(result.Items) != 1 {
t.Fatalf("items = %d, want item still returned for debugging", len(result.Items))
}
}
func TestDryRunRejectsNonJSONStdout(t *testing.T) {
script := writeDryRunScript(t, `echo 'plain text progress output'`)
result := DryRun(context.Background(), DryRunConfig{
PythonPath: "/bin/sh",
ScriptPath: script,
SkipMediaProbe: true,
})
if result.OK {
t.Fatal("ok = true, want false for non-JSON stdout")
}
if !strings.Contains(result.Error, "JSON Lines") {
t.Fatalf("error = %q, want JSON Lines hint", result.Error)
}
}
func TestDryRunTimesOut(t *testing.T) {
script := writeDryRunScript(t, `sleep 30`)
start := time.Now()
result := DryRun(context.Background(), DryRunConfig{
PythonPath: "/bin/sh",
ScriptPath: script,
Timeout: 2 * time.Second,
SkipMediaProbe: true,
})
if result.OK {
t.Fatal("ok = true, want false on timeout")
}
if !strings.Contains(result.Error, "超时") {
t.Fatalf("error = %q, want timeout message", result.Error)
}
if elapsed := time.Since(start); elapsed > 10*time.Second {
t.Fatalf("dry run took %s, script was not killed", elapsed)
}
}
// TestDryRunMissingScript checks that a script path that does not exist yields
// a failed result carrying an error message.
func TestDryRunMissingScript(t *testing.T) {
	missing := filepath.Join(t.TempDir(), "missing.py")
	res := DryRun(context.Background(), DryRunConfig{PythonPath: "/bin/sh", ScriptPath: missing})
	if failed := !res.OK && res.Error != ""; !failed {
		t.Fatalf("result = %+v, want error for missing script", res)
	}
}
@@ -0,0 +1,117 @@
package scriptcrawler
import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"
)
// maxCrawlerNameRunes is the maximum length, counted in runes, that
// ExtractMetadata accepts for a CRAWLER_NAME value.
const maxCrawlerNameRunes = 80
// Metadata is the static information read from a crawler script's source text.
type Metadata struct {
// Name is the trimmed, non-empty value of the script's CRAWLER_NAME assignment.
Name string `json:"name"`
}
// ReadMetadata loads the crawler script at scriptPath and extracts its
// metadata with ExtractMetadata. It returns an error when the path is blank,
// when the file extension is not ".py", when the file cannot be read, or when
// ExtractMetadata rejects the source.
func ReadMetadata(scriptPath string) (Metadata, error) {
	path := strings.TrimSpace(scriptPath)
	switch {
	case path == "":
		return Metadata{}, errors.New("脚本路径为空")
	case filepath.Ext(path) != ".py":
		return Metadata{}, errors.New("目前只支持 .py 爬虫脚本")
	}

	content, readErr := os.ReadFile(path)
	if readErr != nil {
		return Metadata{}, fmt.Errorf("读取脚本失败: %w", readErr)
	}
	return ExtractMetadata(string(content))
}
// ExtractMetadata scans Python source text line by line for the first
// assignment to CRAWLER_NAME and returns its value as the crawler name.
//
// Both the plain form (CRAWLER_NAME = "x") and the annotated form
// (CRAWLER_NAME: str = "x") are recognized, and a leading UTF-8 byte order mark
// is ignored. The source is not executed: the right-hand side must be a string
// literal that parsePythonStringLiteral can read.
//
// It returns an error when no CRAWLER_NAME assignment is found, when the value
// is not a string literal, when the trimmed name is empty, or when it is longer
// than maxCrawlerNameRunes runes.
func ExtractMetadata(source string) (Metadata, error) {
	// Some editors prepend a byte order mark, which TrimSpace does not remove;
	// without this a CRAWLER_NAME on the very first line would be missed.
	source = strings.TrimPrefix(source, "\uFEFF")

	for _, line := range strings.Split(source, "\n") {
		trimmed := strings.TrimSpace(line)
		// Blank lines and comments never start with the identifier, so this
		// single prefix check skips them as well.
		if !strings.HasPrefix(trimmed, "CRAWLER_NAME") {
			continue
		}
		target, value, ok := strings.Cut(trimmed, "=")
		if !ok {
			continue
		}
		// Drop an optional type annotation ("CRAWLER_NAME: str") before
		// comparing, and reject longer identifiers such as CRAWLER_NAME_SUFFIX.
		ident, _, _ := strings.Cut(target, ":")
		if strings.TrimSpace(ident) != "CRAWLER_NAME" {
			continue
		}

		name, ok := parsePythonStringLiteral(value)
		if !ok {
			return Metadata{}, errors.New(`CRAWLER_NAME 必须是字符串字面量,例如 CRAWLER_NAME = "示例爬虫"`)
		}
		name = strings.TrimSpace(name)
		if name == "" {
			return Metadata{}, errors.New("CRAWLER_NAME 不能为空")
		}
		if len([]rune(name)) > maxCrawlerNameRunes {
			return Metadata{}, fmt.Errorf("CRAWLER_NAME 不能超过 %d 个字符", maxCrawlerNameRunes)
		}
		return Metadata{Name: name}, nil
	}
	return Metadata{}, errors.New(`脚本必须声明 CRAWLER_NAME,例如 CRAWLER_NAME = "示例爬虫"`)
}
// parsePythonStringLiteral parses raw as one single-line Python string literal
// and returns its value. The second result is false when raw is not such a
// literal.
//
// Supported: the r/u/b prefixes (any case), single and double quotes,
// triple-quoted literals that open and close on the same line, and the Python
// escape sequences \\ \' \" \a \b \f \n \r \t \v, octal (\ooo), \xhh, \uxxxx and
// \Uxxxxxxxx. Unrecognized escapes keep their backslash, as Python does. In raw
// strings backslashes are kept verbatim.
//
// After the closing quote only whitespace, a "#" comment, or a ";" statement
// separator may follow; anything else (for example `"a" + x` or `"a" "b"`)
// means the value is not a plain literal and is rejected.
func parsePythonStringLiteral(raw string) (string, bool) {
	s := strings.TrimSpace(raw)

	// Consume string prefixes; only "r" changes how the body is interpreted.
	rawString := false
prefixes:
	for len(s) > 0 {
		switch s[0] {
		case 'r', 'R':
			rawString = true
		case 'u', 'U', 'b', 'B':
		default:
			break prefixes
		}
		s = strings.TrimSpace(s[1:])
	}
	if len(s) < 2 || (s[0] != '"' && s[0] != '\'') {
		return "", false
	}

	// The literal is closed by the same delimiter that opened it: one quote, or
	// three for a triple-quoted literal.
	closing := s[:1]
	if triple := strings.Repeat(closing, 3); strings.HasPrefix(s, triple) {
		closing = triple
	}
	body := s[len(closing):]

	var b strings.Builder
	for i := 0; i < len(body); {
		if strings.HasPrefix(body[i:], closing) {
			if !isPythonLiteralTail(body[i+len(closing):]) {
				return "", false
			}
			return b.String(), true
		}
		ch := body[i]
		if ch != '\\' {
			b.WriteByte(ch)
			i++
			continue
		}
		// A backslash must be followed by at least one more byte.
		if i+1 >= len(body) {
			return "", false
		}
		if rawString {
			b.WriteByte('\\')
			b.WriteByte(body[i+1])
			i += 2
			continue
		}
		consumed, ok := writePythonEscape(&b, body[i+1:])
		if !ok {
			return "", false
		}
		i += 1 + consumed
	}
	// Ran off the end of the line without finding the closing delimiter.
	return "", false
}

// isPythonLiteralTail reports whether tail, the text after a closing quote,
// contains nothing that would change the assigned value.
func isPythonLiteralTail(tail string) bool {
	tail = strings.TrimSpace(tail)
	return tail == "" || tail[0] == '#' || tail[0] == ';'
}

// writePythonEscape decodes the escape sequence whose text after the backslash
// starts at rest[0], appends the decoded value to b, and returns how many bytes
// of rest were consumed. It returns false for a malformed \x, \u or \U escape.
func writePythonEscape(b *strings.Builder, rest string) (int, bool) {
	switch c := rest[0]; c {
	case '\\', '\'', '"':
		b.WriteByte(c)
	case 'a':
		b.WriteByte('\a')
	case 'b':
		b.WriteByte('\b')
	case 'f':
		b.WriteByte('\f')
	case 'n':
		b.WriteByte('\n')
	case 'r':
		b.WriteByte('\r')
	case 't':
		b.WriteByte('\t')
	case 'v':
		b.WriteByte('\v')
	case 'x':
		return writePythonHexEscape(b, rest, 2)
	case 'u':
		return writePythonHexEscape(b, rest, 4)
	case 'U':
		return writePythonHexEscape(b, rest, 8)
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// Octal escape: one to three octal digits.
		var value rune
		n := 0
		for n < 3 && n < len(rest) && rest[n] >= '0' && rest[n] <= '7' {
			value = value*8 + rune(rest[n]-'0')
			n++
		}
		b.WriteRune(value)
		return n, true
	default:
		// Python leaves unrecognized escapes in the string unchanged.
		b.WriteByte('\\')
		b.WriteByte(c)
	}
	return 1, true
}

// writePythonHexEscape decodes exactly digits hexadecimal digits following the
// escape letter at rest[0] and appends the resulting code point to b.
func writePythonHexEscape(b *strings.Builder, rest string, digits int) (int, bool) {
	if len(rest) < 1+digits {
		return 0, false
	}
	var value uint32
	for i := 1; i <= digits; i++ {
		d := rest[i]
		switch {
		case d >= '0' && d <= '9':
			d -= '0'
		case d >= 'a' && d <= 'f':
			d -= 'a' - 10
		case d >= 'A' && d <= 'F':
			d -= 'A' - 10
		default:
			return 0, false
		}
		value = value<<4 | uint32(d)
	}
	// 0x10FFFF is the largest valid Unicode code point.
	if value > 0x10FFFF {
		return 0, false
	}
	b.WriteRune(rune(value))
	return 1 + digits, true
}
@@ -0,0 +1,39 @@
package scriptcrawler
import (
"strings"
"testing"
)
// TestExtractMetadataReadsCrawlerName checks that the name is read from a
// CRAWLER_NAME assignment that follows a blank line and a comment line.
func TestExtractMetadataReadsCrawlerName(t *testing.T) {
	const source = "\n# comment\nCRAWLER_NAME = \"示例爬虫\"\n"

	got, err := ExtractMetadata(source)
	if err != nil {
		t.Fatalf("extract metadata: %v", err)
	}
	if want := "示例爬虫"; got.Name != want {
		t.Fatalf("name = %q", got.Name)
	}
}
// TestExtractMetadataRejectsMissingCrawlerName checks that a script without any
// CRAWLER_NAME assignment is rejected with an error that names CRAWLER_NAME.
func TestExtractMetadataRejectsMissingCrawlerName(t *testing.T) {
	_, extractErr := ExtractMetadata(`print("hello")`)
	switch {
	case extractErr == nil:
		t.Fatal("expected error")
	case !strings.Contains(extractErr.Error(), "CRAWLER_NAME"):
		t.Fatalf("error = %v, want CRAWLER_NAME guidance", extractErr)
	}
}
// TestExtractMetadataRejectsEmptyCrawlerName checks that a CRAWLER_NAME made up
// only of whitespace is rejected with the "must not be empty" error.
func TestExtractMetadataRejectsEmptyCrawlerName(t *testing.T) {
	_, extractErr := ExtractMetadata(`CRAWLER_NAME = " "`)
	switch {
	case extractErr == nil:
		t.Fatal("expected error")
	case !strings.Contains(extractErr.Error(), "不能为空"):
		t.Fatalf("error = %v, want empty-name error", extractErr)
	}
}
+4 -4
View File
@@ -134,9 +134,9 @@ apt_install() {
python3 python3-requests python3-bs4 python3-lxml python3-socks
}
verify_spider91_python_deps() {
command -v python3 >/dev/null 2>&1 || die "python3 is required for 91Spider"
python3 - <<'PY' || die "missing Python modules for 91Spider: requests, bs4, lxml, socks"
verify_crawler_python_deps() {
command -v python3 >/dev/null 2>&1 || die "python3 is required for crawler scripts"
python3 - <<'PY' || die "missing Python modules for crawler scripts: requests, bs4, lxml, socks"
import importlib.util
import sys
@@ -200,7 +200,7 @@ install_dependencies() {
install_go
command -v ffmpeg >/dev/null 2>&1 || die "ffmpeg is required"
command -v ffprobe >/dev/null 2>&1 || die "ffprobe is required"
verify_spider91_python_deps
verify_crawler_python_deps
}
ensure_ownership() {
+11 -3
View File
@@ -6,6 +6,14 @@ task status and cancellation.
## Invocation
Every script must declare a static crawler name near the top of the Python file.
The admin page reads this value when importing the script; users do not type the
crawler name manually.
```python
CRAWLER_NAME = "Example Crawler"
```
The backend runs:
```bash
@@ -35,12 +43,12 @@ python3 /path/to/crawler.py --job /path/to/job.json
## Importing Scripts
Crawler scripts are configured from the admin crawler page. A script can be
entered as an existing server path, uploaded as a local file, or imported from
an HTTP(S) URL.
uploaded as a local file or imported from an HTTP(S) URL.
Imported scripts are copied into `crawler-scripts/` next to the configured local
preview data directory. The import API currently accepts Python files only
(`.py`) and rejects empty files or files larger than 2 MiB.
(`.py`) and rejects empty files, files larger than 2 MiB, or scripts without
`CRAWLER_NAME`.
## Output
+2
View File
@@ -3,6 +3,8 @@ import argparse
import json
import sys
CRAWLER_NAME = "Demo Crawler"
def load_seen(path):
try:
+1 -1
View File
@@ -128,7 +128,7 @@ verify_runtime_deps() {
command -v "$cmd" >/dev/null 2>&1 || die "missing command: $cmd"
done
python3 - <<'PY' || die "missing Python modules for 91Spider: requests, bs4, lxml, socks"
python3 - <<'PY' || die "missing Python modules for crawler scripts: requests, bs4, lxml, socks"
import importlib.util
import sys
-2
View File
@@ -63,8 +63,6 @@ build_package() {
cp "$ROOT_DIR/backend/config.example.yaml" "$work/config.example.yaml"
cp "$ROOT_DIR/install.sh" "$work/install.sh"
cp -R "$ROOT_DIR/dist" "$work/dist"
mkdir -p "$work/91VideoSpider"
cp "$ROOT_DIR/91VideoSpider/spider_91porn.py" "$work/91VideoSpider/spider_91porn.py"
cat >"$work/README.txt" <<EOF
$APP_NAME $VERSION
+345 -96
View File
@@ -1,30 +1,41 @@
import { useEffect, useMemo, useState } from "react";
import { ArrowLeft, CircleStop, Download, Link as LinkIcon, Plus, Save, Trash2, Upload } from "lucide-react";
import { useEffect, useMemo, useState, type ReactNode } from "react";
import {
Activity,
ArrowLeft,
ChevronRight,
CircleStop,
Clock,
Download,
FileCode2,
Gauge,
Link as LinkIcon,
Plus,
RefreshCw,
Save,
Settings2,
TestTube,
Trash2,
Upload,
} from "lucide-react";
import * as api from "./api";
import { useToast } from "./ToastContext";
import { driveKindAbbr, generationStateClass, generationStateLabel } from "./drive/constants";
import { generationStateClass, generationStateLabel } from "./drive/constants";
import { SpiderIcon } from "./icons/SpiderIcon";
type CrawlerForm = {
id: string;
name: string;
builtin: string;
scriptPath: string;
pythonPath: string;
targetNew: string;
proxy: string;
configJson: string;
};
const emptyForm: CrawlerForm = {
id: "",
name: "",
builtin: "",
scriptPath: "",
pythonPath: "python3",
targetNew: "10",
proxy: "",
configJson: "",
};
export function CrawlersPage() {
@@ -37,6 +48,8 @@ export function CrawlersPage() {
const [stoppingId, setStoppingId] = useState("");
const [scriptURL, setScriptURL] = useState("");
const [importingScript, setImportingScript] = useState(false);
const [testingScript, setTestingScript] = useState(false);
const [testResult, setTestResult] = useState<api.CrawlerDryRunResult | null>(null);
const [mode, setMode] = useState<"list" | "detail">("list");
const { show } = useToast();
@@ -44,6 +57,15 @@ export function CrawlersPage() {
() => list.find((item) => item.id === selectedId) ?? null,
[list, selectedId]
);
const stats = useMemo(() => {
const running = list.filter((item) => item.scanGenerationStatus?.state === "scanning").length;
return {
total: list.length,
ready: list.filter((item) => item.status === "ok").length,
running,
error: list.filter((item) => item.status === "error").length,
};
}, [list]);
async function refresh() {
setLoading(true);
@@ -64,15 +86,13 @@ export function CrawlersPage() {
function selectCrawler(crawler: api.AdminCrawler) {
setSelectedId(crawler.id);
setMode("detail");
setTestResult(null);
setForm({
id: crawler.id,
name: crawler.name,
builtin: crawler.builtin ?? "",
scriptPath: crawler.scriptPath ?? "",
pythonPath: crawler.pythonPath || "python3",
targetNew: crawler.targetNew || (crawler.builtin === "spider91" || crawler.kind === "spider91" ? "15" : "10"),
targetNew: crawler.targetNew || "10",
proxy: crawler.proxy ?? "",
configJson: crawler.configJson ?? "",
});
}
@@ -80,20 +100,7 @@ export function CrawlersPage() {
setSelectedId("");
setForm(emptyForm);
setScriptURL("");
setMode("detail");
}
function createSpider91() {
setSelectedId("");
setForm({
...emptyForm,
id: "spider91",
name: "91 爬虫",
builtin: "spider91",
scriptPath: "",
targetNew: "15",
});
setScriptURL("");
setTestResult(null);
setMode("detail");
}
@@ -101,6 +108,7 @@ export function CrawlersPage() {
setSelectedId("");
setForm(emptyForm);
setScriptURL("");
setTestResult(null);
setMode("list");
}
@@ -110,33 +118,24 @@ export function CrawlersPage() {
async function save() {
const id = form.id.trim();
const name = form.name.trim();
if (!id || !name) {
show("请填写爬虫 ID 和名称", "error");
return;
}
if (!form.builtin && !form.scriptPath.trim()) {
if (!form.scriptPath.trim()) {
show("请先导入爬虫脚本", "error");
return;
}
setSaving(true);
try {
const resp = await api.upsertCrawler({
id,
name,
builtin: form.builtin,
id: id || undefined,
scriptPath: form.scriptPath.trim(),
pythonPath: form.pythonPath.trim(),
targetNew: form.targetNew.trim(),
proxy: form.proxy.trim(),
configJson: form.configJson.trim(),
});
if (resp.warning) {
show(`已保存,但初始化失败:${resp.warning}`, "error");
} else {
show("已保存", "success");
}
setSelectedId(id);
setSelectedId(resp.id || id);
await refresh();
setMode("list");
} catch (e) {
@@ -152,6 +151,8 @@ export function CrawlersPage() {
try {
const resp = await api.importCrawlerScriptFile(file);
set("scriptPath", resp.scriptPath);
set("name", resp.name);
setTestResult(null);
show("脚本已导入", "success");
} catch (e) {
show(e instanceof Error ? e.message : "导入失败", "error");
@@ -170,7 +171,9 @@ export function CrawlersPage() {
try {
const resp = await api.importCrawlerScriptURL(url);
set("scriptPath", resp.scriptPath);
set("name", resp.name);
setScriptURL("");
setTestResult(null);
show("脚本已导入", "success");
} catch (e) {
show(e instanceof Error ? e.message : "导入失败", "error");
@@ -179,6 +182,32 @@ export function CrawlersPage() {
}
}
async function testScript() {
const scriptPath = form.scriptPath.trim();
if (!scriptPath) {
show("请先导入爬虫脚本", "error");
return;
}
setTestingScript(true);
setTestResult(null);
try {
const result = await api.testCrawlerScript({
scriptPath,
proxy: form.proxy.trim(),
});
setTestResult(result);
if (result.ok) {
show("测试通过", "success");
} else {
show(crawlerTestFailure(result) || "测试失败", "error");
}
} catch (e) {
show(e instanceof Error ? e.message : "测试失败", "error");
} finally {
setTestingScript(false);
}
}
async function run(crawler: api.AdminCrawler) {
setRunningId(crawler.id);
try {
@@ -210,10 +239,16 @@ export function CrawlersPage() {
}
async function remove(crawler: api.AdminCrawler) {
if (!window.confirm(`删除爬虫 ${crawler.name} 并清理它导入的视频?`)) return;
if (!window.confirm(`删除爬虫 ${crawler.name} 的脚本和配置?已爬取的视频会保留。`)) return;
try {
const resp = await api.deleteCrawler(crawler.id);
show(`已删除,并清理 ${resp.deletedVideos ?? 0} 个视频`, "success");
if (resp.warning) {
show(`已删除爬虫配置,但脚本文件清理失败:${resp.warning}`, "error");
} else if (resp.deletedScript) {
show("已删除爬虫配置和脚本文件,已爬取视频保留", "success");
} else {
show("已删除爬虫配置,已爬取视频保留", "success");
}
setSelectedId("");
setForm(emptyForm);
setMode("list");
@@ -243,63 +278,74 @@ export function CrawlersPage() {
</header>
{mode === "list" ? (
<div className="admin-crawler-console">
<div className="admin-crawler-overview">
<CrawlerMetric label="已配置" value={stats.total} icon={<SpiderIcon size={16} />} />
<CrawlerMetric label="已就绪" value={stats.ready} icon={<Activity size={16} />} tone="ok" />
<CrawlerMetric label="抓取中" value={stats.running} icon={<RefreshCw size={16} />} tone="info" />
<CrawlerMetric label="错误" value={stats.error} icon={<CircleStop size={16} />} tone="error" />
</div>
<div className="admin-card admin-crawler-list">
<div className="admin-crawler-list__head">
<header className="admin-card__title">
<SpiderIcon size={16} />
</header>
{loading ? (
<div className="admin-loading">...</div>
) : list.length === 0 ? (
<div className="admin-empty"></div>
) : (
<div className="admin-drive-teasers">
{list.map((crawler) => (
<button
key={crawler.id}
type="button"
className={`admin-drive-teaser ${crawler.id === selectedId ? "is-active" : ""}`}
onClick={() => selectCrawler(crawler)}
>
<span className="admin-drive-teaser__name">
<span className="admin-drive-card__brand-icon" data-kind={crawler.builtin || crawler.kind}>
{crawler.builtin === "spider91" ? "91" : driveKindAbbr(crawler.kind)}
</span>
{crawler.name}
</span>
<span className={`admin-status is-${crawler.status === "ok" ? "ok" : crawler.status === "error" ? "error" : "pending"}`}>
{crawler.status === "ok" ? "已就绪" : crawler.status === "error" ? "错误" : "未连接"}
</span>
<button className="admin-btn" type="button" onClick={refresh} disabled={loading}>
<RefreshCw size={13} className={loading ? "admin-spin" : undefined} />
</button>
</div>
{loading ? (
<div className="admin-loading-state">
<RefreshCw size={18} className="admin-spin" />
<span>...</span>
</div>
) : list.length === 0 ? (
<div className="admin-crawler-empty">
<SpiderIcon size={28} />
<strong></strong>
<button className="admin-btn is-primary" type="button" onClick={createCustom}>
<Plus size={13} />
</button>
</div>
) : (
<div className="admin-crawler-table">
{list.map((crawler) => (
<CrawlerRow
key={crawler.id}
crawler={crawler}
active={crawler.id === selectedId}
running={runningId === crawler.id}
stopping={stoppingId === crawler.id}
onSelect={() => selectCrawler(crawler)}
onRun={() => run(crawler)}
onStop={() => stop(crawler)}
/>
))}
</div>
)}
</div>
</div>
) : (
<div className="admin-crawler-detail">
<div className="admin-card">
<header className="admin-card__title">
<SpiderIcon size={16} /> {selected ? "爬虫配置" : "添加爬虫"}
</header>
<div className="admin-crawler-editor">
<div className="admin-crawler-editor__main">
<div className="admin-crawler-section">
<div className="admin-crawler-section__head">
<span className="admin-crawler-section__icon"><Settings2 size={15} /></span>
<span className="admin-crawler-section__title"></span>
</div>
<div className="admin-crawler-script-name">
<span></span>
<strong>{form.name || "导入脚本后自动读取"}</strong>
</div>
</div>
<div className="admin-crawler-section">
<div className="admin-crawler-section__head">
<span className="admin-crawler-section__icon"><FileCode2 size={15} /></span>
<span className="admin-crawler-section__title"></span>
</div>
<div className="admin-form">
{!selected && (
<div className="admin-crawler-presets">
<button className={`admin-btn ${form.builtin === "" ? "is-primary" : ""}`} type="button" onClick={createCustom}>
<Plus size={13} />
</button>
<button className={`admin-btn ${form.builtin === "spider91" ? "is-primary" : ""}`} type="button" onClick={createSpider91}>
<SpiderIcon size={13} /> 91
</button>
</div>
)}
<div className="admin-form__row">
<label htmlFor="crawler-id"> ID *</label>
<input id="crawler-id" value={form.id} onChange={(e) => set("id", e.target.value)} disabled={!!selected} />
</div>
<div className="admin-form__row">
<label htmlFor="crawler-name"> *</label>
<input id="crawler-name" value={form.name} onChange={(e) => set("name", e.target.value)} />
</div>
{!form.builtin && (
<div className="admin-form__row">
<label htmlFor="crawler-script-url"></label>
<div className="admin-crawler-import">
@@ -327,19 +373,59 @@ export function CrawlersPage() {
<button className="admin-btn" type="button" onClick={importScriptURL} disabled={importingScript}>
<LinkIcon size={13} /> {importingScript ? "导入中..." : "链接导入"}
</button>
<button
className="admin-btn"
type="button"
onClick={testScript}
disabled={!form.scriptPath || importingScript || testingScript}
>
<TestTube size={13} /> {testingScript ? "测试中..." : "测试脚本"}
</button>
</div>
{form.scriptPath && <div className="admin-form__help"></div>}
{testResult && <CrawlerTestResult result={testResult} />}
</div>
)}
</div>
</div>
<div className="admin-crawler-section">
<div className="admin-crawler-section__head">
<span className="admin-crawler-section__icon"><Gauge size={15} /></span>
<span className="admin-crawler-section__title"></span>
</div>
<div className="admin-crawler-params">
<div className="admin-form__row">
<label htmlFor="crawler-target"></label>
<input id="crawler-target" value={form.targetNew} onChange={(e) => set("targetNew", e.target.value)} placeholder="10" />
</div>
<div className="admin-form__row">
<label htmlFor="crawler-proxy"></label>
<input id="crawler-proxy" value={form.proxy} onChange={(e) => set("proxy", e.target.value)} placeholder="http://127.0.0.1:7890" />
<input
id="crawler-proxy"
value={form.proxy}
onChange={(e) => {
set("proxy", e.target.value);
setTestResult(null);
}}
placeholder="http://127.0.0.1:7890"
/>
</div>
<div className="admin-detail-actions">
</div>
</div>
</div>
<aside className="admin-crawler-editor__side">
<div className="admin-crawler-action-panel">
<div className="admin-crawler-action-panel__head">
<span className="admin-crawler-action-panel__mark">
<SpiderIcon size={18} />
</span>
<div>
<strong>{selected ? "爬虫配置" : "添加爬虫"}</strong>
<span>{selected ? crawlerStatusLabel(selected) : "未保存"}</span>
</div>
</div>
<div className="admin-crawler-action-panel__buttons">
<button className="admin-btn is-primary" onClick={save} disabled={saving}>
<Save size={13} /> {saving ? "保存中..." : "保存"}
</button>
@@ -358,14 +444,14 @@ export function CrawlersPage() {
)}
</div>
</div>
</div>
{selected && (
<div className="admin-card admin-crawler-status">
<header className="admin-card__title">
<Download size={16} />
</header>
<div className="admin-gen-columns">
<div className="admin-crawler-side-panel">
<div className="admin-crawler-section__head">
<span className="admin-crawler-section__icon"><Activity size={15} /></span>
<span className="admin-crawler-section__title"></span>
</div>
<div className="admin-crawler-status-grid">
<CrawlerStatus label="抓取" status={selected.scanGenerationStatus} />
<CrawlerStatus label="封面" status={selected.thumbnailGenerationStatus} />
<CrawlerStatus label="预览视频" status={selected.previewGenerationStatus} />
@@ -374,12 +460,89 @@ export function CrawlersPage() {
{selected.lastError && <div className="admin-detail-error">{selected.lastError}</div>}
</div>
)}
</aside>
</div>
)}
</section>
);
}
/**
 * Overview tile for the crawler console: an icon, a label and a numeric value.
 *
 * `tone` appends an `is-<tone>` modifier class; when it is omitted only the
 * base class (followed by a single space) is emitted.
 */
function CrawlerMetric({
  label,
  value,
  icon,
  tone,
}: {
  label: string;
  value: number;
  icon: ReactNode;
  tone?: "ok" | "info" | "error";
}) {
  const toneModifier = tone ? `is-${tone}` : "";
  const tileClass = `admin-crawler-metric ${toneModifier}`;
  return (
    <div className={tileClass}>
      <span className="admin-crawler-metric__icon">{icon}</span>
      <span>{label}</span>
      <strong>{value}</strong>
    </div>
  );
}
/**
 * One row of the crawler list.
 *
 * The large main button (name, script-imported hint, status badge, chevron)
 * and the settings button in the action group both invoke `onSelect`.
 * The run button is disabled while `running` is true and the stop button
 * while `stopping` is true. `active` only adds the `is-active` modifier class.
 * The row also shows one state chip per generation status and a meta column
 * with the target count (falls back to "10") and the last crawl time.
 */
function CrawlerRow({
crawler,
active,
running,
stopping,
onSelect,
onRun,
onStop,
}: {
crawler: api.AdminCrawler;
active: boolean;
running: boolean;
stopping: boolean;
onSelect: () => void;
onRun: () => void;
onStop: () => void;
}) {
// Status badge class: "ok" and "error" map to themselves, anything else is shown as "pending".
return (
<div className={`admin-crawler-row ${active ? "is-active" : ""}`}>
<button type="button" className="admin-crawler-row__main" onClick={onSelect}>
<span className="admin-crawler-row__brand">
<SpiderIcon size={16} />
</span>
<span className="admin-crawler-row__title-wrap">
<strong>{crawler.name}</strong>
<span>{crawler.scriptPath ? "脚本已导入" : "未导入脚本"}</span>
</span>
<span className={`admin-status is-${crawler.status === "ok" ? "ok" : crawler.status === "error" ? "error" : "pending"}`}>
{crawlerStatusLabel(crawler)}
</span>
<ChevronRight size={16} className="admin-crawler-row__chevron" />
</button>
<div className="admin-crawler-row__states">
<CrawlerStateChip label="抓取" status={crawler.scanGenerationStatus} />
<CrawlerStateChip label="封面" status={crawler.thumbnailGenerationStatus} />
<CrawlerStateChip label="预览" status={crawler.previewGenerationStatus} />
<CrawlerStateChip label="指纹" status={crawler.fingerprintGenerationStatus} />
</div>
<div className="admin-crawler-row__meta">
<span><Gauge size={12} /> {crawler.targetNew || "10"} </span>
<span><Clock size={12} /> {formatLastCrawl(crawler.lastCrawlAt)}</span>
</div>
<div className="admin-crawler-row__actions">
<button className="admin-btn" type="button" onClick={onSelect}>
<Settings2 size={13} />
</button>
<button className="admin-btn" type="button" onClick={onRun} disabled={running}>
<Download size={13} /> {running ? "触发中..." : "立即抓取"}
</button>
<button className="admin-btn is-stop" type="button" onClick={onStop} disabled={stopping}>
<CircleStop size={13} /> {stopping ? "停止中..." : "停止"}
</button>
</div>
</div>
);
}
/**
 * Compact pill showing "<label> · <state text>" for one generation status.
 *
 * A missing status (or empty state) is treated as "idle". The crawl chip
 * (label "抓取") in the "scanning" state uses the dedicated text "抓取中";
 * every other combination uses `generationStateLabel`.
 */
function CrawlerStateChip({ label, status }: { label: string; status?: api.DriveGenerationStatus }) {
  const currentState = status?.state || "idle";
  const chipClass = `admin-crawler-state-chip is-${generationStateClass(currentState)}`;
  const isActiveCrawl = label === "抓取" && currentState === "scanning";
  const stateText = isActiveCrawl ? "抓取中" : generationStateLabel(currentState);
  return (
    <span className={chipClass}>
      {label} · {stateText}
    </span>
  );
}
function CrawlerStatus({ label, status }: { label: string; status?: api.DriveGenerationStatus }) {
const state = status?.state || "idle";
const labelText = label === "抓取" && state === "scanning" ? "抓取中" : generationStateLabel(state);
@@ -400,3 +563,89 @@ function CrawlerStatus({ label, status }: { label: string; status?: api.DriveGen
</div>
);
}
/**
 * Renders the outcome of a crawler script dry run: pass/fail badge, item
 * count, duration, the failure message (if any), the fields of the first
 * returned item, the media reachability check and the collapsible run log.
 *
 * `result.items` is read through a null-safe local: a failed run may arrive
 * without an item list (NOTE(review): a Go backend serialises a nil slice as
 * JSON `null` — confirm against the handler), and indexing it directly would
 * throw during render and take the whole page down.
 */
function CrawlerTestResult({ result }: { result: api.CrawlerDryRunResult }) {
  // Treat a missing/null item list as empty instead of crashing the render.
  const items = result.items ?? [];
  // Only the first item is previewed.
  const item = items[0];
  const failure = crawlerTestFailure(result);
  const media = result.mediaCheck;
  const statusText = result.ok ? "测试通过" : "测试失败";
  return (
    <div className={`admin-crawler-test-result ${result.ok ? "is-ok" : "is-error"}`}>
      <div className="admin-crawler-test-result__head">
        <span className={`admin-status is-${result.ok ? "ok" : "error"}`}>{statusText}</span>
        <span> {items.length} </span>
        {result.durationMs > 0 && <span>{Math.round(result.durationMs / 1000)} </span>}
      </div>
      {failure && <div className="admin-crawler-test-result__error">{failure}</div>}
      {item && (
        <div className="admin-crawler-test-result__grid">
          <CrawlerTestField label="视频名" value={item.title} />
          <CrawlerTestField label="唯一标识" value={item.sourceId} />
          <CrawlerTestField label="视频直链" value={item.mediaUrl || item.mediaLocalFile} />
          <CrawlerTestField label="封面图" value={item.thumbnailUrl} />
          <CrawlerTestField label="详情页" value={item.detailUrl} />
        </div>
      )}
      {media && (
        <div className="admin-crawler-test-result__media">
          <span></span>
          <strong>
            {media.ok ? "可访问" : "不可访问"}
            {media.status ? ` · HTTP ${media.status}` : ""}
            {media.contentType ? ` · ${media.contentType}` : ""}
            {media.contentLengthBytes ? ` · ${formatBytes(media.contentLengthBytes)}` : ""}
          </strong>
        </div>
      )}
      {result.log && result.log.length > 0 && (
        <details className="admin-crawler-test-result__log">
          <summary></summary>
          <pre>{result.log.join("\n")}</pre>
        </details>
      )}
    </div>
  );
}
/**
 * One label/value line inside the dry-run result grid.
 * Renders nothing when the value is `undefined` or the empty string.
 */
function CrawlerTestField({ label, value }: { label: string; value?: string | number }) {
  const isBlank = value === undefined || value === "";
  return isBlank ? null : (
    <div className="admin-crawler-test-result__field">
      <span>{label}</span>
      <strong>{value}</strong>
    </div>
  );
}
/**
 * Picks the message to show for a failed dry run: the run-level error first,
 * then the media check error, otherwise an empty string.
 */
function crawlerTestFailure(result: api.CrawlerDryRunResult) {
  if (result.error) {
    return result.error;
  }
  const mediaError = result.mediaCheck?.error;
  if (mediaError) {
    return mediaError;
  }
  return "";
}
/**
 * Maps a crawler's `status` to its display label: "ok" and "error" have
 * dedicated labels, every other value falls back to "未连接".
 */
function crawlerStatusLabel(crawler: api.AdminCrawler) {
  switch (crawler.status) {
    case "ok":
      return "已就绪";
    case "error":
      return "错误";
    default:
      return "未连接";
  }
}
/**
 * Formats the last crawl time for the list row.
 *
 * `ts` is multiplied by 1000 before building the Date, i.e. it is handled as
 * a timestamp in seconds. A missing or zero value yields "未抓取".
 */
function formatLastCrawl(ts?: number) {
  if (ts) {
    const shortDateTime: Intl.DateTimeFormatOptions = {
      month: "2-digit",
      day: "2-digit",
      hour: "2-digit",
      minute: "2-digit",
    };
    const crawledAt = new Date(ts * 1000);
    return crawledAt.toLocaleString("zh-CN", shortDateTime);
  }
  return "未抓取";
}
/**
 * Formats a byte count as a short human-readable size (B, KB, MB, GB).
 *
 * Returns an empty string for non-finite, zero or negative input. Values
 * below 1024 are printed as-is in bytes; larger values use one decimal place.
 * GB is the largest unit, so very large values stay in GB.
 */
function formatBytes(bytes: number) {
  if (!Number.isFinite(bytes) || bytes <= 0) return "";
  if (bytes < 1024) return `${bytes} B`;
  const units = ["KB", "MB", "GB"];
  let scaled = bytes / 1024;
  let unitIndex = 0;
  // Promote to the next unit based on the ROUNDED value: picking the unit
  // from the raw value made e.g. 1048575 bytes print as "1024.0 KB" because
  // toFixed(1) rounds up after the unit was already chosen.
  while (unitIndex < units.length - 1 && Number(scaled.toFixed(1)) >= 1024) {
    scaled /= 1024;
    unitIndex += 1;
  }
  return `${scaled.toFixed(1)} ${units[unitIndex]}`;
}
+37 -11
View File
@@ -195,14 +195,11 @@ export type AdminCrawler = {
id: string;
name: string;
kind: "scriptcrawler" | "spider91";
builtin?: string;
status: string;
lastError?: string;
scriptPath: string;
pythonPath?: string;
proxy?: string;
targetNew?: string;
configJson?: string;
lastCrawlAt?: number;
scanGenerationStatus?: DriveGenerationStatus;
thumbnailGenerationStatus?: DriveGenerationStatus;
@@ -220,18 +217,41 @@ export type AdminCrawler = {
};
export type UpsertCrawlerInput = {
id: string;
name: string;
builtin?: string;
id?: string;
scriptPath: string;
pythonPath?: string;
proxy?: string;
targetNew?: string;
configJson?: string;
};
/**
 * Response of a crawler script import (file upload or URL).
 * The crawler page copies both fields into its form after a successful import.
 */
export type ImportCrawlerScriptResult = {
// Path of the imported script; sent back as `scriptPath` when saving or testing.
scriptPath: string;
// Display name for the crawler (presumably read from the script's metadata — confirm against the backend).
name: string;
};
/**
 * One item returned by a crawler dry run. The admin page previews only the
 * first item, skipping fields that are undefined or empty.
 */
export type CrawlerDryRunItem = {
// Shown as the video name.
title: string;
// Shown as the item's unique identifier.
sourceId?: string;
// Shown as the direct video link; `mediaLocalFile` is displayed instead when this is empty.
mediaUrl?: string;
mediaLocalFile?: string;
// Shown as the cover image.
thumbnailUrl?: string;
// Shown as the detail page.
detailUrl?: string;
};
/**
 * Result of the media reachability check that accompanies a dry run.
 */
export type CrawlerDryRunMediaCheck = {
// Rendered as reachable / not reachable.
ok: boolean;
// Rendered as "HTTP <status>" when non-zero.
status?: number;
contentType?: string;
// Rendered as a human-readable size when non-zero.
contentLengthBytes?: number;
// Used as the failure message when the run itself reports no error.
error?: string;
};
/**
 * Response of the crawler script dry-run endpoint.
 */
export type CrawlerDryRunResult = {
// Overall verdict; drives the pass/fail badge and toast.
ok: boolean;
// Items produced by the run; the UI shows the count and previews the first one.
items: CrawlerDryRunItem[];
mediaCheck?: CrawlerDryRunMediaCheck;
// Run-level failure message; takes precedence over `mediaCheck.error` in the UI.
error?: string;
// Log lines; the UI joins them with newlines.
log?: string[];
// Run duration; the UI divides by 1000 and rounds to display whole seconds.
durationMs: number;
};
export function listCrawlers() {
@@ -239,7 +259,7 @@ export function listCrawlers() {
}
export function upsertCrawler(body: UpsertCrawlerInput) {
return request<{ ok: boolean; warning?: string }>("/crawlers", {
return request<{ ok: boolean; id: string; warning?: string }>("/crawlers", {
method: "POST",
body: JSON.stringify(body),
});
@@ -261,6 +281,13 @@ export function importCrawlerScriptURL(url: string) {
});
}
/**
 * Requests a dry run of an imported crawler script.
 *
 * Sends `body` as JSON via POST to `/crawlers/test-script` through the shared
 * `request` helper (presumably resolving with the parsed response — confirm
 * against `request`).
 */
export function testCrawlerScript(body: { scriptPath: string; proxy?: string }) {
  const payload = JSON.stringify(body);
  return request<CrawlerDryRunResult>("/crawlers/test-script", { method: "POST", body: payload });
}
export function runCrawler(id: string) {
return request<{ ok: boolean; accepted: boolean; message?: string; status?: NightlyJobStatus }>(
`/crawlers/${encodeURIComponent(id)}/run`,
@@ -276,9 +303,8 @@ export function stopCrawlerTasks(id: string) {
}
export function deleteCrawler(id: string) {
return request<{ ok: boolean; deletedVideos: number }>(`/crawlers/${encodeURIComponent(id)}`, {
return request<{ ok: boolean; deletedVideos: number; deletedScript?: boolean; warning?: string }>(`/crawlers/${encodeURIComponent(id)}`, {
method: "DELETE",
body: JSON.stringify({ deleteVideos: true }),
});
}
+1 -1
View File
@@ -163,7 +163,7 @@ export function credentialHelp(kind: Kind, isEdit: boolean): string {
case "localstorage":
return `填写服务器可访问的本地目录绝对路径,例如 /mnt/videos。系统会扫描该目录及子目录中的视频文件和 .strm 文件;.strm 可指向 HTTP/HTTPS 直链,或指向本地存储根目录内的真实视频路径。Docker 部署时请填写容器内路径。${note}`;
case "spider91":
return "91Spider 不再支持通过网盘添加或编辑。请到后台爬虫管理页面添加内置 91 或自定义爬虫脚本。";
return "91Spider 不再支持通过网盘添加或编辑。请到后台爬虫管理页面添加爬虫脚本。";
default:
return "";
}
+617 -1
View File
@@ -335,6 +335,541 @@
margin-bottom: var(--space-3);
}
/* =========================================================
* Crawler Management
* ========================================================= */
.admin-crawler-console {
display: grid;
gap: var(--space-4);
}
.admin-crawler-overview {
display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr));
gap: var(--space-3);
}
/* Overview tile: a fixed 38px icon column spanning both rows, with the label
 * stacked above the value in the flexible second column. */
.admin-crawler-metric {
display: grid;
grid-template-columns: 38px minmax(0, 1fr);
grid-template-areas:
"icon label"
"icon value";
align-items: center;
min-height: 76px;
padding: var(--space-4);
border: 1px solid var(--border-subtle);
border-radius: var(--radius-sm);
background: var(--bg-surface);
box-shadow: var(--shadow-sm);
}
.admin-crawler-metric__icon {
grid-area: icon;
width: 34px;
height: 34px;
display: grid;
place-items: center;
border-radius: var(--radius-xs);
color: var(--accent);
background: var(--accent-soft);
}
.admin-crawler-metric span:not(.admin-crawler-metric__icon) {
grid-area: label;
color: var(--text-faint);
font-size: var(--font-xs);
font-weight: var(--weight-medium);
}
.admin-crawler-metric strong {
grid-area: value;
color: var(--text-strong);
font-size: var(--font-2xl);
font-weight: var(--weight-bold);
line-height: 1.1;
font-variant-numeric: tabular-nums;
}
.admin-crawler-metric.is-ok .admin-crawler-metric__icon {
color: var(--success);
background: var(--success-soft);
}
.admin-crawler-metric.is-info .admin-crawler-metric__icon {
color: var(--info);
background: var(--info-soft);
}
.admin-crawler-metric.is-error .admin-crawler-metric__icon {
color: var(--danger);
background: var(--danger-soft);
}
.admin-crawler-list {
padding: 0;
overflow: hidden;
}
.admin-crawler-list__head {
display: flex;
align-items: center;
justify-content: space-between;
gap: var(--space-3);
padding: var(--space-4) var(--space-5);
border-bottom: 1px solid var(--border-subtle);
}
.admin-crawler-list__head .admin-card__title {
margin-bottom: 0;
}
/* Continuous rotation for loading icons. Uses the `admin-update-spin`
 * keyframes, which are not declared in this section (NOTE(review): confirm
 * they exist elsewhere in the stylesheet). transform-box/transform-origin
 * make the element rotate around the centre of its own box. */
.admin-spin {
animation: admin-update-spin 0.9s linear infinite;
transform-box: fill-box;
transform-origin: center;
will-change: transform;
}
.admin-crawler-empty {
min-height: 280px;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
gap: var(--space-3);
padding: var(--space-7) var(--space-4);
color: var(--text-faint);
text-align: center;
}
.admin-crawler-empty svg {
color: var(--accent);
}
.admin-crawler-empty strong {
color: var(--text-strong);
font-size: var(--font-lg);
}
.admin-crawler-table {
display: grid;
}
.admin-crawler-row {
display: grid;
grid-template-columns: minmax(260px, 1.25fr) minmax(260px, 1fr) minmax(150px, 0.55fr) auto;
align-items: center;
gap: var(--space-3);
padding: var(--space-3) var(--space-5);
border-bottom: 1px solid var(--border-subtle);
background: transparent;
transition: background var(--transition-fast), border-color var(--transition-fast);
}
.admin-crawler-row:last-child {
border-bottom: 0;
}
.admin-crawler-row:hover,
.admin-crawler-row.is-active {
background: rgba(255, 255, 255, 0.025);
}
.admin-crawler-row.is-active {
box-shadow: inset 3px 0 0 var(--accent);
}
.admin-crawler-row__main {
appearance: none;
width: 100%;
min-width: 0;
display: grid;
grid-template-columns: 38px minmax(0, 1fr) auto 18px;
align-items: center;
gap: var(--space-3);
padding: 0;
border: 0;
background: transparent;
color: inherit;
font: inherit;
text-align: left;
cursor: pointer;
}
.admin-crawler-row__main:focus-visible {
outline: 2px solid var(--accent);
outline-offset: 4px;
border-radius: var(--radius-sm);
}
.admin-crawler-row__brand {
width: 38px;
height: 38px;
display: grid;
place-items: center;
border-radius: var(--radius-xs);
color: var(--accent);
background: var(--accent-soft);
border: 1px solid rgba(255, 138, 60, 0.2);
}
.admin-crawler-row__title-wrap {
min-width: 0;
display: grid;
gap: 3px;
}
.admin-crawler-row__title-wrap strong {
min-width: 0;
color: var(--text-strong);
font-size: var(--font-md);
font-weight: var(--weight-semibold);
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.admin-crawler-row__title-wrap span {
color: var(--text-faint);
font-size: var(--font-xs);
}
.admin-crawler-row__chevron {
color: var(--text-faint);
transition: transform var(--transition-fast), color var(--transition-fast);
}
.admin-crawler-row__main:hover .admin-crawler-row__chevron {
color: var(--text-strong);
transform: translateX(2px);
}
.admin-crawler-row__states {
display: flex;
flex-wrap: wrap;
align-items: center;
gap: 6px;
min-width: 0;
}
.admin-crawler-state-chip {
display: inline-flex;
align-items: center;
min-height: 24px;
padding: 3px 8px;
border-radius: var(--radius-pill);
border: 1px solid transparent;
font-size: var(--font-xs);
font-weight: var(--weight-medium);
white-space: nowrap;
}
.admin-crawler-state-chip.is-idle {
color: var(--success);
background: var(--success-soft);
}
.admin-crawler-state-chip.is-generating {
color: var(--info);
background: var(--info-soft);
}
.admin-crawler-state-chip.is-cooling {
color: var(--warning);
background: var(--warning-soft);
}
.admin-crawler-state-chip.is-queued {
color: var(--text-muted);
background: rgba(255, 255, 255, 0.06);
border-color: var(--border-subtle);
}
.admin-crawler-row__meta {
display: grid;
gap: 6px;
color: var(--text-muted);
font-size: var(--font-xs);
}
.admin-crawler-row__meta span {
display: inline-flex;
align-items: center;
gap: 6px;
min-width: 0;
white-space: nowrap;
}
.admin-crawler-row__actions {
display: flex;
align-items: center;
justify-content: flex-end;
flex-wrap: wrap;
gap: var(--space-2);
}
.admin-crawler-editor {
display: grid;
grid-template-columns: minmax(0, 1fr) 320px;
gap: var(--space-5);
align-items: start;
}
.admin-crawler-editor__main {
display: grid;
gap: var(--space-4);
min-width: 0;
}
.admin-crawler-editor__side {
display: grid;
gap: var(--space-4);
position: sticky;
top: var(--space-5);
min-width: 0;
}
.admin-crawler-section,
.admin-crawler-action-panel,
.admin-crawler-side-panel {
border: 1px solid var(--border-subtle);
border-radius: var(--radius-sm);
background: var(--bg-surface);
box-shadow: var(--shadow-sm);
}
.admin-crawler-section {
padding: var(--space-5);
}
.admin-crawler-section__head {
display: flex;
align-items: center;
gap: var(--space-2);
margin-bottom: var(--space-4);
}
.admin-crawler-section__icon {
width: 28px;
height: 28px;
display: grid;
place-items: center;
border-radius: var(--radius-xs);
color: var(--accent);
background: var(--accent-soft);
flex: 0 0 auto;
}
.admin-crawler-section__title {
color: var(--text-strong);
font-size: var(--font-md);
font-weight: var(--weight-semibold);
}
.admin-crawler-section .admin-form {
max-width: 100%;
}
.admin-crawler-script-name {
display: grid;
grid-template-columns: 92px minmax(0, 1fr);
gap: var(--space-3);
align-items: center;
min-height: 42px;
padding: var(--space-3);
border: 1px solid var(--border-subtle);
border-radius: var(--radius-sm);
background: var(--bg-sunken);
}
.admin-crawler-script-name span {
color: var(--text-faint);
font-size: var(--font-xs);
font-weight: var(--weight-medium);
}
.admin-crawler-script-name strong {
min-width: 0;
color: var(--text-strong);
font-size: var(--font-sm);
font-weight: var(--weight-semibold);
overflow-wrap: anywhere;
}
.admin-crawler-params {
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: var(--space-4);
}
.admin-crawler-action-panel {
padding: var(--space-4);
}
.admin-crawler-action-panel__head {
display: flex;
align-items: center;
gap: var(--space-3);
padding-bottom: var(--space-4);
border-bottom: 1px solid var(--border-subtle);
}
.admin-crawler-action-panel__mark {
width: 42px;
height: 42px;
display: grid;
place-items: center;
border-radius: var(--radius-xs);
color: var(--text-on-accent);
background: var(--accent);
box-shadow: var(--shadow-sm);
}
.admin-crawler-action-panel__head > div {
display: grid;
gap: 3px;
min-width: 0;
}
.admin-crawler-action-panel__head strong {
color: var(--text-strong);
font-size: var(--font-md);
font-weight: var(--weight-semibold);
}
.admin-crawler-action-panel__head > div span {
color: var(--text-faint);
font-size: var(--font-xs);
}
.admin-crawler-action-panel__buttons {
display: grid;
gap: var(--space-2);
padding-top: var(--space-4);
}
.admin-crawler-action-panel__buttons .admin-btn {
width: 100%;
}
.admin-crawler-side-panel {
padding: var(--space-4);
}
.admin-crawler-side-panel .admin-crawler-section__head {
margin-bottom: var(--space-3);
}
.admin-crawler-status-grid {
display: grid;
gap: var(--space-3);
}
.admin-crawler-status-grid .admin-gen-col {
background: var(--bg-sunken);
}
/* Buttons flagged with aria-disabled="true": dimmed, not-allowed cursor, and
 * pointer events turned off so clicks do not reach them. */
.admin-btn[aria-disabled="true"] {
opacity: 0.45;
cursor: not-allowed;
pointer-events: none;
}
/* <= 1180px: crawler rows drop from four columns to two; the meta and action
 * groups each take a full-width line below, with actions left-aligned. */
@media (max-width: 1180px) {
.admin-crawler-row {
grid-template-columns: minmax(260px, 1fr) minmax(220px, 0.9fr);
}
.admin-crawler-row__meta,
.admin-crawler-row__actions {
grid-column: 1 / -1;
}
.admin-crawler-row__meta {
display: flex;
flex-wrap: wrap;
gap: var(--space-3);
}
.admin-crawler-row__actions {
justify-content: flex-start;
}
}
/* <= 1024px: the editor collapses to a single column, the side panel stops
 * being sticky, and its buttons flow into an auto-fitting grid. */
@media (max-width: 1024px) {
.admin-crawler-editor {
grid-template-columns: 1fr;
}
.admin-crawler-editor__side {
position: static;
}
.admin-crawler-action-panel__buttons {
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
}
}
/* <= 760px: metrics go two-up, the list header stacks with a full-width
 * button, rows become a single column, and the status badge moves under the
 * title while the chevron spans both lines of the main button. */
@media (max-width: 760px) {
.admin-crawler-overview {
grid-template-columns: repeat(2, minmax(0, 1fr));
}
.admin-crawler-list__head {
align-items: stretch;
flex-direction: column;
}
.admin-crawler-list__head .admin-btn {
width: 100%;
}
.admin-crawler-row {
grid-template-columns: 1fr;
padding: var(--space-4);
}
.admin-crawler-row__main {
grid-template-columns: 38px minmax(0, 1fr) auto;
}
.admin-crawler-row__main .admin-status {
justify-self: start;
grid-column: 2 / 3;
}
.admin-crawler-row__chevron {
grid-column: 3 / 4;
grid-row: 1 / 3;
}
.admin-crawler-params {
grid-template-columns: 1fr;
}
.admin-crawler-script-name {
grid-template-columns: 1fr;
gap: 4px;
}
.admin-crawler-row__actions .admin-btn {
flex: 1 1 120px;
}
}
/* <= 520px: metrics stack in one column and the editor panels share the same
 * --space-4 padding. */
@media (max-width: 520px) {
.admin-crawler-overview {
grid-template-columns: 1fr;
}
.admin-crawler-section,
.admin-crawler-action-panel,
.admin-crawler-side-panel {
padding: var(--space-4);
}
}
/* ----- Storage summary ----- */
.admin-storage-summary {
display: grid;
@@ -475,7 +1010,7 @@
.admin-crawler-import {
display: grid;
grid-template-columns: auto minmax(180px, 1fr) auto;
grid-template-columns: auto minmax(180px, 1fr) auto auto;
gap: var(--space-2);
align-items: center;
}
@@ -494,6 +1029,87 @@
line-height: var(--line-relaxed);
}
.admin-crawler-test-result {
display: grid;
gap: var(--space-3);
margin-top: var(--space-3);
padding: var(--space-3);
border: 1px solid var(--border-subtle);
border-radius: var(--radius-sm);
background: var(--bg-elevated);
}
.admin-crawler-test-result.is-ok {
border-color: var(--success);
}
.admin-crawler-test-result.is-error {
border-color: var(--danger);
}
.admin-crawler-test-result__head {
display: flex;
align-items: center;
gap: var(--space-2);
flex-wrap: wrap;
font-size: var(--font-xs);
color: var(--text-muted);
}
.admin-crawler-test-result__error {
padding: var(--space-2) var(--space-3);
border-radius: var(--radius-sm);
background: var(--danger-soft);
color: var(--danger);
font-size: var(--font-sm);
line-height: var(--line-relaxed);
word-break: break-word;
}
.admin-crawler-test-result__grid {
display: grid;
gap: var(--space-2);
}
.admin-crawler-test-result__field,
.admin-crawler-test-result__media {
display: grid;
grid-template-columns: 82px minmax(0, 1fr);
gap: var(--space-2);
align-items: baseline;
font-size: var(--font-xs);
}
.admin-crawler-test-result__field span,
.admin-crawler-test-result__media span {
color: var(--text-faint);
}
.admin-crawler-test-result__field strong,
.admin-crawler-test-result__media strong {
color: var(--text-strong);
font-weight: var(--weight-medium);
min-width: 0;
overflow-wrap: anywhere;
}
.admin-crawler-test-result__log {
font-size: var(--font-xs);
color: var(--text-muted);
}
.admin-crawler-test-result__log summary {
cursor: pointer;
}
.admin-crawler-test-result__log pre {
margin: var(--space-2) 0 0;
max-height: 180px;
overflow: auto;
white-space: pre-wrap;
color: var(--text-muted);
}
.admin-p123-qr {
display: grid;
gap: var(--space-3);
+12 -1
View File
@@ -221,18 +221,29 @@ test("crawler management is a separate admin section", () => {
assert.match(crawlerPageSource, /api\.deleteCrawler/);
assert.match(crawlerPageSource, /api\.importCrawlerScriptFile/);
assert.match(crawlerPageSource, /api\.importCrawlerScriptURL/);
assert.match(crawlerPageSource, /api\.testCrawlerScript/);
assert.match(crawlerPageSource, /type="file"/);
assert.match(crawlerPageSource, /链接导入/);
assert.match(crawlerPageSource, /测试脚本/);
assert.match(crawlerPageSource, /测试通过/);
assert.doesNotMatch(crawlerPageSource, /新建脚本/);
assert.doesNotMatch(crawlerPageSource, /爬虫 ID/);
assert.doesNotMatch(crawlerPageSource, /crawler-id/);
assert.doesNotMatch(crawlerPageSource, /crawler-name/);
assert.doesNotMatch(crawlerPageSource, /脚本路径/);
assert.doesNotMatch(crawlerPageSource, /Python 解释器/);
assert.doesNotMatch(crawlerPageSource, /自定义配置 JSON/);
assert.doesNotMatch(crawlerPageSource, /Bot/);
assert.match(crawlerPageSource, /builtin:\s*"spider91"/);
// 项目不再内置任何爬虫:不允许出现内置 91 预设
assert.doesNotMatch(crawlerPageSource, /builtin/);
assert.doesNotMatch(crawlerPageSource, /内置 91/);
assert.match(apiSource, /type AdminCrawler/);
assert.match(apiSource, /"\/crawlers"/);
assert.match(apiSource, /"\/crawlers\/import-file"/);
assert.match(apiSource, /"\/crawlers\/import-url"/);
assert.match(apiSource, /"\/crawlers\/test-script"/);
assert.match(apiSource, /type CrawlerDryRunResult/);
assert.match(apiSource, /id\?: string/);
assert.match(apiSource, /new FormData\(\)/);
assert.doesNotMatch(driveFormSource, /scriptcrawler/);
});