feat(crawler): redesign crawler scripts and admin workflow

- add generic scriptcrawler backend runner using the crawler.v1 JSONL protocol

- support crawler script upload and HTTP(S) URL import from the admin crawler page

- simplify the user-facing crawler contract to title, media_url, optional thumbnail_url and optional source_id

- convert Spider91 into a normal script crawler and reject new Spider91 storage-drive configs

- keep legacy Spider91 storage rows visible only for cleanup/deletion

- add crawler protocol docs, example script, admin UI, tests and migration coverage
This commit is contained in:
nianzhibai
2026-06-09 23:49:28 +08:00
parent 71d4a16db1
commit ec5a01b6aa
25 changed files with 3577 additions and 233 deletions
+133 -5
View File
@@ -12,6 +12,9 @@
pip install requests beautifulsoup4 lxml PySocks
使用方法:
# 作为 video-site-91 通用爬虫脚本运行(后台会自动这样调用)
python spider_91porn.py --job /path/to/job.json
# 全量爬取(默认行为,从 page=1 一直爬到末尾,写到 OUTPUT_FILE)
python spider_91porn.py
@@ -22,6 +25,7 @@
python spider_91porn.py --target-new 15 --seen-viewkeys-file /tmp/seen.txt --output /tmp/new.json
CLI 参数:
--job FILE crawler.v1 job JSON 路径;后台爬虫管理会使用此模式
--page N 只爬第 N 页,配合 --output 用于手动调试
--target-new N 从 page 1 起翻页直到凑够 N 个新视频(不在 seen 列表里的)
--seen-viewkeys-file FILE 每行一个已知 viewkey 或 mp4 源 ID,命中即跳过;与 --target-new 配合使用
@@ -37,6 +41,8 @@ CLI 参数:
- OUTPUT_FILE : 输出文件名
输出格式 (JSON):
--job 模式下 stdout 输出 crawler.v1 JSON Lines,日志全部写到 stderr。
手动运行模式仍会写传统 JSON 文件:
{
"videos": [
{
@@ -77,8 +83,8 @@ from datetime import datetime
try:
from bs4 import BeautifulSoup
except ImportError:
print("错误: 缺少依赖库 beautifulsoup4")
print("请运行: pip install beautifulsoup4 lxml")
print("错误: 缺少依赖库 beautifulsoup4", file=sys.stderr)
print("请运行: pip install beautifulsoup4 lxml", file=sys.stderr)
sys.exit(1)
@@ -148,9 +154,23 @@ OUTPUT_FILE = "91porn_videos.json"
MAX_PAGES = None # 设置为 None 爬取所有页,或设置整数如 5 只爬前5页
RESUME = True # 是否跳过输出文件中已存在的 viewkey (断点续爬)
MAX_EMPTY_PAGES = 2 # 连续空页数达到此值时停止爬取
CRAWLER_PROTOCOL = "crawler.v1"
# ===================================================
def crawler_source_id(raw: str) -> str:
    """Return a backend-safe source_id, preserving existing numeric 91 IDs.

    The value is stringified and trimmed, every run of characters outside
    ``[A-Za-z0-9_.-]`` is collapsed into a single ``_``, leading/trailing
    separators (``.``, ``_``, ``-``) are removed, and the result is capped at
    160 characters.

    Args:
        raw: Candidate identifier; falsy values (``None``, ``""``, ``0``)
            yield an empty string.

    Returns:
        The sanitized identifier, or ``""`` when nothing usable remains.
    """
    value = str(raw or "").strip()
    if not value:
        return ""
    safe = re.sub(r"[^A-Za-z0-9_.-]+", "_", value).strip("._-")
    # Cutting at 160 characters can expose a separator at the new end, so
    # strip once more to keep the "no trailing separator" guarantee.
    return safe[:160].rstrip("._-")
def write_jsonl(event: dict):
    """Serialize *event* as one JSON line on stdout and flush immediately.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    line = json.dumps(event, ensure_ascii=False)
    print(line, flush=True)
class Porn91Spider:
def __init__(
self,
@@ -163,6 +183,7 @@ class Porn91Spider:
target_new: int = None,
seen_viewkeys: list = None,
stream_output: bool = False,
stream_protocol: str = "legacy",
):
"""
构造函数。所有参数都有默认值,等同于使用脚本顶部的全局配置。
@@ -198,6 +219,7 @@ class Porn91Spider:
# (配合 backend Go 端 bufio.Scanner 实时消费,下载一个就开始下一个)。
# 开启后所有 log 都走 stderr。
self.stream_output = bool(stream_output)
self.stream_protocol = stream_protocol or "legacy"
# 添加重试适配器
try:
@@ -263,7 +285,28 @@ class Porn91Spider:
if not self.stream_output:
return
try:
print(json.dumps(video, ensure_ascii=False), flush=True)
if self.stream_protocol == "crawler.v1":
source_id = crawler_source_id(video.get("source_id") or video.get("viewkey") or "")
item = {
"title": video.get("title") or "",
"detail_url": video.get("detail_url") or "",
"author": "91porn",
"tags": ["91porn"],
"media_url": video.get("video_url") or "",
"thumbnail_url": video.get("thumb_url") or "",
"headers": {
"Referer": video.get("detail_url") or BASE_URL,
},
}
if source_id:
item["source_id"] = source_id
event = {
"type": "item",
"item": item,
}
write_jsonl(event)
else:
print(json.dumps(video, ensure_ascii=False), flush=True)
except Exception as e:
# stdout 异常基本只在管道断开时发生(消费方进程死了);
# 写到 stderr 让 backend 看到,然后让 crawl 循环自己 break。
@@ -697,8 +740,9 @@ class Porn91Spider:
except Exception as e:
self.log(f"保存文件失败: {e}")
# 尝试输出到控制台作为备份
print("\n--- 备份输出 ---")
print(json.dumps(output_data, ensure_ascii=False, indent=2))
backup_out = sys.stderr if self.stream_output else sys.stdout
print("\n--- 备份输出 ---", file=backup_out, flush=True)
print(json.dumps(output_data, ensure_ascii=False, indent=2), file=backup_out, flush=True)
def _print_summary(self):
"""
@@ -751,6 +795,84 @@ def print_help():
""")
def run_job(job_path: str):
"""Run as a crawler.v1 script plugin.
The Go host passes a job JSON file and expects stdout JSONL events. Logs go
to stderr so stdout stays machine-readable.

Job keys read here: ``protocol``, ``mode``, ``target_new``,
``seen_source_ids_file``, ``output_dir``, ``run_id`` and ``network.proxy_url``.

Raises ValueError when ``protocol`` is not CRAWLER_PROTOCOL or ``mode`` is
something other than empty / missing / ``"crawl"``. KeyboardInterrupt is
re-raised after the already-crawled results have been saved.
"""
with open(job_path, "r", encoding="utf-8") as f:
job = json.load(f)
# Reject jobs written for another protocol version or an unsupported mode.
if job.get("protocol") != CRAWLER_PROTOCOL:
raise ValueError(f"unsupported crawler protocol: {job.get('protocol')!r}")
if job.get("mode") not in ("", None, "crawl"):
raise ValueError(f"unsupported crawler mode: {job.get('mode')!r}")
# target_new falls back to 15 when missing, non-numeric or not positive.
try:
target_new = int(job.get("target_new") or 15)
except (TypeError, ValueError):
target_new = 15
if target_new <= 0:
target_new = 15
seen_file = job.get("seen_source_ids_file") or ""
# Output goes to the job's output_dir (current directory when unset); run_id
# defaults to a UTC timestamp and becomes part of the output file name.
output_dir = job.get("output_dir") or os.getcwd()
run_id = job.get("run_id") or datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"spider91-{run_id}.json")
# A non-dict "network" value is treated as absent.
network = job.get("network") if isinstance(job.get("network"), dict) else {}
proxy_url = str(network.get("proxy_url") or "").strip()
# Export the job's proxy through both upper- and lower-case proxy variables.
if proxy_url:
os.environ["HTTP_PROXY"] = proxy_url
os.environ["HTTPS_PROXY"] = proxy_url
os.environ["http_proxy"] = proxy_url
os.environ["https_proxy"] = proxy_url
# NO_PROXY / no_proxy are blanked as well, presumably so no host bypasses the proxy.
os.environ["NO_PROXY"] = ""
os.environ["no_proxy"] = ""
# Load already-known source IDs, one per non-empty line; a missing or unreadable
# file only produces a warning on stderr and the crawl proceeds.
seen_viewkeys = []
if seen_file:
try:
with open(seen_file, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
seen_viewkeys.append(line)
except FileNotFoundError:
print(f"警告: seen_source_ids_file 不存在: {seen_file}", file=sys.stderr, flush=True)
except Exception as e:
print(f"警告: 读取 seen_source_ids_file 失败: {e}", file=sys.stderr, flush=True)
prefer_ipv4_for_plain_socks5_proxy()
# Always start from page 1 without resume, streaming crawler.v1 items to stdout.
spider = Porn91Spider(
output_file=output_file,
start_page=1,
max_pages=None,
resume=False,
quiet=True,
target_new=target_new,
seen_viewkeys=seen_viewkeys,
stream_output=True,
stream_protocol="crawler.v1",
)
try:
spider.crawl()
# Final "done" event carrying the spider's processed/failed/skipped counters.
done = {
"type": "done",
"stats": {
"emitted": spider.processed_videos,
"failed": spider.failed_videos,
"skipped": spider.skipped_videos,
},
}
write_jsonl(done)
except KeyboardInterrupt:
# On Ctrl-C: log, save what has been crawled so far, then propagate the interrupt.
spider.log("\n用户中断,正在保存已爬取的数据...")
spider._save_results()
raise
def main():
if len(sys.argv) > 1 and sys.argv[1] in ('-h', '--help', 'help'):
print_help()
@@ -778,8 +900,14 @@ def main():
parser.add_argument("--stream-output", action="store_true",
help="流式模式:每解析一条视频直链就立即把它作为一行 JSON 写到 stdout 并 flush"
"日志改走 stderr。配合 backend 边读边下载使用。")
parser.add_argument("--job", type=str, default=None,
help="crawler.v1 job JSON 路径;作为通用脚本爬虫运行。")
args, _ = parser.parse_known_args()
if args.job:
run_job(args.job)
return
cli_out = sys.stderr if args.stream_output else sys.stdout
prefer_ipv4_for_plain_socks5_proxy()
+7
View File
@@ -23,6 +23,7 @@ internal/
onedrive/ OneDrive(OpenList 在线续期 + Microsoft Graph 文件接口)
googledrive/ Google Drive(OpenList 在线续期 + Google Drive API;播放走后端代理)
localstorage/ 本地目录扫描(服务器已有视频目录)
scriptcrawler/ 通用脚本爬虫输出的本地媒体适配层
scanner/ 扫目录 → 落库
preview/ ffmpeg 抽封面和生成多段预览视频
proxy/ /p/stream/*、/p/preview/* 代理
@@ -79,6 +80,12 @@ npm run preview 前端 9191,无热更新
go run ./cmd/server 后端 9192
```
## 爬虫脚本
爬虫现在是独立后台栏目 `/admin/crawlers`,不再作为“网盘/存储类型”配置。脚本负责发现视频,后端负责去重、下载、入库、封面、预览视频和视频指纹。
脚本协议见 [docs/crawler-protocol.md](../docs/crawler-protocol.md)。后台支持上传 `.py` 文件或通过 HTTP(S) 脚本链接导入,导入后的脚本会保存到数据目录旁的 `crawler-scripts/`。内置 91 爬虫也支持同一套 `crawler.v1` job 协议;后台“内置 91”会自动使用仓库里的 `91VideoSpider/spider_91porn.py`
## 添加一个盘
推荐在前端管理后台 `/admin/drives` 新增网盘。保存后会立即挂载并触发扫描;视频结果可在 `/admin/videos` 按网盘查看,每页 100 条,页面会同时显示各网盘预览视频已生成、待生成、失败数量。
+156 -95
View File
@@ -33,6 +33,7 @@ import (
"github.com/video-site/backend/internal/drives/p123"
"github.com/video-site/backend/internal/drives/pikpak"
"github.com/video-site/backend/internal/drives/quark"
"github.com/video-site/backend/internal/drives/scriptcrawler"
"github.com/video-site/backend/internal/drives/spider91"
"github.com/video-site/backend/internal/drives/wopan"
"github.com/video-site/backend/internal/fingerprint"
@@ -45,6 +46,7 @@ import (
)
const fingerprintReconcileInterval = time.Minute
const legacySpider91DriveUnsupported = "91Spider 已不再支持作为网盘配置,请在爬虫管理页面添加爬虫脚本"
func main() {
cfgPath := "./config.yaml"
@@ -76,7 +78,7 @@ func main() {
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
fingerprintWorkers: make(map[string]*fingerprint.Worker),
spider91Crawlers: make(map[string]*spider91.Crawler),
scriptCrawlers: make(map[string]*scriptcrawler.Crawler),
}
app.proxy = proxy.New(app.registry)
app.spider91Migrator = spider91migrate.New(spider91migrate.Config{
@@ -171,13 +173,23 @@ func main() {
app.detachDrive(driveID)
},
OnScanRequested: func(driveID string) bool {
// spider91 的"重扫"等同于手动触发一次爬取;其它 drive 走标准 scan
app.mu.Lock()
_, isSpider91 := app.spider91Crawlers[driveID]
app.mu.Unlock()
// 爬虫类 drive 的"重扫"等同于手动触发一次爬取;其它 drive 走标准 scan
isSpider91 := false
isScriptCrawler := false
if d, err := app.cat.GetDrive(ctx, driveID); err == nil && d != nil {
if d.Kind == spider91.Kind {
log.Printf("[spider91] drive=%s is a deprecated storage crawler, ignore scan request", driveID)
return false
}
isSpider91 = scriptCrawlerSourceKindForDrive(d) == spider91.Kind
isScriptCrawler = d.Kind == scriptcrawler.Kind
}
if isSpider91 {
return app.scheduleSpider91Crawl(ctx, driveID)
}
if isScriptCrawler {
return app.scheduleScriptCrawlerCrawl(ctx, driveID)
}
return app.scheduleScan(ctx, driveID)
},
OnStopDriveTasks: func(driveID string) bool {
@@ -227,6 +239,9 @@ func main() {
SetSpider91UploadDriveID: func(id string) error {
return app.SetSpider91UploadDriveID(ctx, id)
},
DefaultSpider91ScriptPath: func() string {
return app.defaultSpider91ScriptPath()
},
OnRunNightlyJob: func() bool {
if app.nightlyRunner != nil {
return app.nightlyRunner.TriggerNow()
@@ -304,8 +319,9 @@ type App struct {
thumbWorkers map[string]*preview.ThumbWorker
fingerprintWorkers map[string]*fingerprint.Worker
cancels map[string]context.CancelFunc
// spider91Crawlers 按 driveID 索引,每个 spider91 drive 独立一个 Crawler
spider91Crawlers map[string]*spider91.Crawler
// scriptCrawlers 按 driveID 索引,每个脚本爬虫 drive 独立一个 Crawler
// 内置 Spider91 也走这里,只是 SourceKind=spider91,以兼容历史 video id。
scriptCrawlers map[string]*scriptcrawler.Crawler
// driveAttachMu 串行化云盘挂载/重挂载。挂载会访问上游服务,可能较慢;
// 串行化可以避免启动后台挂载和手动扫盘按需挂载同一个 drive 时重复创建 worker。
@@ -737,11 +753,16 @@ func (a *App) attachDriveUnlocked(ctx context.Context, d *catalog.Drive) error {
ID: d.ID,
RootPath: d.Credentials["path"],
})
case spider91.Kind:
drv = spider91.New(spider91.Config{
case scriptcrawler.Kind:
drv = scriptcrawler.New(scriptcrawler.Config{
ID: d.ID,
RootDir: a.spider91DriveDir(d.ID),
RootDir: a.scriptCrawlerDriveDirForDrive(d),
})
case spider91.Kind:
d.Status = "error"
d.LastError = legacySpider91DriveUnsupported
_ = a.cat.UpsertDrive(ctx, d)
return errors.New(legacySpider91DriveUnsupported)
default:
return fmt.Errorf("unknown drive kind: %s", d.Kind)
}
@@ -761,9 +782,8 @@ func (a *App) attachDriveUnlocked(ctx context.Context, d *catalog.Drive) error {
a.startDriveGenerationWorkers(ctx, d.ID, drv, true)
// spider91 driver 还需要一个 crawler,挂在专用 map 里供 crawlerLoop 调用
if sd, ok := drv.(*spider91.Driver); ok {
a.attachSpider91Crawler(d, sd)
if sd, ok := drv.(*scriptcrawler.Driver); ok {
a.attachScriptCrawler(d, sd)
}
return nil
@@ -836,6 +856,26 @@ func (a *App) spider91DriveDir(driveID string) string {
return filepath.Join(a.spider91RootDir(), driveID)
}
// scriptCrawlerRootDir returns the root directory shared by all generic
// script-crawler drives: a "scriptcrawlers" folder placed next to the
// configured local preview directory.
func (a *App) scriptCrawlerRootDir() string {
	parent := filepath.Dir(a.cfg.Storage.LocalPreviewDir)
	return filepath.Join(parent, "scriptcrawlers")
}
// scriptCrawlerDriveDir returns the storage directory of a single
// scriptcrawler drive, i.e. <root>/<driveID>.
func (a *App) scriptCrawlerDriveDir(driveID string) string {
	root := a.scriptCrawlerRootDir()
	return filepath.Join(root, driveID)
}
// scriptCrawlerDriveDirForDrive picks the storage directory for d.
// Drives whose source kind resolves to spider91 use the spider91 directory;
// every other drive uses the generic script-crawler directory. A nil drive
// resolves to the script-crawler directory for an empty drive ID.
func (a *App) scriptCrawlerDriveDirForDrive(d *catalog.Drive) string {
	if d == nil {
		return a.scriptCrawlerDriveDir("")
	}
	if scriptCrawlerSourceKindForDrive(d) == spider91.Kind {
		return a.spider91DriveDir(d.ID)
	}
	return a.scriptCrawlerDriveDir(d.ID)
}
// commonThumbsDir 是所有 drive 共享的封面目录,/p/thumb/{videoID} 路由命中这里。
func (a *App) commonThumbsDir() string {
return filepath.Join(a.cfg.Storage.LocalPreviewDir, "thumbs")
@@ -865,77 +905,72 @@ func (a *App) defaultSpider91ScriptPath() string {
return ""
}
// attachSpider91Crawler 创建该 drive 对应的 Crawler 并注册到 a.spider91Crawlers。
func (a *App) attachSpider91Crawler(d *catalog.Drive, drv *spider91.Driver) {
// attachScriptCrawler 创建通用脚本爬虫 runner并注册到 a.scriptCrawlers。
func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) {
pythonPath := strings.TrimSpace(d.Credentials["python_path"])
if pythonPath == "" {
pythonPath = "python3"
}
scriptPath := strings.TrimSpace(d.Credentials["script_path"])
if scriptPath == "" {
sourceKind := scriptCrawlerSourceKindForDrive(d)
if scriptPath == "" && sourceKind == spider91.Kind {
scriptPath = a.defaultSpider91ScriptPath()
}
// 91porn CDN 在海外;空缺时回退到 HTTPS_PROXY / HTTP_PROXY 环境变量。
proxyURL := strings.TrimSpace(d.Credentials["proxy"])
configJSON := strings.TrimSpace(d.Credentials["config_json"])
workDir := ""
if scriptPath != "" {
workDir = filepath.Dir(scriptPath)
}
driveID := d.ID
var progressMu sync.Mutex
checkedVideos := 0
expectedNewVideos := 0
updateProgress := func(scanned, added int) {
a.updateDriveScanProgress(driveID, scanned, added)
}
c := spider91.NewCrawler(spider91.CrawlerConfig{
c := scriptcrawler.NewCrawler(scriptcrawler.CrawlerConfig{
Driver: drv,
Catalog: a.cat,
SourceKind: sourceKind,
PythonPath: pythonPath,
ScriptPath: scriptPath,
WorkDir: filepath.Dir(scriptPath),
WorkDir: workDir,
CommonThumbDir: a.commonThumbsDir(),
ProxyURL: proxyURL,
OnProgress: func(progress spider91.CrawlProgress) {
progressMu.Lock()
if progress.TotalEntries == 0 && progress.NewVideos == 0 && progress.Skipped == 0 && progress.Failed == 0 {
checkedVideos = 0
expectedNewVideos = 0
} else if progress.TotalEntries > expectedNewVideos {
expectedNewVideos = progress.TotalEntries
ConfigJSON: configJSON,
OnProgress: func(progress scriptcrawler.CrawlProgress) {
scanned := progress.Checked
if scanned < progress.TotalEntries {
scanned = progress.TotalEntries
}
scanned := checkedVideos
added := expectedNewVideos
progressMu.Unlock()
updateProgress(scanned, added)
added := progress.Emitted
if added < progress.NewVideos {
added = progress.NewVideos
}
a.updateDriveScanProgress(driveID, scanned, added)
},
OnCheckedVideo: func() {
progressMu.Lock()
checkedVideos++
scanned := checkedVideos
added := expectedNewVideos
progressMu.Unlock()
updateProgress(scanned, added)
},
OnExtractedVideo: func() {
progressMu.Lock()
expectedNewVideos++
scanned := checkedVideos
added := expectedNewVideos
progressMu.Unlock()
updateProgress(scanned, added)
},
// 新流程:预览视频不在每条视频入库时立即入队,而是 RunOnce 全部下完后由
// runSpider91Crawl 统一调 enqueueDriveGeneration 一次性入队。这样:
// - 下载阶段不和 ffmpeg 抢 CPU/IO
// - "等待预览视频队列 idle" 在 nightly Phase 2 的语义上更直观
// 不再传 OnNewVideocrawler 内部的回调字段保留,仅为单测计数器之用)。
})
a.mu.Lock()
a.spider91Crawlers[driveID] = c
a.scriptCrawlers[driveID] = c
a.mu.Unlock()
// 确保 "91porn" 系统标签存在,并按 spider91 来源前缀给历史视频补打。
// 不能只靠文本匹配:老版本入库的视频可能没有 author/tags 字段,但 id 前缀
// "spider91-<driveID>-" 会一直保留,即使后续迁移到 PikPak/115 也不变。
if sourceKind == spider91.Kind {
a.ensureSpider91SourceTag(driveID)
}
}
// scriptCrawlerSourceKindForDrive reports the source kind of a drive.
// It returns spider91.Kind only for a scriptcrawler drive whose "builtin"
// credential equals spider91.Kind (case-insensitive, surrounding whitespace
// ignored); in every other case, including a nil drive, it returns
// scriptcrawler.Kind.
func scriptCrawlerSourceKindForDrive(d *catalog.Drive) string {
	if d == nil || d.Kind != scriptcrawler.Kind {
		return scriptcrawler.Kind
	}
	builtin := strings.TrimSpace(d.Credentials["builtin"])
	if strings.EqualFold(builtin, spider91.Kind) {
		return spider91.Kind
	}
	return scriptcrawler.Kind
}
// isSpider91SourceDrive reports whether d is a spider91 source: either a
// drive whose Kind is spider91 (case-insensitive) or a scriptcrawler drive
// whose source kind resolves to spider91. A nil drive is never a match.
func isSpider91SourceDrive(d *catalog.Drive) bool {
	if d == nil {
		return false
	}
	if strings.EqualFold(d.Kind, spider91.Kind) {
		return true
	}
	return scriptCrawlerSourceKindForDrive(d) == spider91.Kind
}
func (a *App) ensureSpider91SourceTag(driveID string) {
bgCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
go func() {
defer cancel()
@@ -1455,7 +1490,7 @@ func (a *App) detachDrive(id string) {
delete(a.workers, id)
delete(a.thumbWorkers, id)
delete(a.fingerprintWorkers, id)
delete(a.spider91Crawlers, id)
delete(a.scriptCrawlers, id)
a.mu.Unlock()
}
@@ -1611,7 +1646,7 @@ func (a *App) runScanWithTaskContext(ctx context.Context, driveID string) {
// spider91 / localupload 走自己的生命周期管理,不应该参与扫描清理;
// stats.Errors > 0 时(云盘 API 中途抖动)保守起见跳过这一轮,避免把
// "暂时列不出来"误认成"被用户删了"。
if drv.Kind() != spider91.Kind && drv.ID() != localupload.DriveID {
if drv.Kind() != spider91.Kind && drv.Kind() != scriptcrawler.Kind && drv.ID() != localupload.DriveID {
if stats.Errors > 0 {
log.Printf("[cleanup] skip stale cleanup for drive=%s kind=%s: scan had %d directory errors", driveID, drv.Kind(), stats.Errors)
} else {
@@ -1736,7 +1771,7 @@ func (a *App) spider91OriginFromVideo(ctx context.Context, v *catalog.Video) (st
if a == nil || v == nil {
return "", ""
}
if d, err := a.cat.GetDrive(ctx, v.DriveID); err == nil && d != nil && d.Kind == spider91.Kind {
if d, err := a.cat.GetDrive(ctx, v.DriveID); err == nil && d != nil && isSpider91SourceDrive(d) {
prefix := "spider91-" + d.ID + "-"
if strings.HasPrefix(v.ID, prefix) {
return d.ID, strings.TrimPrefix(v.ID, prefix)
@@ -1749,7 +1784,7 @@ func (a *App) spider91OriginFromVideo(ctx context.Context, v *catalog.Video) (st
bestDriveID := ""
bestSourceID := ""
for _, d := range drives {
if d == nil || d.Kind != spider91.Kind {
if d == nil || !isSpider91SourceDrive(d) {
continue
}
prefix := "spider91-" + d.ID + "-"
@@ -1839,7 +1874,7 @@ func (a *App) cleanupDriveVideosForDelete(ctx context.Context, driveID string) (
}
}
if strings.EqualFold(d.Kind, spider91.Kind) {
if isSpider91SourceDrive(d) {
if err := a.removeSpider91DriveDir(driveID); err != nil {
return 0, err
}
@@ -1924,7 +1959,7 @@ func (a *App) videosForDriveDelete(ctx context.Context, d *catalog.Drive) ([]*ca
byID[v.ID] = v
}
if strings.EqualFold(d.Kind, spider91.Kind) {
if isSpider91SourceDrive(d) {
prefix := "spider91-" + d.ID + "-"
originItems, err := a.cat.ListVideosByIDPrefix(ctx, prefix)
if err != nil {
@@ -2380,7 +2415,7 @@ func (a *App) regenFailedFingerprints(ctx context.Context, driveID string) {
}
// listScanTargetIDs 返回 nightly Phase 1 应扫描的所有 drive ID
// (非 spider91、非 localupload)。它直接读 catalog,而不是 registry,这样
// (非爬虫、非 localupload)。它直接读 catalog,而不是 registry,这样
// 进程刚启动、云盘还在后台挂载时,nightly 也不会漏掉配置过的 drive。
func (a *App) listScanTargetIDs(ctx context.Context) []string {
all, err := a.cat.ListDrives(ctx)
@@ -2390,7 +2425,7 @@ func (a *App) listScanTargetIDs(ctx context.Context) []string {
}
out := make([]string, 0, len(all))
for _, d := range all {
if d == nil || d.ID == localupload.DriveID || d.Kind == spider91.Kind {
if d == nil || d.ID == localupload.DriveID || d.Kind == spider91.Kind || d.Kind == scriptcrawler.Kind {
continue
}
out = append(out, d.ID)
@@ -2398,7 +2433,7 @@ func (a *App) listScanTargetIDs(ctx context.Context) []string {
return out
}
// listSpider91DriveIDs 返回 nightly Phase 2 应触发爬取的 spider91 drive ID 列表。
// listSpider91DriveIDs 返回 nightly Phase 2 应触发爬取的爬虫 drive ID 列表。
func (a *App) listSpider91DriveIDs(ctx context.Context) []string {
all, err := a.cat.ListDrives(ctx)
if err != nil {
@@ -2407,7 +2442,7 @@ func (a *App) listSpider91DriveIDs(ctx context.Context) []string {
}
out := make([]string, 0, len(all))
for _, d := range all {
if d != nil && d.Kind == spider91.Kind {
if d != nil && d.Kind == scriptcrawler.Kind {
out = append(out, d.ID)
}
}
@@ -2449,8 +2484,8 @@ func shouldScanDrive(d drives.Drive) bool {
if d == nil || d.ID() == localupload.DriveID {
return false
}
// spider91 由专用 crawlerLoop 触发,不参与 scanLoop
if d.Kind() == spider91.Kind {
// 爬虫类 drive 由专用 crawl 阶段触发,不参与普通 scan
if d.Kind() == spider91.Kind || d.Kind() == scriptcrawler.Kind {
return false
}
return true
@@ -2481,65 +2516,96 @@ func (a *App) scheduleSpider91Crawl(ctx context.Context, driveID string) bool {
return true
}
// scheduleScriptCrawlerCrawl starts an asynchronous crawl for driveID.
// It returns false without starting anything when the drive already has
// active work or is already queued/running; otherwise it launches the crawl
// in a goroutine and returns true.
func (a *App) scheduleScriptCrawlerCrawl(ctx context.Context, driveID string) bool {
	switch {
	case a.driveHasActiveWork(driveID):
		log.Printf("[scriptcrawler] drive=%s has active work, skip duplicate crawl request", driveID)
		return false
	case !a.beginDriveScanOrCrawl(driveID):
		log.Printf("[scriptcrawler] drive=%s already queued or running, skip duplicate crawl request", driveID)
		return false
	}

	taskCtx, release := a.registerDriveTaskContext(ctx, driveID)
	// Cleanup order matters: clear the scan/crawl marker first, then release
	// the task context.
	cleanup := func() {
		a.endDriveScanOrCrawl(driveID)
		release()
	}
	go func() {
		defer cleanup()
		a.runScriptCrawlerCrawlWithTaskContext(taskCtx, driveID)
	}()
	return true
}
// runSpider91Crawl 运行一次完整爬取流程并把 last_crawl_at 写回 drive.credentials。
//
// 即使爬取失败也会更新 last_crawl_at,避免一直在错误循环里反复触发;下一次 nightly
// 流水线重跑时仍会重试。该方法是阻塞的,被 nightly Phase 2 串行调用,以及被
// admin "立即抓取" 单 drive 异步调用。
func (a *App) runSpider91Crawl(ctx context.Context, driveID string) {
a.runScriptCrawlerCrawl(ctx, driveID)
}
func (a *App) runScriptCrawlerCrawl(ctx context.Context, driveID string) {
if !a.beginDriveScanOrCrawl(driveID) {
log.Printf("[spider91] drive=%s already queued or running, skip direct crawl", driveID)
log.Printf("[scriptcrawler] drive=%s already queued or running, skip direct crawl", driveID)
return
}
defer a.endDriveScanOrCrawl(driveID)
taskCtx, done := a.registerDriveTaskContext(ctx, driveID)
defer done()
a.runSpider91CrawlWithTaskContext(taskCtx, driveID)
a.runScriptCrawlerCrawlWithTaskContext(taskCtx, driveID)
}
func (a *App) runSpider91CrawlWithTaskContext(ctx context.Context, driveID string) bool {
return a.runScriptCrawlerCrawlWithTaskContext(ctx, driveID)
}
func (a *App) runScriptCrawlerCrawlWithTaskContext(ctx context.Context, driveID string) bool {
if err := ctx.Err(); err != nil {
log.Printf("[spider91] drive=%s crawl canceled before start: %v", driveID, err)
log.Printf("[scriptcrawler] drive=%s crawl canceled before start: %v", driveID, err)
return false
}
a.mu.Lock()
c := a.spider91Crawlers[driveID]
c := a.scriptCrawlers[driveID]
a.mu.Unlock()
if c == nil {
if err := a.ensureDriveAttached(ctx, driveID); err != nil {
log.Printf("[spider91] drive=%s attach failed: %v", driveID, err)
log.Printf("[scriptcrawler] drive=%s attach failed: %v", driveID, err)
return false
}
a.mu.Lock()
c = a.spider91Crawlers[driveID]
c = a.scriptCrawlers[driveID]
a.mu.Unlock()
if c == nil {
log.Printf("[spider91] drive=%s crawler not attached", driveID)
log.Printf("[scriptcrawler] drive=%s crawler not attached", driveID)
return false
}
}
d, err := a.cat.GetDrive(ctx, driveID)
if err != nil || d == nil {
log.Printf("[spider91] drive=%s lookup failed: %v", driveID, err)
log.Printf("[scriptcrawler] drive=%s lookup failed: %v", driveID, err)
return false
}
targetNew := spider91IntCred(d, "target_new", spider91.DefaultTargetNew)
defaultTargetNew := scriptcrawler.DefaultTargetNew
if scriptCrawlerSourceKindForDrive(d) == spider91.Kind {
defaultTargetNew = spider91.DefaultTargetNew
}
targetNew := spider91IntCred(d, "target_new", defaultTargetNew)
if targetNew <= 0 {
targetNew = spider91.DefaultTargetNew
targetNew = defaultTargetNew
}
log.Printf("[spider91] drive=%s start crawl target_new=%d", driveID, targetNew)
log.Printf("[scriptcrawler] drive=%s start crawl target_new=%d", driveID, targetNew)
res, runErr := c.RunOnce(ctx, targetNew)
if runErr != nil {
log.Printf("[spider91] drive=%s crawl failed: %v", driveID, runErr)
log.Printf("[scriptcrawler] drive=%s crawl failed: %v", driveID, runErr)
} else if res != nil {
log.Printf("[spider91] drive=%s crawl done target=%d total=%d new=%d skipped=%d failed=%d seen_snapshot=%d",
log.Printf("[scriptcrawler] drive=%s crawl done target=%d total=%d new=%d skipped=%d failed=%d seen_snapshot=%d",
driveID, res.TargetNew, res.TotalEntries, res.NewVideos, res.Skipped, res.Failed, res.SeenSnapshot)
}
// 标记最后一次爬取时间。这字段已不再用于调度判定(nightly 流水线统一调度),
// 留着仅作为 admin UI 显示"上次抓取 N 小时前"用。
if d.Credentials == nil {
d.Credentials = make(map[string]string)
}
@@ -2552,18 +2618,13 @@ func (a *App) runSpider91CrawlWithTaskContext(ctx context.Context, driveID strin
d.LastError = ""
}
if err := a.cat.UpsertDrive(ctx, d); err != nil {
log.Printf("[spider91] drive=%s update last_crawl_at: %v", driveID, err)
log.Printf("[scriptcrawler] drive=%s update last_crawl_at: %v", driveID, err)
}
if err := ctx.Err(); err != nil {
log.Printf("[spider91] drive=%s crawl canceled after run: %v", driveID, err)
log.Printf("[scriptcrawler] drive=%s crawl canceled after run: %v", driveID, err)
return false
}
// 爬取全部完成后,统一把所有还 pending 的预览视频入队。
// 这是新流水线设计:crawler 自身不再每条入库就立即触发预览视频生成,
// 让"下载阶段"和"预览视频阶段"在时间上分清楚(也跟 nightly Phase 2
// 的"等预览视频队列 idle"语义对齐)。enqueueDriveGeneration 内部会读
// 该 drive 当前的 teaser_enabled,关闭时是 noop。
a.mu.Lock()
worker := a.workers[driveID]
thumbWorker := a.thumbWorkers[driveID]
+45 -5
View File
@@ -6,6 +6,7 @@ import (
"io"
"os"
"path/filepath"
"strings"
"sync"
"testing"
"time"
@@ -13,6 +14,7 @@ import (
"github.com/video-site/backend/internal/catalog"
"github.com/video-site/backend/internal/config"
"github.com/video-site/backend/internal/drives"
"github.com/video-site/backend/internal/drives/scriptcrawler"
"github.com/video-site/backend/internal/drives/spider91"
"github.com/video-site/backend/internal/fingerprint"
"github.com/video-site/backend/internal/preview"
@@ -606,7 +608,8 @@ func TestNightlyTargetsComeFromCatalogBeforeDriveAttach(t *testing.T) {
for _, d := range []*catalog.Drive{
{ID: "115", Kind: "p115", Name: "115", RootID: "0", TeaserEnabled: true},
{ID: "pikpak", Kind: "pikpak", Name: "PikPak", RootID: "0", TeaserEnabled: true},
{ID: "91-spider", Kind: "spider91", Name: "91 Spider", RootID: "0", TeaserEnabled: true},
{ID: "91-legacy", Kind: "spider91", Name: "91 Legacy", RootID: "0", TeaserEnabled: true},
{ID: "91-crawler", Kind: scriptcrawler.Kind, Name: "91 Spider", RootID: "/", TeaserEnabled: true},
} {
if err := cat.UpsertDrive(ctx, d); err != nil {
t.Fatalf("seed drive %s: %v", d.ID, err)
@@ -619,8 +622,47 @@ func TestNightlyTargetsComeFromCatalogBeforeDriveAttach(t *testing.T) {
t.Fatalf("scan target ids = %#v, want 115 and pikpak from catalog", scanIDs)
}
spiderIDs := app.listSpider91DriveIDs(ctx)
if len(spiderIDs) != 1 || spiderIDs[0] != "91-spider" {
t.Fatalf("spider91 ids = %#v, want catalog spider drive", spiderIDs)
if len(spiderIDs) != 1 || spiderIDs[0] != "91-crawler" {
t.Fatalf("spider91 ids = %#v, want crawler-page script drive", spiderIDs)
}
}
// TestAttachDriveRejectsLegacySpider91Storage verifies that attaching a drive
// of the legacy spider91 kind fails with guidance pointing to crawler
// management, leaves the drive unregistered, and persists an "error" status
// with the same guidance in the catalog.
func TestAttachDriveRejectsLegacySpider91Storage(t *testing.T) {
	const wantHint = "爬虫管理"
	ctx := context.Background()

	cat, err := catalog.Open(t.TempDir() + "/catalog.db")
	if err != nil {
		t.Fatalf("open catalog: %v", err)
	}
	t.Cleanup(func() {
		if closeErr := cat.Close(); closeErr != nil {
			t.Fatalf("close catalog: %v", closeErr)
		}
	})

	legacy := &catalog.Drive{
		ID:            "91-legacy",
		Kind:          spider91.Kind,
		Name:          "91 Legacy",
		RootID:        "/",
		TeaserEnabled: true,
	}
	if seedErr := cat.UpsertDrive(ctx, legacy); seedErr != nil {
		t.Fatalf("seed drive: %v", seedErr)
	}

	app := &App{cat: cat, registry: proxy.NewRegistry()}
	attachErr := app.attachDrive(ctx, legacy)
	if attachErr == nil || !strings.Contains(attachErr.Error(), wantHint) {
		t.Fatalf("attach err = %v, want crawler management guidance", attachErr)
	}
	if _, registered := app.registry.Get(legacy.ID); registered {
		t.Fatal("legacy spider91 drive should not be registered")
	}

	stored, err := cat.GetDrive(ctx, legacy.ID)
	if err != nil {
		t.Fatalf("get drive: %v", err)
	}
	if stored.Status != "error" || !strings.Contains(stored.LastError, wantHint) {
		t.Fatalf("status/error = %q/%q, want deprecated error", stored.Status, stored.LastError)
	}
}
@@ -1033,7 +1075,6 @@ func TestCleanupDriveVideosForDeleteRemovesRowsAndGeneratedAssetsOnly(t *testing
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
fingerprintWorkers: make(map[string]*fingerprint.Worker),
spider91Crawlers: make(map[string]*spider91.Crawler),
}
removed, err := app.cleanupDriveVideosForDelete(ctx, "local-main")
if err != nil {
@@ -1313,7 +1354,6 @@ func TestCleanupDriveVideosForDeleteSpider91RemovesCrawledDirAndOriginRecords(t
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
fingerprintWorkers: make(map[string]*fingerprint.Worker),
spider91Crawlers: make(map[string]*spider91.Crawler),
}
removed, err := app.cleanupDriveVideosForDelete(ctx, driveID)
if err != nil {
+527 -3
View File
@@ -10,6 +10,8 @@ import (
"net/http"
"net/url"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"time"
@@ -19,6 +21,7 @@ import (
"github.com/video-site/backend/internal/auth"
"github.com/video-site/backend/internal/catalog"
"github.com/video-site/backend/internal/drives/p123"
"github.com/video-site/backend/internal/drives/scriptcrawler"
)
type AdminServer struct {
@@ -65,6 +68,9 @@ type AdminServer struct {
// Spider91 → 115/123/PikPak/OneDrive 上传目标 drive ID 读写
GetSpider91UploadDriveID func() string
SetSpider91UploadDriveID func(driveID string) error
// DefaultSpider91ScriptPath returns the built-in Spider91 crawler script
// path for the independent crawler management UI.
DefaultSpider91ScriptPath func() string
// OnRunNightlyJob 触发一次完整的凌晨流水线(Phase1 扫盘 + Phase2 91 爬虫 +
// Phase3 迁移)。立即返回 —— 实际任务在后台跑,admin 在日志或下次状态查询里
// 看进度。若流水线正在跑或已排队,Runner 会拒绝重复触发。
@@ -116,6 +122,8 @@ type NightlyJobStatus struct {
LastFinishedAt string `json:"lastFinishedAt,omitempty"`
}
const maxCrawlerScriptBytes = 2 * 1024 * 1024
type DeleteVideoResult struct {
OK bool `json:"ok"`
DeletedSource bool `json:"deletedSource"`
@@ -150,6 +158,15 @@ func (a *AdminServer) Register(r chi.Router) {
r.Post("/drives/{id}/thumbnails/failed/regenerate", a.handleRegenFailedThumbnails)
r.Post("/drives/{id}/fingerprints/failed/regenerate", a.handleRegenFailedFingerprints)
// 爬虫
r.Get("/crawlers", a.handleListCrawlers)
r.Post("/crawlers", a.handleUpsertCrawler)
r.Post("/crawlers/import-file", a.handleImportCrawlerScriptFile)
r.Post("/crawlers/import-url", a.handleImportCrawlerScriptURL)
r.Delete("/crawlers/{id}", a.handleDeleteCrawler)
r.Post("/crawlers/{id}/run", a.handleRunCrawler)
r.Post("/crawlers/{id}/tasks/stop", a.handleStopCrawlerTasks)
// 视频
r.Get("/videos", a.handleAdminListVideos)
r.Put("/videos/{id}", a.handleUpdateVideo)
@@ -424,6 +441,11 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) {
// LastCrawlAt 是 spider91 上次成功爬取的 unix 秒(来自 credentials.last_crawl_at)。
// 其它 kind 留 0;前端用它显示"上次抓取: N 小时前"。
Spider91Proxy string `json:"spider91Proxy,omitempty"`
ScriptCrawlerPythonPath string `json:"scriptCrawlerPythonPath,omitempty"`
ScriptCrawlerScriptPath string `json:"scriptCrawlerScriptPath,omitempty"`
ScriptCrawlerProxy string `json:"scriptCrawlerProxy,omitempty"`
ScriptCrawlerTargetNew string `json:"scriptCrawlerTargetNew,omitempty"`
ScriptCrawlerConfigJSON string `json:"scriptCrawlerConfigJson,omitempty"`
LastCrawlAt int64 `json:"lastCrawlAt,omitempty"`
GoogleDriveUseOnlineAPI *bool `json:"googleDriveUseOnlineAPI,omitempty"`
ScanGenerationStatus GenerationStatus `json:"scanGenerationStatus"`
@@ -443,6 +465,9 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) {
}
list := make([]out, 0, len(drives))
for _, d := range drives {
if isCrawlerDriveKind(d.Kind) {
continue
}
counts := teaserCounts[d.ID]
thumbCounts := thumbnailCounts[d.ID]
fingerprintCount := fingerprintCounts[d.ID]
@@ -488,6 +513,11 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) {
TeaserEnabled: d.TeaserEnabled,
SkipDirIDs: append([]string{}, d.SkipDirIDs...),
Spider91Proxy: spider91ProxyForDrive(d),
ScriptCrawlerPythonPath: scriptCrawlerCred(d, "python_path"),
ScriptCrawlerScriptPath: scriptCrawlerCred(d, "script_path"),
ScriptCrawlerProxy: scriptCrawlerCred(d, "proxy"),
ScriptCrawlerTargetNew: scriptCrawlerCred(d, "target_new"),
ScriptCrawlerConfigJSON: scriptCrawlerCred(d, "config_json"),
LastCrawlAt: lastCrawlAt,
GoogleDriveUseOnlineAPI: googleDriveUseOnlineAPIForDrive(d),
ScanGenerationStatus: generation.Scan,
@@ -543,7 +573,10 @@ func (a *AdminServer) handleUpsertDrive(w http.ResponseWriter, r *http.Request)
existing = existingDrive
}
if body.Kind == "spider91" {
credentials, err := mergeSpider91Credentials(existing, body.Credentials)
http.Error(w, "91Spider 已不再支持通过网盘添加,请在爬虫管理页面添加爬虫脚本", http.StatusBadRequest)
return
} else if body.Kind == scriptcrawler.Kind {
credentials, err := mergeScriptCrawlerCredentials(existing, body.Credentials)
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
@@ -600,6 +633,421 @@ func (a *AdminServer) handleUpsertDrive(w http.ResponseWriter, r *http.Request)
writeJSON(w, http.StatusOK, map[string]any{"ok": true})
}
// crawlerDTO is the JSON shape of a single crawler entry. handleListCrawlers
// responds with a slice of these, each one built by crawlerDTOForDrive.
type crawlerDTO struct {
// Identity and state copied from the underlying catalog drive.
ID string `json:"id"`
Name string `json:"name"`
Kind string `json:"kind"`
// Builtin mirrors the "builtin" credential (see crawlerBuiltinForDrive).
Builtin string `json:"builtin,omitempty"`
Status string `json:"status"`
LastError string `json:"lastError,omitempty"`
// Script settings read (whitespace-trimmed) from the drive credentials.
ScriptPath string `json:"scriptPath"`
PythonPath string `json:"pythonPath,omitempty"`
Proxy string `json:"proxy,omitempty"`
TargetNew string `json:"targetNew,omitempty"`
ConfigJSON string `json:"configJson,omitempty"`
// LastCrawlAt is the parsed "last_crawl_at" credential; 0 when absent or unparsable.
LastCrawlAt int64 `json:"lastCrawlAt,omitempty"`
// Per-task generation statuses; crawlerDTOForDrive defaults an empty State to "idle".
ScanGenerationStatus GenerationStatus `json:"scanGenerationStatus"`
ThumbnailGenerationStatus GenerationStatus `json:"thumbnailGenerationStatus"`
PreviewGenerationStatus GenerationStatus `json:"previewGenerationStatus"`
FingerprintGenerationStatus GenerationStatus `json:"fingerprintGenerationStatus"`
// Ready/pending/failed counters taken from the catalog's per-drive counts.
ThumbnailReadyCount int `json:"thumbnailReadyCount"`
ThumbnailPendingCount int `json:"thumbnailPendingCount"`
ThumbnailFailedCount int `json:"thumbnailFailedCount"`
TeaserReadyCount int `json:"teaserReadyCount"`
TeaserPendingCount int `json:"teaserPendingCount"`
TeaserFailedCount int `json:"teaserFailedCount"`
FingerprintReadyCount int `json:"fingerprintReadyCount"`
FingerprintPendingCount int `json:"fingerprintPendingCount"`
FingerprintFailedCount int `json:"fingerprintFailedCount"`
}
// upsertCrawlerReq is the JSON request body decoded by handleUpsertCrawler.
type upsertCrawlerReq struct {
// ID and Name are required; the handler rejects the request when either is blank.
ID string `json:"id"`
Name string `json:"name"`
// Builtin, when non-empty, is stored as the "builtin" credential.
Builtin string `json:"builtin"`
// The remaining fields are stored as the script_path, python_path, proxy,
// target_new and config_json credentials after validation.
ScriptPath string `json:"scriptPath"`
PythonPath string `json:"pythonPath"`
Proxy string `json:"proxy"`
TargetNew string `json:"targetNew"`
ConfigJSON string `json:"configJson"`
}
// handleListCrawlers writes the JSON array of all crawler drives (those whose
// kind satisfies isCrawlerDriveKind), enriched with per-drive teaser,
// thumbnail and fingerprint counters and the current generation statuses.
// Any catalog failure is reported as HTTP 500.
func (a *AdminServer) handleListCrawlers(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	drives, err := a.Catalog.ListDrives(ctx)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, err)
		return
	}
	teasers, err := a.Catalog.CountTeasersByDrive(ctx)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, err)
		return
	}
	thumbs, err := a.Catalog.CountThumbnailsByDrive(ctx)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, err)
		return
	}
	fingerprints, err := a.Catalog.CountFingerprintsByDrive(ctx)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, err)
		return
	}

	// Without a status provider every drive gets the zero status value.
	statuses := map[string]DriveGenerationStatuses{}
	if a.GetDriveGenerationStatuses != nil {
		statuses = a.GetDriveGenerationStatuses()
	}

	// Start from an empty (non-nil) slice so an empty result encodes as [].
	result := make([]crawlerDTO, 0)
	for _, drive := range drives {
		if drive != nil && isCrawlerDriveKind(drive.Kind) {
			dto := a.crawlerDTOForDrive(drive, teasers[drive.ID], thumbs[drive.ID], fingerprints[drive.ID], statuses[drive.ID])
			result = append(result, dto)
		}
	}
	writeJSON(w, http.StatusOK, result)
}
// crawlerDTOForDrive converts a catalog drive plus its counters and generation
// statuses into the crawlerDTO sent to the admin UI. Empty generation states
// are reported as "idle", credentials are whitespace-trimmed, and
// last_crawl_at is exposed as an integer (0 when missing or not a number).
func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, teaser catalog.DriveTeaserCounts, thumb catalog.DriveThumbnailCounts, fp catalog.DriveFingerprintCounts, generation DriveGenerationStatuses) crawlerDTO {
	// generation is a by-value copy, so defaulting its states stays local.
	for _, st := range []*GenerationStatus{
		&generation.Scan,
		&generation.Thumbnail,
		&generation.Preview,
		&generation.Fingerprint,
	} {
		if st.State == "" {
			st.State = "idle"
		}
	}

	cred := func(key string) string {
		return strings.TrimSpace(d.Credentials[key])
	}

	// An empty or malformed timestamp fails to parse and leaves the zero value.
	var lastCrawlAt int64
	if parsed, err := strconv.ParseInt(cred("last_crawl_at"), 10, 64); err == nil {
		lastCrawlAt = parsed
	}

	dto := crawlerDTO{
		ID:          d.ID,
		Name:        d.Name,
		Kind:        d.Kind,
		Builtin:     crawlerBuiltinForDrive(d),
		Status:      d.Status,
		LastError:   d.LastError,
		ScriptPath:  cred("script_path"),
		PythonPath:  cred("python_path"),
		Proxy:       cred("proxy"),
		TargetNew:   cred("target_new"),
		ConfigJSON:  cred("config_json"),
		LastCrawlAt: lastCrawlAt,
	}

	dto.ScanGenerationStatus = generation.Scan
	dto.ThumbnailGenerationStatus = generation.Thumbnail
	dto.PreviewGenerationStatus = generation.Preview
	dto.FingerprintGenerationStatus = generation.Fingerprint

	dto.ThumbnailReadyCount, dto.ThumbnailPendingCount, dto.ThumbnailFailedCount = thumb.Ready, thumb.Pending, thumb.Failed
	dto.TeaserReadyCount, dto.TeaserPendingCount, dto.TeaserFailedCount = teaser.Ready, teaser.Pending, teaser.Failed
	dto.FingerprintReadyCount, dto.FingerprintPendingCount, dto.FingerprintFailedCount = fp.Ready, fp.Pending, fp.Failed
	return dto
}
// crawlerBuiltinForDrive returns the trimmed "builtin" credential of d, or the
// empty string when d is nil.
func crawlerBuiltinForDrive(d *catalog.Drive) string {
	var builtin string
	if d != nil {
		builtin = strings.TrimSpace(d.Credentials["builtin"])
	}
	return builtin
}
// handleUpsertCrawler creates or updates a script crawler from an
// upsertCrawlerReq JSON body.
//
// Responses:
//   - 400 when the body is not valid JSON, id/name are blank, or the merged
//     credentials fail validation (see mergeScriptCrawlerCredentials).
//   - 409 when the id already belongs to a drive that is not a crawler, so a
//     crawler save can never silently convert a storage drive.
//   - 500 when the catalog write fails.
//   - 200 {"ok": true} on success, with an extra "warning" field when the
//     OnDriveSaved hook reports an error after the row was stored.
func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request) {
	var body upsertCrawlerReq
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
		writeErr(w, http.StatusBadRequest, err)
		return
	}
	id := strings.TrimSpace(body.ID)
	name := strings.TrimSpace(body.Name)
	if id == "" || name == "" {
		http.Error(w, "id and name are required", http.StatusBadRequest)
		return
	}

	// A lookup error is treated the same as "no existing row".
	existing, _ := a.Catalog.GetDrive(r.Context(), id)
	// This handler always stores Kind = scriptcrawler.Kind. Without this guard,
	// reusing the id of e.g. a cloud drive would overwrite its kind and carry
	// its credentials over into the crawler configuration.
	if existing != nil && !isCrawlerDriveKind(existing.Kind) {
		http.Error(w, "id is already used by a non-crawler drive", http.StatusConflict)
		return
	}

	// Start from the stored credentials so runtime keys survive the save.
	creds := map[string]string{}
	if existing != nil {
		for k, v := range existing.Credentials {
			creds[k] = v
		}
	}
	builtin := strings.TrimSpace(body.Builtin)
	if builtin != "" {
		creds["builtin"] = builtin
	}

	// The builtin spider91 crawler may omit the script path and fall back to
	// the server-provided default (which itself may be empty).
	scriptPath := strings.TrimSpace(body.ScriptPath)
	if scriptPath == "" && builtin == "spider91" && a.DefaultSpider91ScriptPath != nil {
		scriptPath = strings.TrimSpace(a.DefaultSpider91ScriptPath())
	}
	incoming := map[string]string{
		"script_path": scriptPath,
		"python_path": strings.TrimSpace(body.PythonPath),
		"proxy":       strings.TrimSpace(body.Proxy),
		"target_new":  strings.TrimSpace(body.TargetNew),
		"config_json": strings.TrimSpace(body.ConfigJSON),
	}
	for k, v := range incoming {
		creds[k] = v
	}

	merged, err := mergeScriptCrawlerCredentials(existing, creds)
	if err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}
	if builtin != "" {
		merged["builtin"] = builtin
	}

	d := &catalog.Drive{
		ID:            id,
		Kind:          scriptcrawler.Kind,
		Name:          name,
		RootID:        "/",
		Credentials:   merged,
		Status:        "disconnected",
		TeaserEnabled: true,
	}
	if existing != nil {
		// Keep the teaser toggle of an existing crawler instead of re-enabling it.
		d.TeaserEnabled = existing.TeaserEnabled
	}
	if err := a.Catalog.UpsertDrive(r.Context(), d); err != nil {
		writeErr(w, http.StatusInternalServerError, err)
		return
	}
	if a.OnDriveSaved != nil {
		if err := a.OnDriveSaved(id); err != nil {
			// The row is already saved; surface the hook failure as a warning only.
			writeJSON(w, http.StatusOK, map[string]any{"ok": true, "warning": err.Error()})
			return
		}
	}
	writeJSON(w, http.StatusOK, map[string]any{"ok": true})
}
// importCrawlerScriptURLReq is the JSON request body decoded by
// handleImportCrawlerScriptURL.
type importCrawlerScriptURLReq struct {
// URL is the http:// or https:// address the script is downloaded from.
URL string `json:"url"`
// FileName optionally overrides the name otherwise derived from the URL path.
FileName string `json:"fileName"`
}
// handleImportCrawlerScriptFile accepts a multipart upload with a "file" part,
// stores it through saveCrawlerScript and answers with
// {"scriptPath": <stored path>}. Every failure is reported as HTTP 400.
func (a *AdminServer) handleImportCrawlerScriptFile(w http.ResponseWriter, r *http.Request) {
	// Allow 1 MiB on top of the script limit for the multipart envelope.
	var requestLimit int64 = maxCrawlerScriptBytes + 1024*1024
	r.Body = http.MaxBytesReader(w, r.Body, requestLimit)
	if err := r.ParseMultipartForm(requestLimit); err != nil {
		writeErr(w, http.StatusBadRequest, err)
		return
	}

	upload, meta, err := r.FormFile("file")
	if err != nil {
		writeErr(w, http.StatusBadRequest, errors.New("file is required"))
		return
	}
	defer upload.Close()

	// Use the uploaded file name unless it is missing or blank.
	uploadName := "crawler.py"
	if meta != nil {
		if strings.TrimSpace(meta.Filename) != "" {
			uploadName = meta.Filename
		}
	}

	stored, err := a.saveCrawlerScript(r.Context(), uploadName, upload, maxCrawlerScriptBytes)
	if err != nil {
		writeErr(w, http.StatusBadRequest, err)
		return
	}
	writeJSON(w, http.StatusOK, map[string]any{"scriptPath": stored})
}
// handleImportCrawlerScriptURL downloads a crawler script from an
// admin-supplied http(s) URL, stores it through saveCrawlerScript and answers
// with {"scriptPath": <stored path>}.
//
// The target file name (explicit fileName, else the last URL path segment,
// else "crawler.py") is validated BEFORE any outbound request is made, so a
// request that can never be stored (e.g. a non-.py name) does not trigger a
// download at all.
//
// Responses: 400 for malformed input, an unsupported scheme, an invalid file
// name, an oversized Content-Length or a storage failure; 502 when the
// download fails or the upstream answers with a non-2xx status.
//
// NOTE(review): the URL is fetched server-side without any host restriction;
// this relies on the endpoint being reachable by trusted admins only.
func (a *AdminServer) handleImportCrawlerScriptURL(w http.ResponseWriter, r *http.Request) {
	var body importCrawlerScriptURLReq
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
		writeErr(w, http.StatusBadRequest, err)
		return
	}
	rawURL := strings.TrimSpace(body.URL)
	u, err := url.Parse(rawURL)
	if err != nil || u.Scheme == "" || u.Host == "" {
		writeErr(w, http.StatusBadRequest, errors.New("脚本链接格式无效"))
		return
	}
	if u.Scheme != "http" && u.Scheme != "https" {
		writeErr(w, http.StatusBadRequest, errors.New("脚本链接仅支持 http:// 或 https://"))
		return
	}

	// Resolve and validate the file name up front (fail fast, no network I/O).
	name := strings.TrimSpace(body.FileName)
	if name == "" {
		name = path.Base(u.Path)
	}
	if name == "." || name == "/" || name == "" {
		name = "crawler.py"
	}
	if _, err := safeCrawlerScriptFileName(name); err != nil {
		writeErr(w, http.StatusBadRequest, err)
		return
	}

	client := a.HTTPClient
	if client == nil {
		client = &http.Client{Timeout: 30 * time.Second}
	}
	req, err := http.NewRequestWithContext(r.Context(), http.MethodGet, u.String(), nil)
	if err != nil {
		writeErr(w, http.StatusBadRequest, err)
		return
	}
	req.Header.Set("User-Agent", "video-site-crawler-import/1.0")
	resp, err := client.Do(req)
	if err != nil {
		writeErr(w, http.StatusBadGateway, err)
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		writeErr(w, http.StatusBadGateway, fmt.Errorf("下载脚本失败: HTTP %d", resp.StatusCode))
		return
	}
	// Cheap early rejection; saveCrawlerScript still enforces the limit while
	// copying because Content-Length may be absent (-1) or wrong.
	if resp.ContentLength > maxCrawlerScriptBytes {
		writeErr(w, http.StatusBadRequest, fmt.Errorf("脚本文件不能超过 %d KiB", maxCrawlerScriptBytes/1024))
		return
	}

	scriptPath, err := a.saveCrawlerScript(r.Context(), name, resp.Body, maxCrawlerScriptBytes)
	if err != nil {
		writeErr(w, http.StatusBadRequest, err)
		return
	}
	writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath})
}
// saveCrawlerScript stores the script read from r inside the crawler script
// import directory and returns the absolute path of the stored file.
//
// The file name is sanitized by safeCrawlerScriptFileName and prefixed with a
// UTC timestamp. Data is first written to a ".part" file and renamed into
// place only after it has been verified to be non-empty and at most maxBytes
// long; on any failure the partial file is removed.
func (a *AdminServer) saveCrawlerScript(ctx context.Context, name string, r io.Reader, maxBytes int64) (string, error) {
	if err := ctx.Err(); err != nil {
		return "", err
	}
	safeName, err := safeCrawlerScriptFileName(name)
	if err != nil {
		return "", err
	}
	importDir, err := a.crawlerScriptImportDir()
	if err != nil {
		return "", err
	}
	if err := os.MkdirAll(importDir, 0o755); err != nil {
		return "", err
	}

	stamp := time.Now().UTC().Format("20060102T150405.000000000Z")
	target, err := filepath.Abs(filepath.Join(importDir, stamp+"-"+safeName))
	if err != nil {
		return "", err
	}
	importDirAbs, err := filepath.Abs(importDir)
	if err != nil {
		return "", err
	}
	// Defensive containment check: the target must live inside the import dir.
	contained := target == importDirAbs || strings.HasPrefix(target, importDirAbs+string(os.PathSeparator))
	if !contained {
		return "", errors.New("invalid crawler script path")
	}

	partial := target + ".part"
	// abort removes the partial file and reports the given error.
	abort := func(cause error) (string, error) {
		_ = os.Remove(partial)
		return "", cause
	}

	f, err := os.OpenFile(partial, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o644)
	if err != nil {
		return "", err
	}
	// Read one byte beyond the limit so an oversized input can be detected.
	n, copyErr := io.Copy(f, io.LimitReader(r, maxBytes+1))
	closeErr := f.Close()

	switch {
	case copyErr != nil:
		return abort(copyErr)
	case closeErr != nil:
		return abort(closeErr)
	case n <= 0:
		return abort(errors.New("脚本文件为空"))
	case n > maxBytes:
		return abort(fmt.Errorf("脚本文件不能超过 %d KiB", maxBytes/1024))
	}

	if err := os.Rename(partial, target); err != nil {
		return abort(err)
	}
	return target, nil
}
// crawlerScriptImportDir returns the absolute path of the "crawler-scripts"
// directory, a sibling of the local preview directory. When LocalPreviewDir is
// blank, ./data/previews is used as the reference location.
func (a *AdminServer) crawlerScriptImportDir() (string, error) {
	previewDir := strings.TrimSpace(a.LocalPreviewDir)
	if previewDir == "" {
		previewDir = filepath.Join(".", "data", "previews")
	}
	parent := filepath.Dir(previewDir)
	return filepath.Abs(filepath.Join(parent, "crawler-scripts"))
}
func safeCrawlerScriptFileName(raw string) (string, error) {
name := strings.TrimSpace(filepath.Base(raw))
if name == "" || name == "." || name == string(os.PathSeparator) {
name = "crawler.py"
}
ext := strings.ToLower(filepath.Ext(name))
if ext != ".py" {
return "", errors.New("目前只支持导入 .py 爬虫脚本")
}
stem := strings.TrimSuffix(name, filepath.Ext(name))
var b strings.Builder
for _, r := range stem {
if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '-' || r == '_' || r == '.' {
b.WriteRune(r)
} else {
b.WriteByte('_')
}
}
cleanStem := strings.Trim(b.String(), "._-")
if cleanStem == "" {
cleanStem = "crawler"
}
return cleanStem + ".py", nil
}
// handleRunCrawler requests a scan for the crawler named by the {id} URL
// parameter. It always answers 202 Accepted; the "accepted" field tells
// whether the run was actually scheduled. While the nightly job is running or
// queued the request is declined with fullScanBusyMessage; when the
// OnScanRequested hook declines, driveTaskBusyMessage is returned.
func (a *AdminServer) handleRunCrawler(w http.ResponseWriter, r *http.Request) {
	crawlerID := chi.URLParam(r, "id")

	if job := a.nightlyJobStatus(); job.Running || job.Queued {
		writeJSON(w, http.StatusAccepted, map[string]any{
			"ok":       true,
			"accepted": false,
			"message":  fullScanBusyMessage,
			"status":   job,
		})
		return
	}

	// Without a hook the request counts as accepted.
	scheduled := a.OnScanRequested == nil || a.OnScanRequested(crawlerID)
	payload := map[string]any{"ok": true, "accepted": scheduled}
	if !scheduled {
		payload["message"] = driveTaskBusyMessage
	}
	writeJSON(w, http.StatusAccepted, payload)
}
// handleStopCrawlerTasks is the crawler-route entry point for stopping tasks;
// it delegates unchanged to handleStopDriveTasks.
func (a *AdminServer) handleStopCrawlerTasks(w http.ResponseWriter, r *http.Request) {
a.handleStopDriveTasks(w, r)
}
// handleDeleteCrawler is the crawler-route entry point for deletion; it
// delegates unchanged to handleDeleteDrive.
func (a *AdminServer) handleDeleteCrawler(w http.ResponseWriter, r *http.Request) {
a.handleDeleteDrive(w, r)
}
// isCrawlerDriveKind reports whether kind identifies a script crawler, i.e.
// equals scriptcrawler.Kind. The comparison is exact (no trimming or case folding).
func isCrawlerDriveKind(kind string) bool {
return kind == scriptcrawler.Kind
}
func spider91ProxyForDrive(d *catalog.Drive) string {
if d == nil || d.Kind != "spider91" || d.Credentials == nil {
return ""
@@ -607,6 +1055,13 @@ func spider91ProxyForDrive(d *catalog.Drive) string {
return strings.TrimSpace(d.Credentials["proxy"])
}
// scriptCrawlerCred returns the trimmed credential stored under key for a
// script crawler drive. It yields "" when d is nil, is not a script crawler,
// or has no credentials.
func scriptCrawlerCred(d *catalog.Drive, key string) string {
	if d != nil && d.Kind == scriptcrawler.Kind && d.Credentials != nil {
		return strings.TrimSpace(d.Credentials[key])
	}
	return ""
}
func googleDriveUseOnlineAPIForDrive(d *catalog.Drive) *bool {
if d == nil || d.Kind != "googledrive" {
return nil
@@ -676,20 +1131,89 @@ func mergeSpider91Credentials(existing *catalog.Drive, incoming map[string]strin
return merged, nil
}
// mergeScriptCrawlerCredentials overlays incoming onto the credentials of
// existing (which may be nil) and returns the validated result.
//
// Keys and values are whitespace-trimmed; blank keys are ignored. Per key:
//   - proxy: validated by normalizeCrawlerProxyURL; empty clears it.
//   - target_new: must be a positive integer, stored normalized; empty clears it.
//   - config_json: must be valid JSON; empty clears it.
//   - script_path: empty clears it.
//   - python_path: empty clears it only for a new drive, otherwise the stored
//     value is kept.
//   - anything else: stored as-is, empty clears it.
//
// An error is returned for invalid values, or when the result has no
// script_path and its builtin is not "spider91" (case-insensitive).
func mergeScriptCrawlerCredentials(existing *catalog.Drive, incoming map[string]string) (map[string]string, error) {
	result := make(map[string]string)
	if existing != nil {
		for name, stored := range existing.Credentials {
			result[name] = stored
		}
	}

	// put stores a non-empty value and removes the key otherwise.
	put := func(name, val string) {
		if val == "" {
			delete(result, name)
			return
		}
		result[name] = val
	}

	for rawKey, rawVal := range incoming {
		name := strings.TrimSpace(rawKey)
		if name == "" {
			continue
		}
		val := strings.TrimSpace(rawVal)

		switch name {
		case "proxy":
			normalized, err := normalizeCrawlerProxyURL(val, "脚本爬虫")
			if err != nil {
				return nil, err
			}
			put(name, normalized)

		case "target_new":
			if val == "" {
				delete(result, name)
				continue
			}
			count, err := strconv.Atoi(val)
			if err != nil || count <= 0 {
				return nil, fmt.Errorf("脚本爬虫 target_new 必须是正整数")
			}
			result[name] = strconv.Itoa(count)

		case "config_json":
			if val == "" {
				delete(result, name)
				continue
			}
			if !json.Valid([]byte(val)) {
				return nil, fmt.Errorf("脚本爬虫自定义配置必须是合法 JSON")
			}
			result[name] = val

		case "python_path", "script_path":
			if val != "" {
				result[name] = val
				continue
			}
			// A blank python_path on an existing drive keeps the stored value.
			if existing == nil || name == "script_path" {
				delete(result, name)
			}

		default:
			put(name, val)
		}
	}

	hasScript := strings.TrimSpace(result["script_path"]) != ""
	builtinSpider91 := strings.EqualFold(strings.TrimSpace(result["builtin"]), "spider91")
	if !hasScript && !builtinSpider91 {
		return nil, fmt.Errorf("脚本爬虫必须填写 script_path")
	}
	return result, nil
}
// normalizeSpider91ProxyURL validates a proxy URL for the legacy 91Spider
// configuration by calling normalizeCrawlerProxyURL with the "91Spider" label.
func normalizeSpider91ProxyURL(raw string) (string, error) {
return normalizeCrawlerProxyURL(raw, "91Spider")
}
func normalizeCrawlerProxyURL(raw, label string) (string, error) {
proxy := strings.TrimSpace(raw)
if proxy == "" {
return "", nil
}
u, err := url.Parse(proxy)
if err != nil || u.Scheme == "" || u.Host == "" {
return "", fmt.Errorf("91Spider 代理地址格式无效,请填写类似 http://127.0.0.1:7890 的地址")
return "", fmt.Errorf("%s 代理地址格式无效,请填写类似 http://127.0.0.1:7890 的地址", label)
}
switch strings.ToLower(u.Scheme) {
case "http", "https", "socks5", "socks5h":
return proxy, nil
default:
return "", fmt.Errorf("91Spider 代理地址仅支持 http://、https://、socks5:// 或 socks5h://")
return "", fmt.Errorf("%s 代理地址仅支持 http://、https://、socks5:// 或 socks5h://", label)
}
}
+252 -75
View File
@@ -5,6 +5,7 @@ import (
"context"
"database/sql"
"encoding/json"
"mime/multipart"
"net/http"
"net/http/httptest"
"os"
@@ -672,7 +673,7 @@ func TestHandleUpsertGoogleDriveMergesOAuthCredentials(t *testing.T) {
}
}
func TestHandleUpsertSpider91ProxyPreservesRuntimeCredentials(t *testing.T) {
func TestHandleUpsertSpider91DriveIsRejected(t *testing.T) {
ctx := context.Background()
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
if err != nil {
@@ -708,16 +709,19 @@ func TestHandleUpsertSpider91ProxyPreservesRuntimeCredentials(t *testing.T) {
}`))
rr := httptest.NewRecorder()
(&AdminServer{Catalog: cat}).handleUpsertDrive(rr, req)
if rr.Code != http.StatusOK {
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
if rr.Code != http.StatusBadRequest {
t.Fatalf("status = %d, want 400; body = %s", rr.Code, rr.Body.String())
}
if !strings.Contains(rr.Body.String(), "爬虫管理") {
t.Fatalf("body = %q, want crawler management guidance", rr.Body.String())
}
got, err := cat.GetDrive(ctx, "spider91-main")
if err != nil {
t.Fatalf("get drive: %v", err)
}
if got.Credentials["proxy"] != "socks5h://proxy-user:proxy-pass@127.0.0.1:7891" {
t.Fatalf("proxy = %q, want trimmed new proxy", got.Credentials["proxy"])
if got.Credentials["proxy"] != "http://old-proxy.local:7890" {
t.Fatalf("proxy = %q, want unchanged old proxy", got.Credentials["proxy"])
}
if got.Credentials["last_crawl_at"] != "1800000000" {
t.Fatalf("last_crawl_at = %q, want preserved", got.Credentials["last_crawl_at"])
@@ -725,59 +729,6 @@ func TestHandleUpsertSpider91ProxyPreservesRuntimeCredentials(t *testing.T) {
if got.Credentials["script_path"] == "" {
t.Fatalf("script_path should be preserved")
}
req = httptest.NewRequest(http.MethodPost, "/admin/api/drives", strings.NewReader(`{
"id": "spider91-main",
"kind": "spider91",
"name": "91 Spider",
"rootId": "/",
"credentials": {"proxy": " "}
}`))
rr = httptest.NewRecorder()
(&AdminServer{Catalog: cat}).handleUpsertDrive(rr, req)
if rr.Code != http.StatusOK {
t.Fatalf("clear status = %d, body = %s", rr.Code, rr.Body.String())
}
got, err = cat.GetDrive(ctx, "spider91-main")
if err != nil {
t.Fatalf("get cleared drive: %v", err)
}
if _, ok := got.Credentials["proxy"]; ok {
t.Fatalf("proxy should be removed after empty save, got %q", got.Credentials["proxy"])
}
if got.Credentials["last_crawl_at"] != "1800000000" {
t.Fatalf("last_crawl_at after clear = %q, want preserved", got.Credentials["last_crawl_at"])
}
}
// TestHandleUpsertSpider91RejectsUnsupportedProxyScheme posts a spider91 drive
// whose proxy uses the ftp:// scheme and expects handleUpsertDrive to answer
// 400 with the supported-schemes message.
//
// NOTE(review): the handleUpsertDrive branch visible in this change appears to
// reject every kind "spider91" request with a different message before any
// proxy validation; confirm whether this test is still expected to pass.
func TestHandleUpsertSpider91RejectsUnsupportedProxyScheme(t *testing.T) {
// Fresh on-disk catalog in a per-test temporary directory.
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
if err != nil {
t.Fatalf("open catalog: %v", err)
}
t.Cleanup(func() {
if err := cat.Close(); err != nil {
t.Fatalf("close catalog: %v", err)
}
})
req := httptest.NewRequest(http.MethodPost, "/admin/api/drives", strings.NewReader(`{
"id": "spider91-main",
"kind": "spider91",
"name": "91 Spider",
"rootId": "/",
"credentials": {"proxy": "ftp://127.0.0.1:21"}
}`))
rr := httptest.NewRecorder()
(&AdminServer{Catalog: cat}).handleUpsertDrive(rr, req)
if rr.Code != http.StatusBadRequest {
t.Fatalf("status = %d, want 400; body = %s", rr.Code, rr.Body.String())
}
// The error text must list the supported proxy schemes.
if !strings.Contains(rr.Body.String(), "socks5:// 或 socks5h://") {
t.Fatalf("body = %q, want supported schemes message", rr.Body.String())
}
}
func TestHandleDeleteDriveRunsRequestedCleanupBeforeDeletingDrive(t *testing.T) {
@@ -890,7 +841,7 @@ func TestHandleDeleteDriveRequiresCleanupConfirmation(t *testing.T) {
}
}
func TestHandleListDrivesIncludesSpider91Proxy(t *testing.T) {
func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
ctx := context.Background()
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
if err != nil {
@@ -911,6 +862,20 @@ func TestHandleListDrivesIncludesSpider91Proxy(t *testing.T) {
Credentials: map[string]string{
"last_crawl_at": "1800000000",
"proxy": " http://127.0.0.1:7890 ",
"script_path": "/opt/video-site-91/91VideoSpider/spider_91porn.py",
},
Status: "ok",
},
{
ID: "crawler-spider91",
Kind: "scriptcrawler",
Name: "91 Spider",
RootID: "/",
Credentials: map[string]string{
"builtin": "spider91",
"last_crawl_at": "1800000000",
"proxy": " http://127.0.0.1:7890 ",
"script_path": "/opt/video-site-91/91VideoSpider/spider_91porn.py",
},
Status: "ok",
},
@@ -930,39 +895,251 @@ func TestHandleListDrivesIncludesSpider91Proxy(t *testing.T) {
}
}
req := httptest.NewRequest(http.MethodGet, "/admin/api/drives", nil)
req := httptest.NewRequest(http.MethodGet, "/admin/api/crawlers", nil)
rr := httptest.NewRecorder()
(&AdminServer{Catalog: cat}).handleListDrives(rr, req)
srv := &AdminServer{Catalog: cat}
srv.handleListCrawlers(rr, req)
if rr.Code != http.StatusOK {
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
}
var got []struct {
ID string `json:"id"`
Spider91Proxy string `json:"spider91Proxy"`
LastCrawlAt int64 `json:"lastCrawlAt"`
ID string `json:"id"`
Kind string `json:"kind"`
Builtin string `json:"builtin"`
Proxy string `json:"proxy"`
LastCrawlAt int64 `json:"lastCrawlAt"`
}
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
t.Fatalf("decode: %v", err)
}
byID := map[string]struct {
Spider91Proxy string
LastCrawlAt int64
Kind string
Builtin string
Proxy string
LastCrawlAt int64
}{}
for _, d := range got {
byID[d.ID] = struct {
Spider91Proxy string
LastCrawlAt int64
}{Spider91Proxy: d.Spider91Proxy, LastCrawlAt: d.LastCrawlAt}
Kind string
Builtin string
Proxy string
LastCrawlAt int64
}{Kind: d.Kind, Builtin: d.Builtin, Proxy: d.Proxy, LastCrawlAt: d.LastCrawlAt}
}
if byID["spider91-main"].Spider91Proxy != "http://127.0.0.1:7890" {
t.Fatalf("spider91 proxy = %q, want trimmed proxy", byID["spider91-main"].Spider91Proxy)
if _, ok := byID["spider91-main"]; ok {
t.Fatal("legacy spider91 drive should not be returned by crawler list")
}
if byID["spider91-main"].LastCrawlAt != 1800000000 {
t.Fatalf("lastCrawlAt = %d, want 1800000000", byID["spider91-main"].LastCrawlAt)
if byID["crawler-spider91"].Kind != "scriptcrawler" || byID["crawler-spider91"].Builtin != "spider91" {
t.Fatalf("crawler kind/builtin = %q/%q, want scriptcrawler/spider91", byID["crawler-spider91"].Kind, byID["crawler-spider91"].Builtin)
}
if byID["onedrive-main"].Spider91Proxy != "" {
t.Fatalf("onedrive spider91Proxy = %q, want empty", byID["onedrive-main"].Spider91Proxy)
if byID["crawler-spider91"].Proxy != "http://127.0.0.1:7890" {
t.Fatalf("crawler proxy = %q, want trimmed proxy", byID["crawler-spider91"].Proxy)
}
if byID["crawler-spider91"].LastCrawlAt != 1800000000 {
t.Fatalf("lastCrawlAt = %d, want 1800000000", byID["crawler-spider91"].LastCrawlAt)
}
if _, ok := byID["onedrive-main"]; ok {
t.Fatal("onedrive should not be returned by crawler list")
}
driveReq := httptest.NewRequest(http.MethodGet, "/admin/api/drives", nil)
driveRR := httptest.NewRecorder()
srv.handleListDrives(driveRR, driveReq)
if driveRR.Code != http.StatusOK {
t.Fatalf("drive status = %d, body = %s", driveRR.Code, driveRR.Body.String())
}
var drives []struct {
ID string `json:"id"`
}
if err := json.NewDecoder(driveRR.Body).Decode(&drives); err != nil {
t.Fatalf("decode drives: %v", err)
}
driveIDs := map[string]bool{}
for _, d := range drives {
driveIDs[d.ID] = true
}
if !driveIDs["spider91-main"] {
t.Fatal("legacy spider91 drive should remain visible in drive list for deletion")
}
if driveIDs["crawler-spider91"] {
t.Fatal("scriptcrawler should not be returned by drive list")
}
}
// TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath verifies that
// a builtin spider91 crawler can be saved with an empty script path even when
// the default script path provider yields nothing: the row is stored as a
// scriptcrawler with builtin=spider91 and no script_path.
func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T) {
	store, err := catalog.Open(t.TempDir() + "/catalog.db")
	if err != nil {
		t.Fatalf("open catalog: %v", err)
	}
	t.Cleanup(func() {
		if err := store.Close(); err != nil {
			t.Fatalf("close catalog: %v", err)
		}
	})

	// The default provider is present but returns no path.
	srv := &AdminServer{
		Catalog:                   store,
		DefaultSpider91ScriptPath: func() string { return "" },
	}
	payload := strings.NewReader(`{
"id": "spider91-main",
"name": "91 Spider",
"builtin": "spider91",
"scriptPath": "",
"pythonPath": "python3",
"targetNew": "15"
}`)
	rec := httptest.NewRecorder()
	srv.handleUpsertCrawler(rec, httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", payload))
	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
	}

	saved, err := store.GetDrive(context.Background(), "spider91-main")
	if err != nil {
		t.Fatalf("get crawler drive: %v", err)
	}
	if saved.Kind != "scriptcrawler" || saved.Credentials["builtin"] != "spider91" {
		t.Fatalf("kind/builtin = %q/%q, want scriptcrawler/spider91", saved.Kind, saved.Credentials["builtin"])
	}
	if saved.Credentials["script_path"] != "" {
		t.Fatalf("script_path = %q, want empty when default is unavailable", saved.Credentials["script_path"])
	}
}
// TestHandleImportCrawlerScriptFile uploads a .py script whose file name
// contains a parent-directory segment and a space, and verifies that it is
// stored with unchanged content under the crawler-scripts directory next to
// the preview directory.
func TestHandleImportCrawlerScriptFile(t *testing.T) {
	baseDir := t.TempDir()

	// Build the multipart upload body.
	var payload bytes.Buffer
	form := multipart.NewWriter(&payload)
	filePart, err := form.CreateFormFile("file", "../demo crawler.py")
	if err != nil {
		t.Fatalf("create form file: %v", err)
	}
	if _, err := filePart.Write([]byte("print('crawler')\n")); err != nil {
		t.Fatalf("write part: %v", err)
	}
	if err := form.Close(); err != nil {
		t.Fatalf("close multipart: %v", err)
	}

	srv := &AdminServer{LocalPreviewDir: filepath.Join(baseDir, "previews")}
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-file", &payload)
	req.Header.Set("Content-Type", form.FormDataContentType())
	srv.handleImportCrawlerScriptFile(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
	}

	var resp struct {
		ScriptPath string `json:"scriptPath"`
	}
	if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil {
		t.Fatalf("decode: %v", err)
	}

	// The stored file must stay inside <baseDir>/crawler-scripts.
	scriptsDir := filepath.Join(baseDir, "crawler-scripts")
	if !strings.HasPrefix(resp.ScriptPath, scriptsDir+string(os.PathSeparator)) {
		t.Fatalf("script path = %q, want under %q", resp.ScriptPath, scriptsDir)
	}
	if filepath.Ext(resp.ScriptPath) != ".py" {
		t.Fatalf("script path = %q, want .py", resp.ScriptPath)
	}
	stored, err := os.ReadFile(resp.ScriptPath)
	if err != nil {
		t.Fatalf("read imported script: %v", err)
	}
	if string(stored) != "print('crawler')\n" {
		t.Fatalf("script content = %q", string(stored))
	}
}
// TestHandleImportCrawlerScriptFileRejectsNonPython uploads a file named
// crawler.txt and expects a 400 response whose body mentions ".py".
func TestHandleImportCrawlerScriptFileRejectsNonPython(t *testing.T) {
	baseDir := t.TempDir()

	var payload bytes.Buffer
	form := multipart.NewWriter(&payload)
	filePart, err := form.CreateFormFile("file", "crawler.txt")
	if err != nil {
		t.Fatalf("create form file: %v", err)
	}
	if _, err := filePart.Write([]byte("print('crawler')\n")); err != nil {
		t.Fatalf("write part: %v", err)
	}
	if err := form.Close(); err != nil {
		t.Fatalf("close multipart: %v", err)
	}

	srv := &AdminServer{LocalPreviewDir: filepath.Join(baseDir, "previews")}
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-file", &payload)
	req.Header.Set("Content-Type", form.FormDataContentType())
	srv.handleImportCrawlerScriptFile(rec, req)

	if rec.Code != http.StatusBadRequest {
		t.Fatalf("status = %d, want 400; body = %s", rec.Code, rec.Body.String())
	}
	if !strings.Contains(rec.Body.String(), ".py") {
		t.Fatalf("body = %s, want .py error", rec.Body.String())
	}
}
// TestHandleImportCrawlerScriptURL serves a script from a local test server
// and verifies that importing it by URL stores the exact content under the
// crawler-scripts directory.
func TestHandleImportCrawlerScriptURL(t *testing.T) {
	baseDir := t.TempDir()

	// Upstream that only knows /crawler.py.
	origin := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/crawler.py" {
			_, _ = w.Write([]byte("# crawler from url\n"))
			return
		}
		http.NotFound(w, r)
	}))
	defer origin.Close()

	srv := &AdminServer{LocalPreviewDir: filepath.Join(baseDir, "previews")}
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-url", strings.NewReader(`{
"url": "`+origin.URL+`/crawler.py"
}`))
	srv.handleImportCrawlerScriptURL(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
	}

	var resp struct {
		ScriptPath string `json:"scriptPath"`
	}
	if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil {
		t.Fatalf("decode: %v", err)
	}
	scriptsDir := filepath.Join(baseDir, "crawler-scripts")
	if !strings.HasPrefix(resp.ScriptPath, scriptsDir+string(os.PathSeparator)) {
		t.Fatalf("script path = %q, want under %q", resp.ScriptPath, scriptsDir)
	}
	stored, err := os.ReadFile(resp.ScriptPath)
	if err != nil {
		t.Fatalf("read imported script: %v", err)
	}
	if string(stored) != "# crawler from url\n" {
		t.Fatalf("script content = %q", string(stored))
	}
}
// TestHandleImportCrawlerScriptURLRejectsNonPython imports a URL whose path
// ends in .txt and expects a 400 response whose body mentions ".py".
func TestHandleImportCrawlerScriptURLRejectsNonPython(t *testing.T) {
	baseDir := t.TempDir()

	origin := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/crawler.txt" {
			_, _ = w.Write([]byte("# crawler from url\n"))
			return
		}
		http.NotFound(w, r)
	}))
	defer origin.Close()

	srv := &AdminServer{LocalPreviewDir: filepath.Join(baseDir, "previews")}
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-url", strings.NewReader(`{
"url": "`+origin.URL+`/crawler.txt"
}`))
	srv.handleImportCrawlerScriptURL(rec, req)

	if rec.Code != http.StatusBadRequest {
		t.Fatalf("status = %d, want 400; body = %s", rec.Code, rec.Body.String())
	}
	if !strings.Contains(rec.Body.String(), ".py") {
		t.Fatalf("body = %s, want .py error", rec.Body.String())
	}
}
+13 -1
View File
@@ -706,7 +706,19 @@ func (c *Catalog) ListVideoFileIDsByDrive(ctx context.Context, driveID string) (
// 用途:crawler 把这个集合写到 seen 文件,让 Python/Go 跳过已爬过的视频,
// 配合 --target-new 真正凑出 N 个未爬过的视频。
// It is kept for existing callers and simply delegates to
// ListCrawlerSourceIDs with the "spider91" kind.
func (c *Catalog) ListSpider91Viewkeys(ctx context.Context, driveID string) ([]string, error) {
	return c.ListCrawlerSourceIDs(ctx, "spider91", driveID)
}
// ListCrawlerSourceIDs lists source IDs that were already imported by a
// crawler-like drive. It reads both videos and deleted_videos so explicit admin
// deletions remain tombstoned for future crawler runs.
func (c *Catalog) ListCrawlerSourceIDs(ctx context.Context, kind, driveID string) ([]string, error) {
kind = strings.TrimSpace(kind)
driveID = strings.TrimSpace(driveID)
if kind == "" || driveID == "" {
return nil, nil
}
prefix := kind + "-" + driveID + "-"
rows, err := c.db.QueryContext(ctx,
`SELECT SUBSTR(id, ?) FROM videos WHERE id LIKE ? || '%'
UNION
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,268 @@
package scriptcrawler
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"github.com/video-site/backend/internal/catalog"
)
// TestCrawlerRunOnceImportsLocalFileAndSkipsExisting runs the crawler twice
// against a helper process: the first run must import exactly one video
// (source ID "abc-123") into the catalog and the videos directory, and the
// second run must skip it as already seen.
func TestCrawlerRunOnceImportsLocalFileAndSkipsExisting(t *testing.T) {
ctx := context.Background()
tmp := t.TempDir()
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
if err != nil {
t.Fatalf("open catalog: %v", err)
}
t.Cleanup(func() {
if err := cat.Close(); err != nil {
t.Fatalf("close catalog: %v", err)
}
})
drv := New(Config{ID: "demo", RootDir: filepath.Join(tmp, "crawler")})
if err := drv.Init(ctx); err != nil {
t.Fatalf("driver init: %v", err)
}
// Placeholder file passed as ScriptPath; its content is just "helper".
dummyScript := filepath.Join(tmp, "helper-script")
if err := os.WriteFile(dummyScript, []byte("helper"), 0o755); err != nil {
t.Fatalf("write dummy script: %v", err)
}
// The "python" executable is a shell wrapper that re-executes this test
// binary restricted to TestScriptCrawlerHelperProcess, forwarding all args.
wrapper := filepath.Join(tmp, "helper-wrapper.sh")
wrapperScript := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0])
if err := os.WriteFile(wrapper, []byte(wrapperScript), 0o755); err != nil {
t.Fatalf("write helper wrapper: %v", err)
}
// Presumably the switch that makes the helper test act as the crawler
// script; the helper itself is not visible here — verify there.
t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1")
c := NewCrawler(CrawlerConfig{
Driver: drv,
Catalog: cat,
PythonPath: wrapper,
ScriptPath: dummyScript,
})
// First run: one new video, nothing skipped or failed.
res, err := c.RunOnce(ctx, 1)
if err != nil {
t.Fatalf("run once: %v", err)
}
if res.NewVideos != 1 || res.Skipped != 0 || res.Failed != 0 {
t.Fatalf("result = new:%d skipped:%d failed:%d, want 1/0/0", res.NewVideos, res.Skipped, res.Failed)
}
v, err := cat.GetVideo(ctx, BuildVideoID("demo", "abc-123"))
if err != nil {
t.Fatalf("get video: %v", err)
}
if v.Title != "Imported From Helper" || v.FileID != "abc-123.mp4" || v.Size == 0 {
t.Fatalf("video = title:%q file:%q size:%d", v.Title, v.FileID, v.Size)
}
if _, err := os.Stat(filepath.Join(drv.VideosDir(), "abc-123.mp4")); err != nil {
t.Fatalf("video file not copied: %v", err)
}
// Second run: the same item must be skipped and counted in the seen snapshot.
res, err = c.RunOnce(ctx, 1)
if err != nil {
t.Fatalf("second run: %v", err)
}
if res.NewVideos != 0 || res.Skipped != 1 {
t.Fatalf("second result = new:%d skipped:%d, want 0/1", res.NewVideos, res.Skipped)
}
if res.SeenSnapshot != 1 {
t.Fatalf("seen snapshot = %d, want 1", res.SeenSnapshot)
}
}
// TestCrawlerRunOnceUsesSourceKindNamespace checks that a crawler configured
// with SourceKind "spider91" stores its video under the ID produced by
// BuildVideoIDForKind rather than the default BuildVideoID, and that a second
// run recognises the video as already seen.
func TestCrawlerRunOnceUsesSourceKindNamespace(t *testing.T) {
ctx := context.Background()
tmp := t.TempDir()
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
if err != nil {
t.Fatalf("open catalog: %v", err)
}
t.Cleanup(func() {
if err := cat.Close(); err != nil {
t.Fatalf("close catalog: %v", err)
}
})
drv := New(Config{ID: "demo", RootDir: filepath.Join(tmp, "crawler")})
if err := drv.Init(ctx); err != nil {
t.Fatalf("driver init: %v", err)
}
// Placeholder script file; the wrapper below plays the interpreter role.
dummyScript := filepath.Join(tmp, "helper-script")
if err := os.WriteFile(dummyScript, []byte("helper"), 0o755); err != nil {
t.Fatalf("write dummy script: %v", err)
}
// Re-executes this test binary as the fake crawler, forwarding all arguments.
wrapper := filepath.Join(tmp, "helper-wrapper.sh")
wrapperScript := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0])
if err := os.WriteFile(wrapper, []byte(wrapperScript), 0o755); err != nil {
t.Fatalf("write helper wrapper: %v", err)
}
t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1")
c := NewCrawler(CrawlerConfig{
Driver: drv,
Catalog: cat,
SourceKind: "spider91",
PythonPath: wrapper,
ScriptPath: dummyScript,
})
// First run: nothing is known yet, so the seen snapshot must be empty.
res, err := c.RunOnce(ctx, 1)
if err != nil {
t.Fatalf("run once: %v", err)
}
if res.NewVideos != 1 || res.SeenSnapshot != 0 {
t.Fatalf("result = new:%d seen:%d, want 1/0", res.NewVideos, res.SeenSnapshot)
}
// The video must exist under the source-kind ID only.
videoID := BuildVideoIDForKind("spider91", "demo", "abc-123")
if _, err := cat.GetVideo(ctx, videoID); err != nil {
t.Fatalf("get source-kind video: %v", err)
}
if _, err := cat.GetVideo(ctx, BuildVideoID("demo", "abc-123")); err == nil {
t.Fatalf("default namespace video unexpectedly exists")
}
// Second run: the item is skipped and shows up in the seen snapshot.
res, err = c.RunOnce(ctx, 1)
if err != nil {
t.Fatalf("second run: %v", err)
}
if res.NewVideos != 0 || res.Skipped != 1 || res.SeenSnapshot != 1 {
t.Fatalf("second result = new:%d skipped:%d seen:%d, want 0/1/1", res.NewVideos, res.Skipped, res.SeenSnapshot)
}
}
// TestCrawlerRunOnceImportsSimpleMediaURLWithoutSourceID covers the minimal
// item shape (title + media_url, no type and no source_id). The video must be
// downloaded from the test HTTP server and stored under a generated "auto-" ID.
// A second run that differs only in the URL query string must be skipped.
func TestCrawlerRunOnceImportsSimpleMediaURLWithoutSourceID(t *testing.T) {
ctx := context.Background()
tmp := t.TempDir()
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
if err != nil {
t.Fatalf("open catalog: %v", err)
}
t.Cleanup(func() {
if err := cat.Close(); err != nil {
t.Fatalf("close catalog: %v", err)
}
})
// Serves a small fake video at /video.mp4 and 404s everything else.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/video.mp4" {
http.NotFound(w, r)
return
}
_, _ = w.Write([]byte("simple-video-bytes"))
}))
defer srv.Close()
drv := New(Config{ID: "demo", RootDir: filepath.Join(tmp, "crawler")})
if err := drv.Init(ctx); err != nil {
t.Fatalf("driver init: %v", err)
}
// Placeholder script file; the wrapper below plays the interpreter role.
dummyScript := filepath.Join(tmp, "helper-script")
if err := os.WriteFile(dummyScript, []byte("helper"), 0o755); err != nil {
t.Fatalf("write dummy script: %v", err)
}
// Re-executes this test binary as the fake crawler, forwarding all arguments.
wrapper := filepath.Join(tmp, "helper-wrapper.sh")
wrapperScript := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0])
if err := os.WriteFile(wrapper, []byte(wrapperScript), 0o755); err != nil {
t.Fatalf("write helper wrapper: %v", err)
}
// HELPER enables the fake crawler, SIMPLE selects its minimal-event branch,
// and MEDIA_URL is the URL that branch reports.
t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1")
t.Setenv("GO_WANT_SCRIPTCRAWLER_SIMPLE", "1")
t.Setenv("GO_SCRIPTCRAWLER_MEDIA_URL", srv.URL+"/video.mp4?token=first")
c := NewCrawler(CrawlerConfig{
Driver: drv,
Catalog: cat,
PythonPath: wrapper,
ScriptPath: dummyScript,
HTTPClient: srv.Client(),
})
res, err := c.RunOnce(ctx, 1)
if err != nil {
t.Fatalf("run once: %v", err)
}
if res.NewVideos != 1 || res.Skipped != 0 || res.Failed != 0 {
t.Fatalf("result = new:%d skipped:%d failed:%d, want 1/0/0", res.NewVideos, res.Skipped, res.Failed)
}
videos, err := cat.ListVideosByDrive(ctx, "demo")
if err != nil {
t.Fatalf("list videos: %v", err)
}
if len(videos) != 1 {
t.Fatalf("videos = %d, want 1", len(videos))
}
v := videos[0]
// No source_id was supplied, so the stored ID must carry the generated prefix.
if !strings.HasPrefix(v.ID, BuildVideoID("demo", "auto-")) {
t.Fatalf("video id = %q, want generated auto source id", v.ID)
}
if v.Title != "Simple Protocol Video" || v.Ext != "mp4" || v.ThumbnailURL != "" || v.Size == 0 {
t.Fatalf("video = title:%q ext:%q thumb:%q size:%d", v.Title, v.Ext, v.ThumbnailURL, v.Size)
}
if _, err := os.Stat(filepath.Join(drv.VideosDir(), v.FileID)); err != nil {
t.Fatalf("video file not downloaded: %v", err)
}
// Same path, different query token: the run is still expected to skip it.
t.Setenv("GO_SCRIPTCRAWLER_MEDIA_URL", srv.URL+"/video.mp4?token=second")
res, err = c.RunOnce(ctx, 1)
if err != nil {
t.Fatalf("second run: %v", err)
}
if res.NewVideos != 0 || res.Skipped != 1 {
t.Fatalf("second result = new:%d skipped:%d, want 0/1", res.NewVideos, res.Skipped)
}
}
// TestScriptCrawlerHelperProcess is not a real test: it is the fake crawler
// script the other tests launch by re-executing the test binary. It returns
// immediately unless GO_WANT_SCRIPTCRAWLER_HELPER is "1". Otherwise it reads
// the job file named after "--job", writes one event as a JSON line to stdout
// and exits the process (status 0 on success, 2 on any setup error).
func TestScriptCrawlerHelperProcess(t *testing.T) {
if os.Getenv("GO_WANT_SCRIPTCRAWLER_HELPER") != "1" {
return
}
// Locate the value following "--job" anywhere in the raw argument list.
args := os.Args
jobPath := ""
for i := 0; i < len(args)-1; i++ {
if args[i] == "--job" {
jobPath = args[i+1]
break
}
}
if jobPath == "" {
fmt.Fprintln(os.Stderr, "missing --job")
os.Exit(2)
}
data, err := os.ReadFile(jobPath)
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
}
var job Job
if err := json.Unmarshal(data, &job); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
}
// Simple mode: emit only title + media_url (no "type", no source ID), with
// the URL taken from GO_SCRIPTCRAWLER_MEDIA_URL.
if os.Getenv("GO_WANT_SCRIPTCRAWLER_SIMPLE") == "1" {
event := map[string]any{
"title": "Simple Protocol Video",
"media_url": os.Getenv("GO_SCRIPTCRAWLER_MEDIA_URL"),
}
_ = json.NewEncoder(os.Stdout).Encode(event)
os.Exit(0)
}
// Default mode: write a small file into the job's output directory and
// report it as a local-file item with a fixed source ID.
localFile := filepath.Join(job.OutputDir, "helper.mp4")
if err := os.WriteFile(localFile, []byte("helper-video"), 0o644); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
}
event := Event{
Type: "item",
Item: Item{
SourceID: "abc-123",
Title: "Imported From Helper",
Author: "helper",
Media: MediaRef{LocalFile: localFile},
},
}
_ = json.NewEncoder(os.Stdout).Encode(event)
os.Exit(0)
}
@@ -0,0 +1,172 @@
// Package scriptcrawler provides a generic local drive for script-based
// crawlers. A crawler script discovers videos; the Go runner downloads them
// into this drive and the existing preview/fingerprint workers consume them
// through the normal drives.Drive interface.
package scriptcrawler
import (
"context"
"errors"
"io"
"os"
"path/filepath"
"strings"
"time"
"github.com/video-site/backend/internal/drives"
)
// Kind is the drive kind string reported by Driver.Kind.
const Kind = "scriptcrawler"
// Config holds the values New copies into a Driver.
type Config struct {
// ID becomes the drive ID; Init rejects a blank value.
ID string
// RootDir is the directory under which Init creates the videos, thumbs,
// output and .crawl subdirectories; Init rejects a blank value.
RootDir string
}
// Driver is a local, directory-backed drive for script crawlers. It serves the
// files found in its videos directory and does not support uploads.
type Driver struct {
id string
rootDir string
}
// New builds a Driver from c. No validation or filesystem access happens here;
// call Init before using the driver.
func New(c Config) *Driver {
return &Driver{id: c.ID, rootDir: c.RootDir}
}
// Kind returns the package-level Kind constant.
func (d *Driver) Kind() string { return Kind }
// ID returns the drive ID given in Config.ID.
func (d *Driver) ID() string { return d.id }
// RootID always returns "/"; List ignores the directory ID it is given.
func (d *Driver) RootID() string { return "/" }
// Init validates the driver settings and makes sure the working directories
// (videos, thumbs, output, .crawl) exist under the root directory.
//
// It returns an error when the drive ID or root directory is blank, or when a
// directory cannot be created. The context argument is unused.
func (d *Driver) Init(context.Context) error {
	switch {
	case strings.TrimSpace(d.id) == "":
		return errors.New("scriptcrawler: empty drive id")
	case strings.TrimSpace(d.rootDir) == "":
		return errors.New("scriptcrawler: empty root dir")
	}
	subdirs := [...]string{"videos", "thumbs", "output", ".crawl"}
	for i := range subdirs {
		dir := filepath.Join(d.rootDir, subdirs[i])
		if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil {
			return mkErr
		}
	}
	return nil
}
// RootDir returns the configured root directory as given (not made absolute).
func (d *Driver) RootDir() string { return d.rootDir }
// VideosDir returns the "videos" subdirectory of the root directory.
func (d *Driver) VideosDir() string { return filepath.Join(d.rootDir, "videos") }
// ThumbsDir returns the "thumbs" subdirectory of the root directory.
func (d *Driver) ThumbsDir() string { return filepath.Join(d.rootDir, "thumbs") }
// OutputDir returns the "output" subdirectory of the root directory.
func (d *Driver) OutputDir() string { return filepath.Join(d.rootDir, "output") }
// CrawlDir returns the ".crawl" subdirectory of the root directory.
func (d *Driver) CrawlDir() string { return filepath.Join(d.rootDir, ".crawl") }
// VideoPath resolves fileID to an absolute path inside VideosDir via safeJoin,
// which rejects IDs that are empty or contain a path separator.
func (d *Driver) VideoPath(fileID string) (string, error) {
return safeJoin(d.VideosDir(), fileID)
}
// ThumbPath resolves fileID to an absolute path inside ThumbsDir via safeJoin.
func (d *Driver) ThumbPath(fileID string) (string, error) {
return safeJoin(d.ThumbsDir(), fileID)
}
// OutputPath resolves fileName to an absolute path inside OutputDir via safeJoin.
func (d *Driver) OutputPath(fileName string) (string, error) {
return safeJoin(d.OutputDir(), fileName)
}
// List returns one entry per regular file directly inside the videos
// directory. Both arguments are ignored: the drive is flat, so every listing
// is the same regardless of the directory ID.
//
// A missing videos directory yields (nil, nil). Subdirectories are skipped, as
// are files whose metadata cannot be read.
func (d *Driver) List(context.Context, string) ([]drives.Entry, error) {
	dirEntries, readErr := os.ReadDir(d.VideosDir())
	if os.IsNotExist(readErr) {
		return nil, nil
	}
	if readErr != nil {
		return nil, readErr
	}
	result := make([]drives.Entry, 0, len(dirEntries))
	for _, de := range dirEntries {
		if de.IsDir() {
			continue
		}
		// Best effort: a file that disappears between ReadDir and Info is
		// simply left out of the listing.
		if fi, infoErr := de.Info(); infoErr == nil {
			result = append(result, drives.Entry{
				ID:      de.Name(),
				Name:    de.Name(),
				Size:    fi.Size(),
				IsDir:   false,
				ModTime: fi.ModTime(),
			})
		}
	}
	return result, nil
}
// Stat describes the file named fileID inside the videos directory.
//
// It returns the error from VideoPath when fileID is not a valid file name,
// and the error from os.Stat when the file cannot be inspected. ctx is unused.
func (d *Driver) Stat(ctx context.Context, fileID string) (*drives.Entry, error) {
	localPath, err := d.VideoPath(fileID)
	if err != nil {
		return nil, err
	}
	fi, err := os.Stat(localPath)
	if err != nil {
		return nil, err
	}
	entry := drives.Entry{
		ID:      fileID,
		Name:    fileID,
		Size:    fi.Size(),
		IsDir:   fi.IsDir(),
		ModTime: fi.ModTime(),
	}
	return &entry, nil
}
// StreamURL returns a link whose URL is the absolute local path of the video
// file and whose expiry is 24 hours from now.
//
// Directories and zero-byte files are reported as os.ErrNotExist. Errors from
// VideoPath and os.Stat are returned unchanged. ctx is unused.
func (d *Driver) StreamURL(ctx context.Context, fileID string) (*drives.StreamLink, error) {
	localPath, err := d.VideoPath(fileID)
	if err != nil {
		return nil, err
	}
	fi, err := os.Stat(localPath)
	if err != nil {
		return nil, err
	}
	if !fi.IsDir() && fi.Size() != 0 {
		link := drives.StreamLink{
			URL:     localPath,
			Expires: time.Now().Add(24 * time.Hour),
		}
		return &link, nil
	}
	return nil, os.ErrNotExist
}
// Upload is not supported by this drive; it always returns
// drives.ErrNotSupported and ignores every argument.
func (d *Driver) Upload(context.Context, string, string, io.Reader, int64) (string, error) {
return "", drives.ErrNotSupported
}
// EnsureDir is not supported by this drive; it always returns
// drives.ErrNotSupported and ignores every argument.
func (d *Driver) EnsureDir(context.Context, string) (string, error) {
return "", drives.ErrNotSupported
}
func safeJoin(root, fileID string) (string, error) {
id := strings.TrimSpace(fileID)
if id == "" || filepath.Base(id) != id {
return "", errors.New("scriptcrawler: invalid file id")
}
if strings.TrimSpace(root) == "" {
return "", errors.New("scriptcrawler: empty root")
}
rootAbs, err := filepath.Abs(root)
if err != nil {
return "", err
}
pathAbs, err := filepath.Abs(filepath.Join(rootAbs, id))
if err != nil {
return "", err
}
if pathAbs != rootAbs && !strings.HasPrefix(pathAbs, rootAbs+string(os.PathSeparator)) {
return "", errors.New("scriptcrawler: file id escapes root")
}
return pathAbs, nil
}
var _ drives.Drive = (*Driver)(nil)
+45 -13
View File
@@ -34,6 +34,7 @@ import (
"github.com/video-site/backend/internal/drives/p115"
"github.com/video-site/backend/internal/drives/p123"
"github.com/video-site/backend/internal/drives/pikpak"
"github.com/video-site/backend/internal/drives/scriptcrawler"
"github.com/video-site/backend/internal/drives/spider91"
"github.com/video-site/backend/internal/mediaasset"
)
@@ -58,6 +59,17 @@ type uploadTarget interface {
Rename(ctx context.Context, fileID, newName string) error
}
// Spider91LocalSource is the local source interface used by the migration
// worker. Legacy spider91.Driver and the new scriptcrawler.Driver both satisfy
// it when they are mounted for the Spider91 built-in crawler.
type Spider91LocalSource interface {
drives.Drive
// VideosDir is the local directory that holds downloaded video files.
VideosDir() string
// ThumbsDir is the local directory that holds crawled thumbnails.
ThumbsDir() string
// VideoPath resolves a file ID to its local video path.
VideoPath(fileID string) (string, error)
// ThumbPath resolves a file ID to its local thumbnail path.
ThumbPath(fileID string) (string, error)
}
// UploadResult 是 uploadTarget.UploadAndReportHash 的归一返回。
//
// FileID 目标盘上的新文件 ID;
@@ -364,7 +376,7 @@ func (m *Migrator) runOnce(ctx context.Context) {
}
migrated := 0
for _, src := range m.spider91Drives() {
for _, src := range m.spider91Drives(ctx) {
if err := ctx.Err(); err != nil {
return
}
@@ -387,7 +399,7 @@ func (m *Migrator) runOnce(ctx context.Context) {
// 收尾:扫每个 spider91 drive 的本地目录,把 catalog 已经迁到别处但本地
// 仍有残留的孤儿文件清掉。这是纯防御性兜底——正常路径下 migrateDrive
// 已经在迁移成功后立刻 CleanupSpider91Local,不会留孤儿。
for _, src := range m.spider91Drives() {
for _, src := range m.spider91Drives(ctx) {
if err := ctx.Err(); err != nil {
return
}
@@ -448,21 +460,41 @@ func (m *Migrator) resolveTarget() (string, uploadTarget, error) {
return id, t, nil
}
// spider91Drives 返回当前注册的所有 spider91 driver。
func (m *Migrator) spider91Drives() []*spider91.Driver {
// spider91Drives 返回当前注册的所有 Spider91 来源本地爬虫 driver。
func (m *Migrator) spider91Drives(ctx context.Context) []Spider91LocalSource {
all := m.cfg.Registry.All()
out := make([]*spider91.Driver, 0, len(all))
out := make([]Spider91LocalSource, 0, len(all))
for _, d := range all {
if d.Kind() != spider91.Kind {
if !m.isSpider91SourceDrive(ctx, d) {
continue
}
if sd, ok := d.(*spider91.Driver); ok {
if sd, ok := d.(Spider91LocalSource); ok {
out = append(out, sd)
}
}
return out
}
// isSpider91SourceDrive reports whether d should be treated as a Spider91
// local source by the migrator.
//
// A drive qualifies when its kind is spider91.Kind, or when it is a
// scriptcrawler drive whose catalog row either has kind spider91.Kind or has
// kind scriptcrawler.Kind with the "builtin" credential equal to
// spider91.Kind (compared case-insensitively after trimming). A nil drive, a
// missing catalog or a failed catalog lookup all yield false.
func (m *Migrator) isSpider91SourceDrive(ctx context.Context, d drives.Drive) bool {
	if d == nil {
		return false
	}
	kind := d.Kind()
	if kind == spider91.Kind {
		return true
	}
	// Only script-crawler drives can still qualify, and deciding that needs
	// the stored drive row.
	if kind != scriptcrawler.Kind || m.cfg.Catalog == nil {
		return false
	}
	row, lookupErr := m.cfg.Catalog.GetDrive(ctx, d.ID())
	if lookupErr != nil || row == nil {
		return false
	}
	if row.Kind == spider91.Kind {
		return true
	}
	if row.Kind != scriptcrawler.Kind {
		return false
	}
	builtin := strings.TrimSpace(row.Credentials["builtin"])
	return strings.EqualFold(builtin, spider91.Kind)
}
// migrateDrive 对单个 spider91 drive 跑一批迁移;返回成功迁移的条数。
//
// 策略(与"本地缓存最新 N 个"语义一致):
@@ -473,7 +505,7 @@ func (m *Migrator) spider91Drives() []*spider91.Driver {
// - 已经迁移过但本地还有残留 → 仅删本地(兜底)
//
// KeepLatestN < 0 时不保护任何本地文件,全部尝试迁移(旧行为,主要给测试用)。
func (m *Migrator) migrateDrive(ctx context.Context, src *spider91.Driver, targetDriveID string, pp uploadTarget) (int, error) {
func (m *Migrator) migrateDrive(ctx context.Context, src Spider91LocalSource, targetDriveID string, pp uploadTarget) (int, error) {
keepN := m.cfg.KeepLatestN
if keepN < 0 {
keepN = 0
@@ -574,7 +606,7 @@ func (m *Migrator) migrateDrive(ctx context.Context, src *spider91.Driver, targe
// migrateOne 把单条 spider91 视频上传到目标盘并改写 catalog。
// 返回 (true, nil) 表示真的迁了一条;(false, nil) 表示跳过(本地文件已不在等);
// (false, err) 表示真出错。
func (m *Migrator) migrateOne(ctx context.Context, v *catalog.Video, src *spider91.Driver, targetDriveID string, pp uploadTarget) (bool, error) {
func (m *Migrator) migrateOne(ctx context.Context, v *catalog.Video, src Spider91LocalSource, targetDriveID string, pp uploadTarget) (bool, error) {
path, err := src.VideoPath(v.FileID)
if err != nil {
return false, fmt.Errorf("resolve local path: %w", err)
@@ -637,7 +669,7 @@ func (m *Migrator) migrateOne(ctx context.Context, v *catalog.Video, src *spider
return true, nil
}
func (m *Migrator) preserveCrawledThumbnail(ctx context.Context, src *spider91.Driver, v *catalog.Video) {
func (m *Migrator) preserveCrawledThumbnail(ctx context.Context, src Spider91LocalSource, v *catalog.Video) {
if m == nil || m.cfg.Catalog == nil || src == nil || v == nil || v.ID == "" || v.FileID == "" {
return
}
@@ -676,7 +708,7 @@ func (m *Migrator) preserveCrawledThumbnail(ctx context.Context, src *spider91.D
v.ThumbnailURL = "/p/thumb/" + v.ID
}
func findSpider91ThumbPath(src *spider91.Driver, fileID string) (string, bool) {
func findSpider91ThumbPath(src Spider91LocalSource, fileID string) (string, bool) {
thumbBase := stripExt(fileID)
for _, ext := range []string{".jpg", ".jpeg", ".png", ".webp"} {
thumbPath, err := src.ThumbPath(thumbBase + ext)
@@ -722,7 +754,7 @@ func copyFileAtomic(src, dst string) error {
// 我们不知道具体是 .jpg 还是别的,逐个尝试常见后缀)。
//
// 暴露成包级函数方便 cleanup 模块复用(任务 6)。
func CleanupSpider91Local(src *spider91.Driver, fileID string) {
func CleanupSpider91Local(src Spider91LocalSource, fileID string) {
videoPath, err := src.VideoPath(fileID)
if err == nil {
if err := os.Remove(videoPath); err != nil && !os.IsNotExist(err) {
@@ -759,7 +791,7 @@ func stripExt(name string) string {
// 找到孤儿。
//
// 返回实际删除的文件个数。
func (m *Migrator) cleanupOldLocalVideos(ctx context.Context, src *spider91.Driver) (int, error) {
func (m *Migrator) cleanupOldLocalVideos(ctx context.Context, src Spider91LocalSource) (int, error) {
entries, err := os.ReadDir(src.VideosDir())
if err != nil {
if os.IsNotExist(err) {
@@ -17,6 +17,7 @@ import (
"github.com/video-site/backend/internal/drives/googledrive"
"github.com/video-site/backend/internal/drives/p123"
"github.com/video-site/backend/internal/drives/pikpak"
"github.com/video-site/backend/internal/drives/scriptcrawler"
"github.com/video-site/backend/internal/drives/spider91"
)
@@ -599,6 +600,88 @@ func TestCleanupRemovesAllAlreadyMigratedOrphans(t *testing.T) {
}
}
// TestRunOnceMigratesBuiltInSpider91ScriptCrawlerSource verifies that a
// scriptcrawler drive whose catalog row carries builtin=spider91 is picked up
// as a migration source. Its video is uploaded to the target drive, the
// catalog row is moved to the target, and the local video and thumbnail files
// are removed.
func TestRunOnceMigratesBuiltInSpider91ScriptCrawlerSource(t *testing.T) {
ctx := context.Background()
cat := setupCatalog(t)
src := scriptcrawler.New(scriptcrawler.Config{ID: "spider-script", RootDir: t.TempDir()})
if err := src.Init(ctx); err != nil {
t.Fatalf("scriptcrawler init: %v", err)
}
// The "builtin" credential is what marks this scriptcrawler drive as Spider91.
if err := cat.UpsertDrive(ctx, &catalog.Drive{
ID: src.ID(),
Kind: scriptcrawler.Kind,
Name: "Built-in Spider91",
Credentials: map[string]string{"builtin": "spider91"},
}); err != nil {
t.Fatalf("upsert source drive: %v", err)
}
pp := newFakePikPak("pikpak-target", "pikpak-root-id")
reg := newFakeRegistry()
reg.Add(src)
reg.Add(pp)
// Seed one local video plus a thumbnail sharing its base name.
fileID := "vk-script.mp4"
videoPath, err := src.VideoPath(fileID)
if err != nil {
t.Fatalf("video path: %v", err)
}
if err := os.WriteFile(videoPath, []byte("scriptcrawler spider91 video"), 0o644); err != nil {
t.Fatalf("write video: %v", err)
}
thumbPath, err := src.ThumbPath("vk-script.jpg")
if err != nil {
t.Fatalf("thumb path: %v", err)
}
if err := os.WriteFile(thumbPath, []byte("thumb"), 0o644); err != nil {
t.Fatalf("write thumb: %v", err)
}
now := time.Now()
id := "spider91-" + src.ID() + "-vk-script"
if err := cat.UpsertVideo(ctx, &catalog.Video{
ID: id,
DriveID: src.ID(),
FileID: fileID,
FileName: fileID,
Title: "Scriptcrawler Spider91",
Author: "91porn",
Ext: "mp4",
Quality: "HD",
Size: int64(len("scriptcrawler spider91 video")),
PreviewStatus: "pending",
PublishedAt: now,
CreatedAt: now,
UpdatedAt: now,
}); err != nil {
t.Fatalf("upsert video: %v", err)
}
// KeepLatestN is negative so no local file is protected from migration.
m := New(Config{
Catalog: cat,
Registry: reg,
GetTargetDriveID: func() string { return pp.ID() },
KeepLatestN: -1,
CommonThumbDir: t.TempDir(),
})
m.runOnce(ctx)
if pp.uploadCalls != 1 {
t.Fatalf("upload calls = %d, want 1", pp.uploadCalls)
}
got, err := cat.GetVideo(ctx, id)
if err != nil {
t.Fatalf("get migrated video: %v", err)
}
if got.DriveID != pp.ID() {
t.Fatalf("drive_id = %q, want %q", got.DriveID, pp.ID())
}
// Both local artefacts must be gone after a successful migration.
if _, err := os.Stat(videoPath); !os.IsNotExist(err) {
t.Fatalf("local video stat err = %v, want not exist", err)
}
if _, err := os.Stat(thumbPath); !os.IsNotExist(err) {
t.Fatalf("local thumb stat err = %v, want not exist", err)
}
}
// TestRunOnceKeepsAllLocalWhenWithinKeepWindow 验证:本地文件数 ≤ KeepLatestN 时
// 一律不上传,全部留作"最新 N"缓存。这是用户的核心需求:刚爬下来的 15 个不要立即被传走。
func TestRunOnceKeepsAllLocalWhenWithinKeepWindow(t *testing.T) {
+113
View File
@@ -0,0 +1,113 @@
# Crawler Script Protocol v1
Crawler scripts are external processes. The Go backend is the host: it handles
dedupe, downloading, catalog writes, thumbnails, preview videos, fingerprints,
task status and cancellation.
## Invocation
The backend runs:
```bash
python3 /path/to/crawler.py --job /path/to/job.json
```
`job.json`:
```json
{
"protocol": "crawler.v1",
"mode": "crawl",
"run_id": "20260609T120000Z",
"crawler_id": "example",
"target_new": 10,
"seen_source_ids_file": "/data/scriptcrawlers/example/.crawl/seen.txt",
"output_dir": "/data/scriptcrawlers/example/output",
"config": {
"category": "hot"
},
"network": {
"proxy_url": "http://127.0.0.1:7890"
}
}
```
## Importing Scripts
Crawler scripts are configured from the admin crawler page. A script can be
entered as an existing server path, uploaded as a local file, or imported from
an HTTP(S) URL.
Imported scripts are copied into `crawler-scripts/` next to the configured local
preview data directory. The import API currently accepts Python files only
(`.py`) and rejects empty files or files larger than 2 MiB.
## Output
stdout must be JSON Lines. Logs must go to stderr.
Recommended item event:
```json
{
"type": "item",
"title": "Video title",
"media_url": "https://cdn.example.test/video.mp4",
"thumbnail_url": "https://cdn.example.test/cover.jpg",
"source_id": "site-native-id",
"headers": {
"Referer": "https://example.test/"
}
}
```
Minimum item event:
```json
{"type":"item","title":"Video title","media_url":"https://cdn.example.test/video.mp4"}
```
If a line contains item fields such as `title` and `media_url`, the backend also
treats it as an item when `type` is omitted.
The item fields may also be wrapped inside `"item"` if that is more convenient:
```json
{"type":"item","item":{"title":"Video title","media_url":"https://cdn.example.test/video.mp4"}}
```
Optional progress/done events:
```json
{"type":"progress","checked":20,"emitted":3}
{"type":"done","stats":{"emitted":10}}
```
## Simple Field Rules
- `title` is required.
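- `media_url` is required for normal scripts; the backend downloads the video from it. Advanced scripts that download the file themselves return `media_local_file` instead (see Advanced Fields).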
- `thumbnail_url` is optional. If it is empty, the backend generates a thumbnail
from the downloaded video.
- `source_id` is optional but recommended. When present, it must stay stable
for the same video within one crawler; the backend uses it to skip known
videos before downloading. When it is empty, the backend creates an internal
`auto-...` ID and later relies on the existing video fingerprint dedupe path.
- `headers` is optional and is applied to both video and thumbnail downloads.
Use it for `Referer`, cookies or anti-hotlinking requirements.
## Advanced Fields
- `detail_url`, `author`, `tags`, `category`, `quality`, `duration_seconds`,
`description` and `published_at` are optional metadata fields.
- If video and thumbnail need different headers, use `media_headers` and
`thumbnail_headers`.
- Existing nested fields are still supported for compatibility:
`media.url`, `media.local_file`, `media.headers`, `thumbnail.url`,
`thumbnail.local_file`, `thumbnail.headers`.
- Advanced scripts may download into `job.output_dir` and return
`media_local_file` or `media.local_file`. The path must stay inside
`output_dir`.
- Scripts can read `seen_source_ids_file` and skip known IDs when they provide
stable `source_id` values. The backend still dedupes every item.
- The backend stops the process after `target_new` new videos are imported.
+48
View File
@@ -0,0 +1,48 @@
#!/usr/bin/env python3
import argparse
import json
import sys
def load_seen(path):
    """Return the set of source IDs listed in the file at *path*.

    The file holds one ID per line; surrounding whitespace is stripped and
    blank lines are ignored. An empty or ``None`` path (for example when the
    job omits ``seen_source_ids_file`` or sets it to ``null``) and a missing
    file both yield an empty set.
    """
    # Guard before open(): open(None) raises TypeError, not FileNotFoundError.
    if not path:
        return set()
    try:
        with open(path, "r", encoding="utf-8") as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()
def main():
    """Run the demo crawler for the job file given with ``--job``.

    Emits a single hard-coded item as a JSON line on stdout unless its source
    ID is already listed in the job's seen file, then emits a ``done`` event
    carrying the number of items emitted (0 or 1).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--job", required=True)
    job_path = cli.parse_args().job

    with open(job_path, "r", encoding="utf-8") as job_file:
        job = json.load(job_file)

    source_id = "demo-video-1"
    already_seen = source_id in load_seen(job.get("seen_source_ids_file", ""))

    emitted = 0
    if not already_seen:
        item = {
            "type": "item",
            "source_id": source_id,
            "title": "Demo Video",
            "media_url": "https://example.test/video/demo-video-1.mp4",
            "thumbnail_url": "https://example.test/thumb/demo-video-1.jpg",
            "headers": {
                "Referer": "https://example.test/",
            },
        }
        print(json.dumps(item, ensure_ascii=False), flush=True)
        emitted = 1

    print(json.dumps({"type": "done", "stats": {"emitted": emitted}}), flush=True)
if __name__ == "__main__":
    # Report the failure on stderr (stdout is reserved for protocol events),
    # then re-raise so the process still exits with a traceback and a
    # non-zero status.
    try:
        main()
    except Exception as exc:
        sys.stderr.write(f"crawler failed: {exc}\n")
        sys.stderr.flush()
        raise
+2
View File
@@ -8,6 +8,7 @@ import { AdminLayout } from "@/admin/AdminLayout";
import { LoginPage } from "@/admin/LoginPage";
import { RequireAuth } from "@/admin/RequireAuth";
import { DrivesPage } from "@/admin/DrivesPage";
import { CrawlersPage } from "@/admin/CrawlersPage";
import { VideosPage } from "@/admin/VideosPage";
import { TagsPage } from "@/admin/TagsPage";
import { ThemePage } from "@/admin/ThemePage";
@@ -70,6 +71,7 @@ export default function App() {
>
<Route index element={<Navigate to="/admin/drives" replace />} />
<Route path="drives" element={<DrivesPage />} />
<Route path="crawlers" element={<CrawlersPage />} />
<Route path="videos" element={<VideosPage />} />
<Route path="tags" element={<TagsPage />} />
<Route path="theme" element={<ThemePage />} />
+9
View File
@@ -14,6 +14,7 @@ import {
import * as api from "./api";
import { useAuth } from "./AuthContext";
import { useToast } from "./ToastContext";
import { SpiderIcon } from "./icons/SpiderIcon";
export function AdminLayout() {
const { logout } = useAuth();
@@ -88,6 +89,14 @@ export function AdminLayout() {
>
<HardDrive size={16} />
</NavLink>
<NavLink
to="/admin/crawlers"
className={({ isActive }) =>
`admin-nav__link ${isActive ? "is-active" : ""}`
}
>
<SpiderIcon size={16} />
</NavLink>
<NavLink
to="/admin/videos"
className={({ isActive }) =>
+402
View File
@@ -0,0 +1,402 @@
import { useEffect, useMemo, useState } from "react";
import { ArrowLeft, CircleStop, Download, Link as LinkIcon, Plus, Save, Trash2, Upload } from "lucide-react";
import * as api from "./api";
import { useToast } from "./ToastContext";
import { driveKindAbbr, generationStateClass, generationStateLabel } from "./drive/constants";
import { SpiderIcon } from "./icons/SpiderIcon";
// Editable state of the crawler detail form. Every field is kept as a string
// because it is bound directly to a text input.
type CrawlerForm = {
id: string;
name: string;
// "" for a custom script crawler, "spider91" for the built-in preset.
builtin: string;
// Server-side path of the imported script; filled in by the import actions.
scriptPath: string;
pythonPath: string;
// Number of new videos to fetch per run, as typed by the user.
targetNew: string;
proxy: string;
configJson: string;
};
// Defaults used when creating a new custom crawler.
const emptyForm: CrawlerForm = {
id: "",
name: "",
builtin: "",
scriptPath: "",
pythonPath: "python3",
targetNew: "10",
proxy: "",
configJson: "",
};
export function CrawlersPage() {
const [list, setList] = useState<api.AdminCrawler[]>([]);
const [selectedId, setSelectedId] = useState("");
const [form, setForm] = useState<CrawlerForm>(emptyForm);
const [loading, setLoading] = useState(true);
const [saving, setSaving] = useState(false);
const [runningId, setRunningId] = useState("");
const [stoppingId, setStoppingId] = useState("");
const [scriptURL, setScriptURL] = useState("");
const [importingScript, setImportingScript] = useState(false);
const [mode, setMode] = useState<"list" | "detail">("list");
const { show } = useToast();
const selected = useMemo(
() => list.find((item) => item.id === selectedId) ?? null,
[list, selectedId]
);
// Reload the crawler list from the backend, toggling the loading flag around
// the request and surfacing any failure as an error toast.
async function refresh() {
  setLoading(true);
  try {
    setList(await api.listCrawlers());
  } catch (err) {
    const message = err instanceof Error ? err.message : "加载爬虫失败";
    show(message, "error");
  } finally {
    setLoading(false);
  }
}
useEffect(() => {
refresh();
}, []);
// Open the detail view for an existing crawler and load its stored settings
// into the form, falling back to defaults for values the backend left empty.
function selectCrawler(crawler: api.AdminCrawler) {
  setSelectedId(crawler.id);
  setMode("detail");
  // Clear any script link left over from a previously edited crawler, as the
  // other entry points into the detail view already do. save() returns to the
  // list without resetting it, so it could otherwise leak into this form.
  setScriptURL("");
  const isSpider91 = crawler.builtin === "spider91" || crawler.kind === "spider91";
  setForm({
    id: crawler.id,
    name: crawler.name,
    builtin: crawler.builtin ?? "",
    scriptPath: crawler.scriptPath ?? "",
    pythonPath: crawler.pythonPath || "python3",
    // Spider91 crawlers default to 15 new videos per run, others to 10.
    targetNew: crawler.targetNew || (isSpider91 ? "15" : "10"),
    proxy: crawler.proxy ?? "",
    configJson: crawler.configJson ?? "",
  });
}
// Start creating a custom script crawler: blank form, no selection.
function createCustom() {
setSelectedId("");
setForm(emptyForm);
setScriptURL("");
setMode("detail");
}
// Start creating the built-in Spider91 crawler: the form is pre-filled with
// its ID, name, builtin marker and a target of 15 new videos.
function createSpider91() {
setSelectedId("");
setForm({
...emptyForm,
id: "spider91",
name: "91 爬虫",
builtin: "spider91",
scriptPath: "",
targetNew: "15",
});
setScriptURL("");
setMode("detail");
}
// Leave the detail view and discard any unsaved form state.
function backToList() {
setSelectedId("");
setForm(emptyForm);
setScriptURL("");
setMode("list");
}
// Update a single form field. Uses the functional setState form so that
// updates issued after an await still apply to the latest form state.
function set<K extends keyof CrawlerForm>(key: K, value: CrawlerForm[K]) {
setForm((prev) => ({ ...prev, [key]: value }));
}
// Validate the form and create or update the crawler. ID and name are always
// required; a script path is required unless a built-in crawler is selected.
// On success the list is reloaded and the view returns to the list; a backend
// "warning" means the crawler was saved but its initialisation failed.
async function save() {
const id = form.id.trim();
const name = form.name.trim();
if (!id || !name) {
show("请填写爬虫 ID 和名称", "error");
return;
}
if (!form.builtin && !form.scriptPath.trim()) {
show("请先导入爬虫脚本", "error");
return;
}
setSaving(true);
try {
const resp = await api.upsertCrawler({
id,
name,
builtin: form.builtin,
scriptPath: form.scriptPath.trim(),
pythonPath: form.pythonPath.trim(),
targetNew: form.targetNew.trim(),
proxy: form.proxy.trim(),
configJson: form.configJson.trim(),
});
if (resp.warning) {
show(`已保存,但初始化失败:${resp.warning}`, "error");
} else {
show("已保存", "success");
}
setSelectedId(id);
await refresh();
setMode("list");
} catch (e) {
show(e instanceof Error ? e.message : "保存失败", "error");
} finally {
setSaving(false);
}
}
// Upload a local script file and store the server-side path the backend
// returns in the form. Does nothing when no file was chosen.
async function importScriptFile(file: File | null | undefined) {
  if (file == null) return;
  setImportingScript(true);
  try {
    const { scriptPath } = await api.importCrawlerScriptFile(file);
    set("scriptPath", scriptPath);
    show("脚本已导入", "success");
  } catch (err) {
    const message = err instanceof Error ? err.message : "导入失败";
    show(message, "error");
  } finally {
    setImportingScript(false);
  }
}
// Import a script from the link typed into the URL field. On success the
// returned server-side path is stored in the form and the URL field is
// cleared; a blank link is rejected before any request is made.
async function importScriptURL() {
  const link = scriptURL.trim();
  if (link === "") {
    show("请填写脚本链接", "error");
    return;
  }
  setImportingScript(true);
  try {
    const { scriptPath } = await api.importCrawlerScriptURL(link);
    set("scriptPath", scriptPath);
    setScriptURL("");
    show("脚本已导入", "success");
  } catch (err) {
    const message = err instanceof Error ? err.message : "导入失败";
    show(message, "error");
  } finally {
    setImportingScript(false);
  }
}
// Trigger a crawl for the given crawler. When the backend does not accept the
// request, its message (or a default one) is shown and the list is left as is;
// otherwise the list is reloaded.
async function run(crawler: api.AdminCrawler) {
  setRunningId(crawler.id);
  try {
    const resp = await api.runCrawler(crawler.id);
    if (resp.accepted) {
      show("已触发抓取任务", "success");
      await refresh();
    } else {
      show(resp.message || "当前爬虫有正在进行的任务", "info");
    }
  } catch (err) {
    const message = err instanceof Error ? err.message : "触发失败";
    show(message, "error");
  } finally {
    setRunningId("");
  }
}
// Ask the backend to stop the crawler's tasks, report whether anything was
// actually stopped, and reload the list.
async function stop(crawler: api.AdminCrawler) {
  setStoppingId(crawler.id);
  try {
    const { stopped } = await api.stopCrawlerTasks(crawler.id);
    const message = stopped ? "已请求停止任务" : "当前没有可停止任务";
    show(message, "info");
    await refresh();
  } catch (err) {
    const message = err instanceof Error ? err.message : "停止失败";
    show(message, "error");
  } finally {
    setStoppingId("");
  }
}
// Delete a crawler after the user confirms. On success the reported number of
// removed videos is shown, the form is reset and the view returns to the
// reloaded list.
async function remove(crawler: api.AdminCrawler) {
  const confirmed = window.confirm(`删除爬虫 ${crawler.name} 并清理它导入的视频?`);
  if (!confirmed) return;
  try {
    const resp = await api.deleteCrawler(crawler.id);
    const removedCount = resp.deletedVideos ?? 0;
    show(`已删除,并清理 ${removedCount} 个视频`, "success");
    setSelectedId("");
    setForm(emptyForm);
    setMode("list");
    await refresh();
  } catch (err) {
    const message = err instanceof Error ? err.message : "删除失败";
    show(message, "error");
  }
}
return (
<section className="admin-page">
<header className="admin-page__header">
<div>
<h1 className="admin-page__title"></h1>
</div>
<div className="admin-detail-actions-inline">
{mode === "list" ? (
<button className="admin-btn is-primary" onClick={createCustom}>
<Plus size={14} />
</button>
) : (
<button className="admin-btn" onClick={backToList}>
<ArrowLeft size={14} />
</button>
)}
</div>
</header>
{mode === "list" ? (
<div className="admin-card admin-crawler-list">
<header className="admin-card__title">
<SpiderIcon size={16} />
</header>
{loading ? (
<div className="admin-loading">...</div>
) : list.length === 0 ? (
<div className="admin-empty"></div>
) : (
<div className="admin-drive-teasers">
{list.map((crawler) => (
<button
key={crawler.id}
type="button"
className={`admin-drive-teaser ${crawler.id === selectedId ? "is-active" : ""}`}
onClick={() => selectCrawler(crawler)}
>
<span className="admin-drive-teaser__name">
<span className="admin-drive-card__brand-icon" data-kind={crawler.builtin || crawler.kind}>
{crawler.builtin === "spider91" ? "91" : driveKindAbbr(crawler.kind)}
</span>
{crawler.name}
</span>
<span className={`admin-status is-${crawler.status === "ok" ? "ok" : crawler.status === "error" ? "error" : "pending"}`}>
{crawler.status === "ok" ? "已就绪" : crawler.status === "error" ? "错误" : "未连接"}
</span>
</button>
))}
</div>
)}
</div>
) : (
<div className="admin-crawler-detail">
<div className="admin-card">
<header className="admin-card__title">
<SpiderIcon size={16} /> {selected ? "爬虫配置" : "添加爬虫"}
</header>
<div className="admin-form">
{!selected && (
<div className="admin-crawler-presets">
<button className={`admin-btn ${form.builtin === "" ? "is-primary" : ""}`} type="button" onClick={createCustom}>
<Plus size={13} />
</button>
<button className={`admin-btn ${form.builtin === "spider91" ? "is-primary" : ""}`} type="button" onClick={createSpider91}>
<SpiderIcon size={13} /> 91
</button>
</div>
)}
<div className="admin-form__row">
<label htmlFor="crawler-id"> ID *</label>
<input id="crawler-id" value={form.id} onChange={(e) => set("id", e.target.value)} disabled={!!selected} />
</div>
<div className="admin-form__row">
<label htmlFor="crawler-name"> *</label>
<input id="crawler-name" value={form.name} onChange={(e) => set("name", e.target.value)} />
</div>
{!form.builtin && (
<div className="admin-form__row">
<label htmlFor="crawler-script-url"></label>
<div className="admin-crawler-import">
<input
id="crawler-script-file"
className="admin-crawler-import__file"
type="file"
accept=".py,text/x-python"
disabled={importingScript}
onChange={(e) => {
importScriptFile(e.target.files?.[0]);
e.currentTarget.value = "";
}}
/>
<label className="admin-btn" htmlFor="crawler-script-file" aria-disabled={importingScript}>
<Upload size={13} />
</label>
<input
id="crawler-script-url"
value={scriptURL}
onChange={(e) => setScriptURL(e.target.value)}
placeholder="https://example.com/crawler.py"
disabled={importingScript}
/>
<button className="admin-btn" type="button" onClick={importScriptURL} disabled={importingScript}>
<LinkIcon size={13} /> {importingScript ? "导入中..." : "链接导入"}
</button>
</div>
{form.scriptPath && <div className="admin-form__help"></div>}
</div>
)}
<div className="admin-form__row">
<label htmlFor="crawler-target"></label>
<input id="crawler-target" value={form.targetNew} onChange={(e) => set("targetNew", e.target.value)} placeholder="10" />
</div>
<div className="admin-form__row">
<label htmlFor="crawler-proxy"></label>
<input id="crawler-proxy" value={form.proxy} onChange={(e) => set("proxy", e.target.value)} placeholder="http://127.0.0.1:7890" />
</div>
<div className="admin-detail-actions">
<button className="admin-btn is-primary" onClick={save} disabled={saving}>
<Save size={13} /> {saving ? "保存中..." : "保存"}
</button>
{selected && (
<>
<button className="admin-btn" onClick={() => run(selected)} disabled={runningId === selected.id}>
<Download size={13} /> {runningId === selected.id ? "触发中..." : "立即抓取"}
</button>
<button className="admin-btn is-stop" onClick={() => stop(selected)} disabled={stoppingId === selected.id}>
<CircleStop size={13} /> {stoppingId === selected.id ? "停止中..." : "停止任务"}
</button>
<button className="admin-btn is-danger" onClick={() => remove(selected)}>
<Trash2 size={13} />
</button>
</>
)}
</div>
</div>
</div>
{selected && (
<div className="admin-card admin-crawler-status">
<header className="admin-card__title">
<Download size={16} />
</header>
<div className="admin-gen-columns">
<CrawlerStatus label="抓取" status={selected.scanGenerationStatus} />
<CrawlerStatus label="封面" status={selected.thumbnailGenerationStatus} />
<CrawlerStatus label="预览视频" status={selected.previewGenerationStatus} />
<CrawlerStatus label="视频指纹" status={selected.fingerprintGenerationStatus} />
</div>
{selected.lastError && <div className="admin-detail-error">{selected.lastError}</div>}
</div>
)}
</div>
)}
</section>
);
}
/**
 * One column of the crawler task-status panel.
 *
 * Renders the column label next to a state badge. Only the crawl column
 * (label "抓取") additionally shows the scanned / added counters.
 *
 * @param label  Column caption; also decides whether the counters are shown.
 * @param status Generation status for this column. When missing, the state
 *               falls back to "idle" and the counters fall back to 0.
 */
function CrawlerStatus({ label, status }: { label: string; status?: api.DriveGenerationStatus }) {
  const isCrawlColumn = label === "抓取";
  const currentState = status?.state || "idle";

  // A running crawl gets its own wording; every other combination uses the shared state label.
  const usesSharedLabel = !isCrawlColumn || currentState !== "scanning";
  const badgeText = usesSharedLabel ? generationStateLabel(currentState) : "抓取中";
  const badgeClassName = `admin-status admin-generation-state is-${generationStateClass(currentState)}`;

  // NOTE(review): the two counter captions are empty in the reviewed text — confirm
  // against the real file that no caption text was lost before applying this block.
  const counters = isCrawlColumn ? (
    <div className="admin-gen-col__counts admin-gen-col__counts--scan">
      <div className="admin-gen-col__count"><span></span><strong>{status?.scannedCount ?? 0}</strong></div>
      <div className="admin-gen-col__count"><span></span><strong>{status?.addedCount ?? 0}</strong></div>
    </div>
  ) : null;

  return (
    <div className="admin-gen-col">
      <div className="admin-gen-col__head">
        <span className="admin-gen-col__label">{label}</span>
        <span className={badgeClassName}>
          {badgeText}
        </span>
      </div>
      {counters}
    </div>
  );
}
+18 -15
View File
@@ -325,6 +325,10 @@ export function DrivesPage() {
}
async function handleRescan(d: api.AdminDrive) {
if (d.kind === "spider91") {
show("91Spider 不再支持通过网盘运行,请到爬虫管理添加爬虫脚本", "info");
return;
}
if (nightlyBusy) {
show(nightlyBusyText(nightlyStatus) || NIGHTLY_BUSY_MESSAGE, "info");
return;
@@ -345,11 +349,7 @@ export function DrivesPage() {
refreshDriveList();
return;
}
if (d.kind === "spider91") {
show("已触发抓取任务,需要 2-4 分钟,可稍后刷新视频列表查看", "success");
} else {
show("已触发扫描,可稍后刷新视频列表查看", "success");
}
show("已触发扫描,可稍后刷新视频列表查看", "success");
refreshDriveList();
} catch (e) {
show(e instanceof Error ? e.message : "触发失败", "error");
@@ -550,10 +550,8 @@ export function DrivesPage() {
)}
{d.kind === "spider91" && (
<div className="admin-detail-row">
<span className="admin-detail-label"></span>
<span className="admin-detail-value">
{d.lastCrawlAt ? new Date(d.lastCrawlAt * 1000).toLocaleString() : "尚未抓取"}
</span>
<span className="admin-detail-label"></span>
<span className="admin-detail-value"></span>
</div>
)}
</div>
@@ -567,9 +565,12 @@ export function DrivesPage() {
type="button"
className="admin-btn is-primary"
onClick={() => handleRescan(d)}
aria-disabled={nightlyBusy || isDriveBusy(d) || !!scanningDriveIds[d.id]}
disabled={d.kind === "spider91"}
aria-disabled={d.kind === "spider91" || nightlyBusy || isDriveBusy(d) || !!scanningDriveIds[d.id]}
title={
nightlyBusy
d.kind === "spider91"
? "91Spider 不再支持通过网盘运行,请到爬虫管理添加爬虫脚本"
: nightlyBusy
? nightlyBusyText(nightlyStatus) || NIGHTLY_BUSY_MESSAGE
: isDriveBusy(d) || scanningDriveIds[d.id]
? DRIVE_BUSY_MESSAGE
@@ -579,7 +580,7 @@ export function DrivesPage() {
{d.kind === "spider91" ? (
<>
<Download size={13} className={scanningDriveIds[d.id] ? "admin-spin" : undefined} />
{scanningDriveIds[d.id] ? "触发中..." : "立即抓取"}
</>
) : (
<>
@@ -599,9 +600,11 @@ export function DrivesPage() {
{stoppingDriveId === d.id ? "停止中..." : "停止所有任务"}
</button>
</div>
<button type="button" className="admin-btn" onClick={() => openEdit(d)}>
{d.kind === "spider91" ? "编辑配置" : "编辑配置凭证"}
</button>
{d.kind !== "spider91" && (
<button type="button" className="admin-btn" onClick={() => openEdit(d)}>
</button>
)}
<button type="button" className="admin-btn is-danger admin-detail-actions__danger" onClick={() => setDeleteTarget(d)}>
<Trash2 size={13} />
</button>
+98 -4
View File
@@ -12,13 +12,14 @@ async function request<T>(
path: string,
init: RequestInit = {}
): Promise<T> {
const headers = new Headers(init.headers ?? {});
if (!(init.body instanceof FormData) && !headers.has("Content-Type")) {
headers.set("Content-Type", "application/json");
}
const res = await fetch(BASE + path, {
credentials: "include",
headers: {
"Content-Type": "application/json",
...(init.headers ?? {}),
},
...init,
headers,
});
if (res.status === 401) {
throw new UnauthorizedError();
@@ -188,6 +189,99 @@ export function stopDriveTasks(id: string) {
);
}
// ---------- Crawlers ----------
/**
 * A crawler entry as exchanged with the admin `/crawlers` endpoints;
 * `listCrawlers()` resolves to an array of these.
 */
export type AdminCrawler = {
  id: string;
  name: string;
  // NOTE(review): "spider91" presumably only appears on legacy rows — confirm against the backend.
  kind: "scriptcrawler" | "spider91";
  // NOTE(review): looks like the identifier of a bundled script (e.g. "spider91") — confirm.
  builtin?: string;
  status: string;
  // Last error message reported for this crawler, if any.
  lastError?: string;
  scriptPath: string;
  pythonPath?: string;
  // Proxy URL as entered in the admin form (the form placeholder is "http://127.0.0.1:7890").
  proxy?: string;
  // Kept as a string because it is bound directly to a text input (placeholder "10").
  targetNew?: string;
  // NOTE(review): presumably a serialized JSON document with script-specific settings — confirm.
  configJson?: string;
  // NOTE(review): presumably a Unix timestamp in seconds, like the drive field of the same name — confirm.
  lastCrawlAt?: number;
  // Per-pipeline status objects: crawl, thumbnail, preview video and fingerprint.
  scanGenerationStatus?: DriveGenerationStatus;
  thumbnailGenerationStatus?: DriveGenerationStatus;
  previewGenerationStatus?: DriveGenerationStatus;
  fingerprintGenerationStatus?: DriveGenerationStatus;
  // Ready / pending / failed counters for thumbnails.
  thumbnailReadyCount: number;
  thumbnailPendingCount: number;
  thumbnailFailedCount: number;
  // Ready / pending / failed counters for teasers.
  teaserReadyCount: number;
  teaserPendingCount: number;
  teaserFailedCount: number;
  // Ready / pending / failed counters for fingerprints.
  fingerprintReadyCount: number;
  fingerprintPendingCount: number;
  fingerprintFailedCount: number;
};
/**
 * Request body accepted by `upsertCrawler()` (POST `/crawlers`).
 * NOTE(review): presumably the server creates or updates depending on whether `id` already exists — confirm.
 */
export type UpsertCrawlerInput = {
  id: string;
  name: string;
  // NOTE(review): looks like the identifier of a bundled script (e.g. "spider91") — confirm.
  builtin?: string;
  scriptPath: string;
  pythonPath?: string;
  proxy?: string;
  // String, not number: the admin form binds this to a text input.
  targetNew?: string;
  configJson?: string;
};
/**
 * Response shape shared by the two script-import endpoints
 * (`/crawlers/import-file` and `/crawlers/import-url`).
 */
export type ImportCrawlerScriptResult = {
  // NOTE(review): presumably the server-side location of the stored script — confirm.
  scriptPath: string;
};
/**
 * Fetch all crawlers known to the admin backend.
 *
 * No request options are passed, so `fetch` uses its default method (GET).
 *
 * @returns A promise resolving to the crawler list.
 */
export function listCrawlers(): Promise<AdminCrawler[]> {
  const endpoint = "/crawlers";
  return request<AdminCrawler[]>(endpoint);
}
/**
 * Submit a crawler definition to POST `/crawlers` as a JSON body.
 *
 * @param body Crawler fields to save.
 * @returns A promise resolving to `{ ok, warning? }` as reported by the server.
 */
export function upsertCrawler(body: UpsertCrawlerInput) {
  const init: RequestInit = {
    method: "POST",
    body: JSON.stringify(body),
  };
  return request<{ ok: boolean; warning?: string }>("/crawlers", init);
}
/**
 * Upload a local crawler script as multipart form data.
 *
 * The file is sent under the form field name "file" to POST `/crawlers/import-file`.
 *
 * @param file Script file picked by the user.
 * @returns A promise resolving to the import result (`scriptPath`).
 */
export function importCrawlerScriptFile(file: File) {
  const payload = new FormData();
  payload.append("file", file);

  const init: RequestInit = { method: "POST", body: payload };
  return request<ImportCrawlerScriptResult>("/crawlers/import-file", init);
}
/**
 * Ask the backend to import a crawler script from a remote URL.
 *
 * Sends `{ url }` as JSON to POST `/crawlers/import-url`.
 *
 * @param url Location of the script to import.
 * @returns A promise resolving to the import result (`scriptPath`).
 */
export function importCrawlerScriptURL(url: string) {
  const payload = JSON.stringify({ url });
  const init: RequestInit = { method: "POST", body: payload };
  return request<ImportCrawlerScriptResult>("/crawlers/import-url", init);
}
/**
 * Trigger a run of the given crawler (POST `/crawlers/{id}/run`).
 *
 * @param id Crawler id; URL-encoded before being placed in the path.
 * @returns A promise resolving to the server's `{ ok, accepted, message?, status? }` reply.
 */
export function runCrawler(id: string) {
  const safeId = encodeURIComponent(id);
  const endpoint = `/crawlers/${safeId}/run`;
  return request<{ ok: boolean; accepted: boolean; message?: string; status?: NightlyJobStatus }>(
    endpoint,
    { method: "POST" }
  );
}
/**
 * Request that the tasks of the given crawler be stopped
 * (POST `/crawlers/{id}/tasks/stop`).
 *
 * @param id Crawler id; URL-encoded before being placed in the path.
 * @returns A promise resolving to `{ ok, stopped }`.
 */
export function stopCrawlerTasks(id: string) {
  const safeId = encodeURIComponent(id);
  const endpoint = `/crawlers/${safeId}/tasks/stop`;
  return request<{ ok: boolean; stopped: boolean }>(endpoint, { method: "POST" });
}
/**
 * Delete a crawler (DELETE `/crawlers/{id}`).
 *
 * The request body always carries `deleteVideos: true`.
 *
 * @param id Crawler id; URL-encoded before being placed in the path.
 * @returns A promise resolving to `{ ok, deletedVideos }`.
 */
export function deleteCrawler(id: string) {
  const endpoint = `/crawlers/${encodeURIComponent(id)}`;
  const init: RequestInit = {
    method: "DELETE",
    body: JSON.stringify({ deleteVideos: true }),
  };
  return request<{ ok: boolean; deletedVideos: number }>(endpoint, init);
}
export type P123QRSession = {
loginUuid: string;
uniID: string;
+8 -4
View File
@@ -101,13 +101,17 @@ export function StatusTag({
error?: string;
hasCred: boolean;
}) {
if (kind === "spider91") {
return (
<span className="admin-status is-error" title={error || "请到爬虫管理添加爬虫脚本"}>
</span>
);
}
if (kind !== "spider91" && !hasCred) {
return <span className="admin-status is-pending"></span>;
}
if (status === "ok") {
if (kind === "spider91") {
return <span className="admin-status is-ok"></span>;
}
return <span className="admin-status is-ok"></span>;
}
if (status === "error")
@@ -205,7 +209,7 @@ export function DriveGenerationPanel({
<div className="admin-gen-columns">
<DriveGenCol
label={d.kind === "spider91" ? "抓取" : "扫盘"}
label={d.kind === "spider91" ? "已废弃" : "扫盘"}
status={d.scanGenerationStatus}
showCounts={false}
/>
-1
View File
@@ -26,7 +26,6 @@ const DRIVE_OPTIONS: DriveOption[] = [
{ kind: "onedrive", label: "OneDrive", abbr: "OD", desc: "302直链,微软网盘" },
{ kind: "googledrive", label: "Google Drive", abbr: "GD", desc: "服务器中转模式" },
{ kind: "localstorage", label: "本地存储", abbr: "Lo", desc: "本机文件目录" },
{ kind: "spider91", label: "91 爬虫", abbr: "91", desc: "自动抓取热门视频" },
{ kind: "quark", label: "夸克网盘", abbr: "Qk", desc: "302直链" },
{ kind: "wopan", label: "联通沃盘", abbr: "Wo", desc: "302直链" },
];
+1 -1
View File
@@ -163,7 +163,7 @@ export function credentialHelp(kind: Kind, isEdit: boolean): string {
case "localstorage":
return `填写服务器可访问的本地目录绝对路径,例如 /mnt/videos。系统会扫描该目录及子目录中的视频文件和 .strm 文件;.strm 可指向 HTTP/HTTPS 直链,或指向本地存储根目录内的真实视频路径。Docker 部署时请填写容器内路径。${note}`;
case "spider91":
return "91 爬虫会把定时抓取到的视频和封面先保存到本机,并作为一个视频来源接入站点;可按服务器网络情况单独配置代理。后续流水线会把较早的视频上传到你选择的 115 / PikPak / OneDrive 目标盘。";
return "91Spider 不再支持通过网盘添加或编辑。请到后台爬虫管理页面添加内置 91 或自定义爬虫脚本。";
default:
return "";
}
+34
View File
@@ -0,0 +1,34 @@
/** Props accepted by `SpiderIcon`. */
type SpiderIconProps = {
  // Used for the SVG's width and height; the component defaults it to 16.
  size?: number;
  // Forwarded to the root <svg> element.
  className?: string;
};
// Path data for the icon's line segments, kept in their original draw order.
const SPIDER_ICON_SEGMENTS = [
  "M12 7v3",
  "M9 5.5 7.5 3",
  "M15 5.5 16.5 3",
  "M9 10.5 4.5 8",
  "M15 10.5 19.5 8",
  "M8.5 13.5 3 13",
  "M15.5 13.5 21 13",
  "M9 16 5 20",
  "M15 16 19 20",
];

/**
 * Decorative spider icon drawn as a stroked 24x24 SVG.
 *
 * The stroke uses `currentColor`, so the icon inherits the surrounding text
 * color. It is hidden from assistive technology (`aria-hidden`) and removed
 * from focus order (`focusable="false"`).
 *
 * @param props.size      Rendered width and height; defaults to 16.
 * @param props.className Optional class for the root <svg>.
 */
export function SpiderIcon(props: SpiderIconProps) {
  const { size = 16, className } = props;
  return (
    <svg
      className={className}
      width={size}
      height={size}
      viewBox="0 0 24 24"
      fill="none"
      stroke="currentColor"
      strokeWidth="2"
      strokeLinecap="round"
      strokeLinejoin="round"
      aria-hidden="true"
      focusable="false"
    >
      {SPIDER_ICON_SEGMENTS.map((segment) => (
        <path key={segment} d={segment} />
      ))}
      <ellipse cx="12" cy="14" rx="4" ry="5" />
      <circle cx="12" cy="8" r="2.5" />
    </svg>
  );
}
+52
View File
@@ -308,6 +308,33 @@
margin-bottom: var(--space-3);
}
/* Crawler admin page: two-column grid. The first column is 0.8fr (min 260px),
   the second 1.2fr (min 360px); items align to the top of their row. */
.admin-crawler-layout {
  display: grid;
  grid-template-columns: minmax(260px, 0.8fr) minmax(360px, 1.2fr);
  gap: var(--space-4);
  align-items: start;
}
/* Detail area: children stacked in a grid with consistent spacing. */
.admin-crawler-detail {
  display: grid;
  gap: var(--space-4);
}
/* Status card spans every column of its grid container. */
.admin-crawler-status {
  grid-column: 1 / -1;
}
/* Teaser entries inside the crawler list take the full width. */
.admin-crawler-list .admin-drive-teaser {
  width: 100%;
}
/* Preset row: wrapping flex row with spacing below it. */
.admin-crawler-presets {
  display: flex;
  flex-wrap: wrap;
  gap: var(--space-2);
  margin-bottom: var(--space-3);
}
/* ----- Storage summary ----- */
.admin-storage-summary {
display: grid;
@@ -446,6 +473,21 @@
align-items: center;
}
/* Script import row: three columns (auto | flexible, min 180px | auto),
   vertically centered. */
.admin-crawler-import {
  display: grid;
  grid-template-columns: auto minmax(180px, 1fr) auto;
  gap: var(--space-2);
  align-items: center;
}
/* Visually hidden element that stays in the DOM but ignores pointer input.
   NOTE(review): presumably the native file input opened by a styled trigger — confirm. */
.admin-crawler-import__file {
  position: absolute;
  width: 1px;
  height: 1px;
  opacity: 0;
  pointer-events: none;
}
.admin-form__help {
font-size: var(--font-xs);
color: var(--text-faint);
@@ -1548,6 +1590,15 @@
width: 100%;
}
/* Collapse the import row to a single column.
   NOTE(review): the enclosing at-rule is not visible here — confirm which breakpoint applies. */
.admin-crawler-import {
  grid-template-columns: 1fr;
}
/* Import buttons become full-width with centered content. */
.admin-crawler-import .admin-btn {
  justify-content: center;
  width: 100%;
}
.admin-p123-qr__body {
grid-template-columns: 1fr;
}
@@ -3332,6 +3383,7 @@
}
@media (max-width: 900px) {
.admin-crawler-layout,
.admin-tags-layout {
grid-template-columns: 1fr;
}
+63 -11
View File
@@ -10,6 +10,18 @@ const driveComponentsSource = readFileSync(
new URL("../src/admin/drive/DriveComponents.tsx", import.meta.url),
"utf8"
);
// Raw text of the crawler admin page, read so tests can pattern-match it.
const crawlerPageSource = readFileSync(
  new URL("../src/admin/CrawlersPage.tsx", import.meta.url),
  "utf8"
);
// Raw text of the admin layout (navigation entries).
const adminLayoutSource = readFileSync(
  new URL("../src/admin/AdminLayout.tsx", import.meta.url),
  "utf8"
);
// Raw text of the application root (route table).
const appSource = readFileSync(
  new URL("../src/App.tsx", import.meta.url),
  "utf8"
);
const spider91UploadTargetSource = readFileSync(
new URL("../src/admin/drive/Spider91UploadTargetField.tsx", import.meta.url),
"utf8"
@@ -51,14 +63,15 @@ function assertDriveTypeOption(value: string, label: string) {
);
}
test("spider91 drive form does not expose advanced crawler credentials", () => {
assert.match(combinedSource, /key: "proxy"/);
assert.match(combinedSource, /label: "代理地址(可选)"/);
assert.match(combinedSource, /支持 http:\/\/、https:\/\/、socks5:\/\/、socks5h:\/\/代理/);
assert.doesNotMatch(combinedSource, /target_new/);
assert.doesNotMatch(combinedSource, /crawl_hour/);
assert.doesNotMatch(combinedSource, /python_path/);
assert.doesNotMatch(combinedSource, /script_path/);
// Neither crawler kind may be offered in the storage-drive type selector.
test("crawler sources are not selectable as storage drives", () => {
  for (const crawlerKind of ["spider91", "scriptcrawler"]) {
    const isOffered = driveTypeOptions().some((option) => option.value === crawlerKind);
    assert.ok(!isOffered, `${crawlerKind} should not be a storage drive option`);
  }
});
test("spider91 upload target uses explicit local-save option instead of auto target", () => {
@@ -185,12 +198,45 @@ test("drive type selector keeps primary source order", () => {
{ value: "onedrive", label: "OneDrive" },
{ value: "googledrive", label: "Google Drive" },
{ value: "localstorage", label: "本地存储" },
{ value: "spider91", label: "91 爬虫" },
{ value: "quark", label: "夸克网盘" },
{ value: "wopan", label: "联通沃盘" },
]);
});
// Crawler management must have its own nav entry, route, page and API surface,
// and must not leak into the storage-drive form. Assertions run in the same
// order as before; the patterns are just grouped by the source they inspect.
test("crawler management is a separate admin section", () => {
  const layoutPatterns = [
    /to="\/admin\/crawlers"/,
    /> 爬虫管理/,
    /SpiderIcon size=\{16\} \/> 爬虫管理/,
  ];
  for (const pattern of layoutPatterns) {
    assert.match(adminLayoutSource, pattern);
  }

  assert.match(appSource, /path="crawlers" element=\{<CrawlersPage \/>/);

  const pageRequired = [
    /export function CrawlersPage/,
    /SpiderIcon/,
    /添加爬虫/,
    /返回列表/,
    /setMode\("detail"\)/,
    /setMode\("list"\)/,
    /api\.listCrawlers/,
    /api\.upsertCrawler/,
    /api\.runCrawler/,
    /api\.stopCrawlerTasks/,
    /api\.deleteCrawler/,
    /api\.importCrawlerScriptFile/,
    /api\.importCrawlerScriptURL/,
    /type="file"/,
    /链接导入/,
  ];
  for (const pattern of pageRequired) {
    assert.match(crawlerPageSource, pattern);
  }

  // Wording and controls that must stay out of the simplified crawler page.
  const pageForbidden = [
    /新建脚本/,
    /脚本路径/,
    /Python 解释器/,
    /自定义配置 JSON/,
    /Bot/,
  ];
  for (const pattern of pageForbidden) {
    assert.doesNotMatch(crawlerPageSource, pattern);
  }

  assert.match(crawlerPageSource, /builtin:\s*"spider91"/);

  const apiRequired = [
    /type AdminCrawler/,
    /"\/crawlers"/,
    /"\/crawlers\/import-file"/,
    /"\/crawlers\/import-url"/,
    /new FormData\(\)/,
  ];
  for (const pattern of apiRequired) {
    assert.match(apiSource, pattern);
  }

  assert.doesNotMatch(driveFormSource, /scriptcrawler/);
});
test("drive cards use configured abbreviations and visible fallback icon colors", () => {
assert.match(constantsSource, /googledrive:\s*"GD"/);
assert.match(constantsSource, /function driveKindAbbr\(kind: string\)/);
@@ -230,10 +276,9 @@ test("nightly scan duplicate trigger uses full-scan busy message", () => {
});
test("drive generation panel shows scan or crawler status first", () => {
assert.match(driveComponentsSource, /label=\{d\.kind === "spider91" \? "抓取" : "扫盘"\}/);
assert.match(driveComponentsSource, /label=\{d\.kind === "spider91" \? "已废弃" : "扫盘"\}/);
assert.match(driveComponentsSource, /status=\{d\.scanGenerationStatus\}/);
assert.match(driveComponentsSource, /showCounts=\{false\}/);
assert.match(driveComponentsSource, /label === "抓取" && state === "scanning" \? "抓取中"/);
assert.match(driveComponentsSource, /status\?\.scannedCount/);
assert.match(driveComponentsSource, /预计新增/);
assert.match(apiSource, /scannedCount:\s*number/);
@@ -241,6 +286,13 @@ test("drive generation panel shows scan or crawler status first", () => {
assert.match(constantsSource, /if \(state === "scanning"\) return "扫盘中"/);
});
// Legacy spider91 drives stay visible but can no longer be run or edited.
test("legacy spider91 storage is disabled in drive management", () => {
  const drivesPageExpectations = [
    /91Spider 不再支持通过网盘运行,请到爬虫管理添加爬虫脚本/,
    /disabled=\{d\.kind === "spider91"\}/,
    /已废弃,请到爬虫管理添加/,
  ];
  drivesPageExpectations.forEach((pattern) => {
    assert.match(drivesPageSource, pattern);
  });

  assert.match(constantsSource, /91Spider 不再支持通过网盘添加或编辑/);
});
test("drive detail selection is stored in the URL history", () => {
assert.match(drivesPageSource, /useSearchParams/);
assert.match(drivesPageSource, /searchParams\.get\("drive"\)/);