mirror of
https://github.com/nianzhibai/91.git
synced 2026-06-15 08:45:41 +08:00
feat(crawler): redesign crawler scripts and admin workflow
- add generic scriptcrawler backend runner using the crawler.v1 JSONL protocol - support crawler script upload and HTTP(S) URL import from the admin crawler page - simplify the user-facing crawler contract to title, media_url, optional thumbnail_url and optional source_id - convert Spider91 into a normal script crawler and reject new Spider91 storage-drive configs - keep legacy Spider91 storage rows visible only for cleanup/deletion - add crawler protocol docs, example script, admin UI, tests and migration coverage
This commit is contained in:
@@ -12,6 +12,9 @@
|
||||
pip install requests beautifulsoup4 lxml PySocks
|
||||
|
||||
使用方法:
|
||||
# 作为 video-site-91 通用爬虫脚本运行(后台会自动这样调用)
|
||||
python spider_91porn.py --job /path/to/job.json
|
||||
|
||||
# 全量爬取(默认行为,从 page=1 一直爬到末尾,写到 OUTPUT_FILE)
|
||||
python spider_91porn.py
|
||||
|
||||
@@ -22,6 +25,7 @@
|
||||
python spider_91porn.py --target-new 15 --seen-viewkeys-file /tmp/seen.txt --output /tmp/new.json
|
||||
|
||||
CLI 参数:
|
||||
--job FILE crawler.v1 job JSON 路径;后台爬虫管理会使用此模式
|
||||
--page N 只爬第 N 页,配合 --output 用于手动调试
|
||||
--target-new N 从 page 1 起翻页直到凑够 N 个新视频(不在 seen 列表里的)
|
||||
--seen-viewkeys-file FILE 每行一个已知 viewkey 或 mp4 源 ID,命中即跳过;与 --target-new 配合使用
|
||||
@@ -37,6 +41,8 @@ CLI 参数:
|
||||
- OUTPUT_FILE : 输出文件名
|
||||
|
||||
输出格式 (JSON):
|
||||
--job 模式下 stdout 输出 crawler.v1 JSON Lines,日志全部写到 stderr。
|
||||
手动运行模式仍会写传统 JSON 文件:
|
||||
{
|
||||
"videos": [
|
||||
{
|
||||
@@ -77,8 +83,8 @@ from datetime import datetime
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
except ImportError:
|
||||
print("错误: 缺少依赖库 beautifulsoup4")
|
||||
print("请运行: pip install beautifulsoup4 lxml")
|
||||
print("错误: 缺少依赖库 beautifulsoup4", file=sys.stderr)
|
||||
print("请运行: pip install beautifulsoup4 lxml", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -148,9 +154,23 @@ OUTPUT_FILE = "91porn_videos.json"
|
||||
MAX_PAGES = None # 设置为 None 爬取所有页,或设置整数如 5 只爬前5页
|
||||
RESUME = True # 是否跳过输出文件中已存在的 viewkey (断点续爬)
|
||||
MAX_EMPTY_PAGES = 2 # 连续空页数达到此值时停止爬取
|
||||
CRAWLER_PROTOCOL = "crawler.v1"
|
||||
# ===================================================
|
||||
|
||||
|
||||
def crawler_source_id(raw: str) -> str:
    """Return a backend-safe source_id, preserving existing numeric 91 IDs.

    Falsy input yields an empty string. Otherwise the value is stringified and
    trimmed, every run of characters outside ``[A-Za-z0-9_.-]`` collapses to a
    single underscore, leading/trailing ``.``, ``_`` and ``-`` are removed, and
    the result is capped at 160 characters.
    """
    text = str(raw or "").strip()
    if text:
        collapsed = re.sub(r"[^A-Za-z0-9_.-]+", "_", text)
        # Trim separators first, then truncate (same order as before).
        return collapsed.strip("._-")[:160]
    return ""
|
||||
|
||||
|
||||
def write_jsonl(event: dict):
    """Emit one event as a single JSON line on stdout and flush immediately.

    Non-ASCII characters are written as-is instead of being \\u-escaped.
    """
    line = json.dumps(event, ensure_ascii=False)
    print(line, flush=True)
|
||||
|
||||
|
||||
class Porn91Spider:
|
||||
def __init__(
|
||||
self,
|
||||
@@ -163,6 +183,7 @@ class Porn91Spider:
|
||||
target_new: int = None,
|
||||
seen_viewkeys: list = None,
|
||||
stream_output: bool = False,
|
||||
stream_protocol: str = "legacy",
|
||||
):
|
||||
"""
|
||||
构造函数。所有参数都有默认值,等同于使用脚本顶部的全局配置。
|
||||
@@ -198,6 +219,7 @@ class Porn91Spider:
|
||||
# (配合 backend Go 端 bufio.Scanner 实时消费,下载一个就开始下一个)。
|
||||
# 开启后所有 log 都走 stderr。
|
||||
self.stream_output = bool(stream_output)
|
||||
self.stream_protocol = stream_protocol or "legacy"
|
||||
|
||||
# 添加重试适配器
|
||||
try:
|
||||
@@ -263,7 +285,28 @@ class Porn91Spider:
|
||||
if not self.stream_output:
|
||||
return
|
||||
try:
|
||||
print(json.dumps(video, ensure_ascii=False), flush=True)
|
||||
if self.stream_protocol == "crawler.v1":
|
||||
source_id = crawler_source_id(video.get("source_id") or video.get("viewkey") or "")
|
||||
item = {
|
||||
"title": video.get("title") or "",
|
||||
"detail_url": video.get("detail_url") or "",
|
||||
"author": "91porn",
|
||||
"tags": ["91porn"],
|
||||
"media_url": video.get("video_url") or "",
|
||||
"thumbnail_url": video.get("thumb_url") or "",
|
||||
"headers": {
|
||||
"Referer": video.get("detail_url") or BASE_URL,
|
||||
},
|
||||
}
|
||||
if source_id:
|
||||
item["source_id"] = source_id
|
||||
event = {
|
||||
"type": "item",
|
||||
"item": item,
|
||||
}
|
||||
write_jsonl(event)
|
||||
else:
|
||||
print(json.dumps(video, ensure_ascii=False), flush=True)
|
||||
except Exception as e:
|
||||
# stdout 异常基本只在管道断开时发生(消费方进程死了);
|
||||
# 写到 stderr 让 backend 看到,然后让 crawl 循环自己 break。
|
||||
@@ -697,8 +740,9 @@ class Porn91Spider:
|
||||
except Exception as e:
|
||||
self.log(f"保存文件失败: {e}")
|
||||
# 尝试输出到控制台作为备份
|
||||
print("\n--- 备份输出 ---")
|
||||
print(json.dumps(output_data, ensure_ascii=False, indent=2))
|
||||
backup_out = sys.stderr if self.stream_output else sys.stdout
|
||||
print("\n--- 备份输出 ---", file=backup_out, flush=True)
|
||||
print(json.dumps(output_data, ensure_ascii=False, indent=2), file=backup_out, flush=True)
|
||||
|
||||
def _print_summary(self):
|
||||
"""
|
||||
@@ -751,6 +795,84 @@ def print_help():
|
||||
""")
|
||||
|
||||
|
||||
def run_job(job_path: str):
    """Run as a crawler.v1 script plugin.

    The Go host passes a job JSON file and expects stdout JSONL events. Logs go
    to stderr so stdout stays machine-readable.

    Job keys read here: ``protocol``, ``mode``, ``target_new``,
    ``seen_source_ids_file``, ``output_dir``, ``run_id`` and
    ``network.proxy_url``.

    Args:
        job_path: Path to the job JSON file.

    Raises:
        ValueError: If the job is not a JSON object, or its protocol/mode is
            not supported.
        KeyboardInterrupt: Re-raised after the already-crawled data is saved.
    """
    # Function-scope import: the file is only seen importing the ``datetime``
    # class, and ``timezone`` is needed for an aware UTC timestamp.
    from datetime import timezone

    with open(job_path, "r", encoding="utf-8") as f:
        job = json.load(f)

    # A top-level list/string/number would otherwise fail later with an
    # unhelpful AttributeError on ``job.get``.
    if not isinstance(job, dict):
        raise ValueError(
            f"crawler job must be a JSON object, got {type(job).__name__}"
        )
    if job.get("protocol") != CRAWLER_PROTOCOL:
        raise ValueError(f"unsupported crawler protocol: {job.get('protocol')!r}")
    if job.get("mode") not in ("", None, "crawl"):
        raise ValueError(f"unsupported crawler mode: {job.get('mode')!r}")

    # Missing, malformed or non-positive target_new all fall back to 15.
    try:
        target_new = int(job.get("target_new") or 15)
    except (TypeError, ValueError):
        target_new = 15
    if target_new <= 0:
        target_new = 15

    seen_file = job.get("seen_source_ids_file") or ""
    output_dir = job.get("output_dir") or os.getcwd()
    # datetime.utcnow() is deprecated since Python 3.12; the aware equivalent
    # renders the exact same "%Y%m%dT%H%M%SZ" string.
    run_id = job.get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"spider91-{run_id}.json")

    network = job.get("network") if isinstance(job.get("network"), dict) else {}
    proxy_url = str(network.get("proxy_url") or "").strip()
    if proxy_url:
        # Set both upper- and lower-case variants, and blank the NO_PROXY
        # variants, exactly as the individual assignments did.
        for key in ("HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy"):
            os.environ[key] = proxy_url
        for key in ("NO_PROXY", "no_proxy"):
            os.environ[key] = ""

    # One already-known source ID per non-blank line; a missing or unreadable
    # file only produces a warning on stderr and the crawl continues.
    seen_viewkeys = []
    if seen_file:
        try:
            with open(seen_file, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        seen_viewkeys.append(line)
        except FileNotFoundError:
            print(f"警告: seen_source_ids_file 不存在: {seen_file}", file=sys.stderr, flush=True)
        except Exception as e:
            print(f"警告: 读取 seen_source_ids_file 失败: {e}", file=sys.stderr, flush=True)

    prefer_ipv4_for_plain_socks5_proxy()
    spider = Porn91Spider(
        output_file=output_file,
        start_page=1,
        max_pages=None,
        resume=False,
        quiet=True,
        target_new=target_new,
        seen_viewkeys=seen_viewkeys,
        stream_output=True,
        stream_protocol="crawler.v1",
    )
    try:
        spider.crawl()
        # Final event reporting the spider's counters to the host.
        done = {
            "type": "done",
            "stats": {
                "emitted": spider.processed_videos,
                "failed": spider.failed_videos,
                "skipped": spider.skipped_videos,
            },
        }
        write_jsonl(done)
    except KeyboardInterrupt:
        spider.log("\n用户中断,正在保存已爬取的数据...")
        spider._save_results()
        raise
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) > 1 and sys.argv[1] in ('-h', '--help', 'help'):
|
||||
print_help()
|
||||
@@ -778,8 +900,14 @@ def main():
|
||||
parser.add_argument("--stream-output", action="store_true",
|
||||
help="流式模式:每解析一条视频直链就立即把它作为一行 JSON 写到 stdout 并 flush;"
|
||||
"日志改走 stderr。配合 backend 边读边下载使用。")
|
||||
parser.add_argument("--job", type=str, default=None,
|
||||
help="crawler.v1 job JSON 路径;作为通用脚本爬虫运行。")
|
||||
|
||||
args, _ = parser.parse_known_args()
|
||||
if args.job:
|
||||
run_job(args.job)
|
||||
return
|
||||
|
||||
cli_out = sys.stderr if args.stream_output else sys.stdout
|
||||
prefer_ipv4_for_plain_socks5_proxy()
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@ internal/
|
||||
onedrive/ OneDrive(OpenList 在线续期 + Microsoft Graph 文件接口)
|
||||
googledrive/ Google Drive(OpenList 在线续期 + Google Drive API;播放走后端代理)
|
||||
localstorage/ 本地目录扫描(服务器已有视频目录)
|
||||
scriptcrawler/ 通用脚本爬虫输出的本地媒体适配层
|
||||
scanner/ 扫目录 → 落库
|
||||
preview/ ffmpeg 抽封面和生成多段预览视频
|
||||
proxy/ /p/stream/*、/p/preview/* 代理
|
||||
@@ -79,6 +80,12 @@ npm run preview 前端 9191,无热更新
|
||||
go run ./cmd/server 后端 9192
|
||||
```
|
||||
|
||||
## 爬虫脚本
|
||||
|
||||
爬虫现在是独立后台栏目 `/admin/crawlers`,不再作为“网盘/存储类型”配置。脚本负责发现视频,后端负责去重、下载、入库、封面、预览视频和视频指纹。
|
||||
|
||||
脚本协议见 [docs/crawler-protocol.md](../docs/crawler-protocol.md)。后台支持上传 `.py` 文件或通过 HTTP(S) 脚本链接导入,导入后的脚本会保存到数据目录旁的 `crawler-scripts/`。内置 91 爬虫也支持同一套 `crawler.v1` job 协议;后台“内置 91”会自动使用仓库里的 `91VideoSpider/spider_91porn.py`。
|
||||
|
||||
## 添加一个盘
|
||||
|
||||
推荐在前端管理后台 `/admin/drives` 新增网盘。保存后会立即挂载并触发扫描;视频结果可在 `/admin/videos` 按网盘查看,每页 100 条,页面会同时显示各网盘预览视频已生成、待生成、失败数量。
|
||||
|
||||
+156
-95
@@ -33,6 +33,7 @@ import (
|
||||
"github.com/video-site/backend/internal/drives/p123"
|
||||
"github.com/video-site/backend/internal/drives/pikpak"
|
||||
"github.com/video-site/backend/internal/drives/quark"
|
||||
"github.com/video-site/backend/internal/drives/scriptcrawler"
|
||||
"github.com/video-site/backend/internal/drives/spider91"
|
||||
"github.com/video-site/backend/internal/drives/wopan"
|
||||
"github.com/video-site/backend/internal/fingerprint"
|
||||
@@ -45,6 +46,7 @@ import (
|
||||
)
|
||||
|
||||
const fingerprintReconcileInterval = time.Minute
|
||||
const legacySpider91DriveUnsupported = "91Spider 已不再支持作为网盘配置,请在爬虫管理页面添加爬虫脚本"
|
||||
|
||||
func main() {
|
||||
cfgPath := "./config.yaml"
|
||||
@@ -76,7 +78,7 @@ func main() {
|
||||
workers: make(map[string]*preview.Worker),
|
||||
thumbWorkers: make(map[string]*preview.ThumbWorker),
|
||||
fingerprintWorkers: make(map[string]*fingerprint.Worker),
|
||||
spider91Crawlers: make(map[string]*spider91.Crawler),
|
||||
scriptCrawlers: make(map[string]*scriptcrawler.Crawler),
|
||||
}
|
||||
app.proxy = proxy.New(app.registry)
|
||||
app.spider91Migrator = spider91migrate.New(spider91migrate.Config{
|
||||
@@ -171,13 +173,23 @@ func main() {
|
||||
app.detachDrive(driveID)
|
||||
},
|
||||
OnScanRequested: func(driveID string) bool {
|
||||
// spider91 的"重扫"等同于手动触发一次爬取;其它 drive 走标准 scan
|
||||
app.mu.Lock()
|
||||
_, isSpider91 := app.spider91Crawlers[driveID]
|
||||
app.mu.Unlock()
|
||||
// 爬虫类 drive 的"重扫"等同于手动触发一次爬取;其它 drive 走标准 scan
|
||||
isSpider91 := false
|
||||
isScriptCrawler := false
|
||||
if d, err := app.cat.GetDrive(ctx, driveID); err == nil && d != nil {
|
||||
if d.Kind == spider91.Kind {
|
||||
log.Printf("[spider91] drive=%s is a deprecated storage crawler, ignore scan request", driveID)
|
||||
return false
|
||||
}
|
||||
isSpider91 = scriptCrawlerSourceKindForDrive(d) == spider91.Kind
|
||||
isScriptCrawler = d.Kind == scriptcrawler.Kind
|
||||
}
|
||||
if isSpider91 {
|
||||
return app.scheduleSpider91Crawl(ctx, driveID)
|
||||
}
|
||||
if isScriptCrawler {
|
||||
return app.scheduleScriptCrawlerCrawl(ctx, driveID)
|
||||
}
|
||||
return app.scheduleScan(ctx, driveID)
|
||||
},
|
||||
OnStopDriveTasks: func(driveID string) bool {
|
||||
@@ -227,6 +239,9 @@ func main() {
|
||||
SetSpider91UploadDriveID: func(id string) error {
|
||||
return app.SetSpider91UploadDriveID(ctx, id)
|
||||
},
|
||||
DefaultSpider91ScriptPath: func() string {
|
||||
return app.defaultSpider91ScriptPath()
|
||||
},
|
||||
OnRunNightlyJob: func() bool {
|
||||
if app.nightlyRunner != nil {
|
||||
return app.nightlyRunner.TriggerNow()
|
||||
@@ -304,8 +319,9 @@ type App struct {
|
||||
thumbWorkers map[string]*preview.ThumbWorker
|
||||
fingerprintWorkers map[string]*fingerprint.Worker
|
||||
cancels map[string]context.CancelFunc
|
||||
// spider91Crawlers 按 driveID 索引,每个 spider91 drive 独立一个 Crawler
|
||||
spider91Crawlers map[string]*spider91.Crawler
|
||||
// scriptCrawlers 按 driveID 索引,每个脚本爬虫 drive 独立一个 Crawler。
|
||||
// 内置 Spider91 也走这里,只是 SourceKind=spider91,以兼容历史 video id。
|
||||
scriptCrawlers map[string]*scriptcrawler.Crawler
|
||||
|
||||
// driveAttachMu 串行化云盘挂载/重挂载。挂载会访问上游服务,可能较慢;
|
||||
// 串行化可以避免启动后台挂载和手动扫盘按需挂载同一个 drive 时重复创建 worker。
|
||||
@@ -737,11 +753,16 @@ func (a *App) attachDriveUnlocked(ctx context.Context, d *catalog.Drive) error {
|
||||
ID: d.ID,
|
||||
RootPath: d.Credentials["path"],
|
||||
})
|
||||
case spider91.Kind:
|
||||
drv = spider91.New(spider91.Config{
|
||||
case scriptcrawler.Kind:
|
||||
drv = scriptcrawler.New(scriptcrawler.Config{
|
||||
ID: d.ID,
|
||||
RootDir: a.spider91DriveDir(d.ID),
|
||||
RootDir: a.scriptCrawlerDriveDirForDrive(d),
|
||||
})
|
||||
case spider91.Kind:
|
||||
d.Status = "error"
|
||||
d.LastError = legacySpider91DriveUnsupported
|
||||
_ = a.cat.UpsertDrive(ctx, d)
|
||||
return errors.New(legacySpider91DriveUnsupported)
|
||||
default:
|
||||
return fmt.Errorf("unknown drive kind: %s", d.Kind)
|
||||
}
|
||||
@@ -761,9 +782,8 @@ func (a *App) attachDriveUnlocked(ctx context.Context, d *catalog.Drive) error {
|
||||
|
||||
a.startDriveGenerationWorkers(ctx, d.ID, drv, true)
|
||||
|
||||
// spider91 driver 还需要一个 crawler,挂在专用 map 里供 crawlerLoop 调用
|
||||
if sd, ok := drv.(*spider91.Driver); ok {
|
||||
a.attachSpider91Crawler(d, sd)
|
||||
if sd, ok := drv.(*scriptcrawler.Driver); ok {
|
||||
a.attachScriptCrawler(d, sd)
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -836,6 +856,26 @@ func (a *App) spider91DriveDir(driveID string) string {
|
||||
return filepath.Join(a.spider91RootDir(), driveID)
|
||||
}
|
||||
|
||||
// scriptCrawlerRootDir 是所有通用脚本爬虫 drive 共享的根目录。
|
||||
func (a *App) scriptCrawlerRootDir() string {
|
||||
return filepath.Join(filepath.Dir(a.cfg.Storage.LocalPreviewDir), "scriptcrawlers")
|
||||
}
|
||||
|
||||
// scriptCrawlerDriveDir 是单个 scriptcrawler drive 的存储目录:<root>/<driveID>。
|
||||
func (a *App) scriptCrawlerDriveDir(driveID string) string {
|
||||
return filepath.Join(a.scriptCrawlerRootDir(), driveID)
|
||||
}
|
||||
|
||||
func (a *App) scriptCrawlerDriveDirForDrive(d *catalog.Drive) string {
|
||||
if d != nil && scriptCrawlerSourceKindForDrive(d) == spider91.Kind {
|
||||
return a.spider91DriveDir(d.ID)
|
||||
}
|
||||
if d == nil {
|
||||
return a.scriptCrawlerDriveDir("")
|
||||
}
|
||||
return a.scriptCrawlerDriveDir(d.ID)
|
||||
}
|
||||
|
||||
// commonThumbsDir 是所有 drive 共享的封面目录,/p/thumb/{videoID} 路由命中这里。
|
||||
func (a *App) commonThumbsDir() string {
|
||||
return filepath.Join(a.cfg.Storage.LocalPreviewDir, "thumbs")
|
||||
@@ -865,77 +905,72 @@ func (a *App) defaultSpider91ScriptPath() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// attachSpider91Crawler 创建该 drive 对应的 Crawler 并注册到 a.spider91Crawlers。
|
||||
func (a *App) attachSpider91Crawler(d *catalog.Drive, drv *spider91.Driver) {
|
||||
// attachScriptCrawler 创建通用脚本爬虫 runner,并注册到 a.scriptCrawlers。
|
||||
func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) {
|
||||
pythonPath := strings.TrimSpace(d.Credentials["python_path"])
|
||||
if pythonPath == "" {
|
||||
pythonPath = "python3"
|
||||
}
|
||||
scriptPath := strings.TrimSpace(d.Credentials["script_path"])
|
||||
if scriptPath == "" {
|
||||
sourceKind := scriptCrawlerSourceKindForDrive(d)
|
||||
if scriptPath == "" && sourceKind == spider91.Kind {
|
||||
scriptPath = a.defaultSpider91ScriptPath()
|
||||
}
|
||||
// 91porn CDN 在海外;空缺时回退到 HTTPS_PROXY / HTTP_PROXY 环境变量。
|
||||
proxyURL := strings.TrimSpace(d.Credentials["proxy"])
|
||||
configJSON := strings.TrimSpace(d.Credentials["config_json"])
|
||||
workDir := ""
|
||||
if scriptPath != "" {
|
||||
workDir = filepath.Dir(scriptPath)
|
||||
}
|
||||
|
||||
driveID := d.ID
|
||||
var progressMu sync.Mutex
|
||||
checkedVideos := 0
|
||||
expectedNewVideos := 0
|
||||
updateProgress := func(scanned, added int) {
|
||||
a.updateDriveScanProgress(driveID, scanned, added)
|
||||
}
|
||||
c := spider91.NewCrawler(spider91.CrawlerConfig{
|
||||
c := scriptcrawler.NewCrawler(scriptcrawler.CrawlerConfig{
|
||||
Driver: drv,
|
||||
Catalog: a.cat,
|
||||
SourceKind: sourceKind,
|
||||
PythonPath: pythonPath,
|
||||
ScriptPath: scriptPath,
|
||||
WorkDir: filepath.Dir(scriptPath),
|
||||
WorkDir: workDir,
|
||||
CommonThumbDir: a.commonThumbsDir(),
|
||||
ProxyURL: proxyURL,
|
||||
OnProgress: func(progress spider91.CrawlProgress) {
|
||||
progressMu.Lock()
|
||||
if progress.TotalEntries == 0 && progress.NewVideos == 0 && progress.Skipped == 0 && progress.Failed == 0 {
|
||||
checkedVideos = 0
|
||||
expectedNewVideos = 0
|
||||
} else if progress.TotalEntries > expectedNewVideos {
|
||||
expectedNewVideos = progress.TotalEntries
|
||||
ConfigJSON: configJSON,
|
||||
OnProgress: func(progress scriptcrawler.CrawlProgress) {
|
||||
scanned := progress.Checked
|
||||
if scanned < progress.TotalEntries {
|
||||
scanned = progress.TotalEntries
|
||||
}
|
||||
scanned := checkedVideos
|
||||
added := expectedNewVideos
|
||||
progressMu.Unlock()
|
||||
updateProgress(scanned, added)
|
||||
added := progress.Emitted
|
||||
if added < progress.NewVideos {
|
||||
added = progress.NewVideos
|
||||
}
|
||||
a.updateDriveScanProgress(driveID, scanned, added)
|
||||
},
|
||||
OnCheckedVideo: func() {
|
||||
progressMu.Lock()
|
||||
checkedVideos++
|
||||
scanned := checkedVideos
|
||||
added := expectedNewVideos
|
||||
progressMu.Unlock()
|
||||
updateProgress(scanned, added)
|
||||
},
|
||||
OnExtractedVideo: func() {
|
||||
progressMu.Lock()
|
||||
expectedNewVideos++
|
||||
scanned := checkedVideos
|
||||
added := expectedNewVideos
|
||||
progressMu.Unlock()
|
||||
updateProgress(scanned, added)
|
||||
},
|
||||
// 新流程:预览视频不在每条视频入库时立即入队,而是 RunOnce 全部下完后由
|
||||
// runSpider91Crawl 统一调 enqueueDriveGeneration 一次性入队。这样:
|
||||
// - 下载阶段不和 ffmpeg 抢 CPU/IO
|
||||
// - "等待预览视频队列 idle" 在 nightly Phase 2 的语义上更直观
|
||||
// 不再传 OnNewVideo(crawler 内部的回调字段保留,仅为单测计数器之用)。
|
||||
})
|
||||
|
||||
a.mu.Lock()
|
||||
a.spider91Crawlers[driveID] = c
|
||||
a.scriptCrawlers[driveID] = c
|
||||
a.mu.Unlock()
|
||||
|
||||
// 确保 "91porn" 系统标签存在,并按 spider91 来源前缀给历史视频补打。
|
||||
// 不能只靠文本匹配:老版本入库的视频可能没有 author/tags 字段,但 id 前缀
|
||||
// "spider91-<driveID>-" 会一直保留,即使后续迁移到 PikPak/115 也不变。
|
||||
if sourceKind == spider91.Kind {
|
||||
a.ensureSpider91SourceTag(driveID)
|
||||
}
|
||||
}
|
||||
|
||||
func scriptCrawlerSourceKindForDrive(d *catalog.Drive) string {
|
||||
if d == nil {
|
||||
return scriptcrawler.Kind
|
||||
}
|
||||
if d.Kind == scriptcrawler.Kind && strings.EqualFold(strings.TrimSpace(d.Credentials["builtin"]), spider91.Kind) {
|
||||
return spider91.Kind
|
||||
}
|
||||
return scriptcrawler.Kind
|
||||
}
|
||||
|
||||
func isSpider91SourceDrive(d *catalog.Drive) bool {
|
||||
return d != nil && (strings.EqualFold(d.Kind, spider91.Kind) || scriptCrawlerSourceKindForDrive(d) == spider91.Kind)
|
||||
}
|
||||
|
||||
func (a *App) ensureSpider91SourceTag(driveID string) {
|
||||
bgCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
go func() {
|
||||
defer cancel()
|
||||
@@ -1455,7 +1490,7 @@ func (a *App) detachDrive(id string) {
|
||||
delete(a.workers, id)
|
||||
delete(a.thumbWorkers, id)
|
||||
delete(a.fingerprintWorkers, id)
|
||||
delete(a.spider91Crawlers, id)
|
||||
delete(a.scriptCrawlers, id)
|
||||
a.mu.Unlock()
|
||||
}
|
||||
|
||||
@@ -1611,7 +1646,7 @@ func (a *App) runScanWithTaskContext(ctx context.Context, driveID string) {
|
||||
// spider91 / localupload 走自己的生命周期管理,不应该参与扫描清理;
|
||||
// stats.Errors > 0 时(云盘 API 中途抖动)保守起见跳过这一轮,避免把
|
||||
// "暂时列不出来"误认成"被用户删了"。
|
||||
if drv.Kind() != spider91.Kind && drv.ID() != localupload.DriveID {
|
||||
if drv.Kind() != spider91.Kind && drv.Kind() != scriptcrawler.Kind && drv.ID() != localupload.DriveID {
|
||||
if stats.Errors > 0 {
|
||||
log.Printf("[cleanup] skip stale cleanup for drive=%s kind=%s: scan had %d directory errors", driveID, drv.Kind(), stats.Errors)
|
||||
} else {
|
||||
@@ -1736,7 +1771,7 @@ func (a *App) spider91OriginFromVideo(ctx context.Context, v *catalog.Video) (st
|
||||
if a == nil || v == nil {
|
||||
return "", ""
|
||||
}
|
||||
if d, err := a.cat.GetDrive(ctx, v.DriveID); err == nil && d != nil && d.Kind == spider91.Kind {
|
||||
if d, err := a.cat.GetDrive(ctx, v.DriveID); err == nil && d != nil && isSpider91SourceDrive(d) {
|
||||
prefix := "spider91-" + d.ID + "-"
|
||||
if strings.HasPrefix(v.ID, prefix) {
|
||||
return d.ID, strings.TrimPrefix(v.ID, prefix)
|
||||
@@ -1749,7 +1784,7 @@ func (a *App) spider91OriginFromVideo(ctx context.Context, v *catalog.Video) (st
|
||||
bestDriveID := ""
|
||||
bestSourceID := ""
|
||||
for _, d := range drives {
|
||||
if d == nil || d.Kind != spider91.Kind {
|
||||
if d == nil || !isSpider91SourceDrive(d) {
|
||||
continue
|
||||
}
|
||||
prefix := "spider91-" + d.ID + "-"
|
||||
@@ -1839,7 +1874,7 @@ func (a *App) cleanupDriveVideosForDelete(ctx context.Context, driveID string) (
|
||||
}
|
||||
}
|
||||
|
||||
if strings.EqualFold(d.Kind, spider91.Kind) {
|
||||
if isSpider91SourceDrive(d) {
|
||||
if err := a.removeSpider91DriveDir(driveID); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
@@ -1924,7 +1959,7 @@ func (a *App) videosForDriveDelete(ctx context.Context, d *catalog.Drive) ([]*ca
|
||||
byID[v.ID] = v
|
||||
}
|
||||
|
||||
if strings.EqualFold(d.Kind, spider91.Kind) {
|
||||
if isSpider91SourceDrive(d) {
|
||||
prefix := "spider91-" + d.ID + "-"
|
||||
originItems, err := a.cat.ListVideosByIDPrefix(ctx, prefix)
|
||||
if err != nil {
|
||||
@@ -2380,7 +2415,7 @@ func (a *App) regenFailedFingerprints(ctx context.Context, driveID string) {
|
||||
}
|
||||
|
||||
// listScanTargetIDs 返回 nightly Phase 1 应扫描的所有 drive ID
|
||||
// (非 spider91、非 localupload)。它直接读 catalog,而不是 registry,这样
|
||||
// (非爬虫、非 localupload)。它直接读 catalog,而不是 registry,这样
|
||||
// 进程刚启动、云盘还在后台挂载时,nightly 也不会漏掉配置过的 drive。
|
||||
func (a *App) listScanTargetIDs(ctx context.Context) []string {
|
||||
all, err := a.cat.ListDrives(ctx)
|
||||
@@ -2390,7 +2425,7 @@ func (a *App) listScanTargetIDs(ctx context.Context) []string {
|
||||
}
|
||||
out := make([]string, 0, len(all))
|
||||
for _, d := range all {
|
||||
if d == nil || d.ID == localupload.DriveID || d.Kind == spider91.Kind {
|
||||
if d == nil || d.ID == localupload.DriveID || d.Kind == spider91.Kind || d.Kind == scriptcrawler.Kind {
|
||||
continue
|
||||
}
|
||||
out = append(out, d.ID)
|
||||
@@ -2398,7 +2433,7 @@ func (a *App) listScanTargetIDs(ctx context.Context) []string {
|
||||
return out
|
||||
}
|
||||
|
||||
// listSpider91DriveIDs 返回 nightly Phase 2 应触发爬取的 spider91 drive ID 列表。
|
||||
// listSpider91DriveIDs 返回 nightly Phase 2 应触发爬取的爬虫 drive ID 列表。
|
||||
func (a *App) listSpider91DriveIDs(ctx context.Context) []string {
|
||||
all, err := a.cat.ListDrives(ctx)
|
||||
if err != nil {
|
||||
@@ -2407,7 +2442,7 @@ func (a *App) listSpider91DriveIDs(ctx context.Context) []string {
|
||||
}
|
||||
out := make([]string, 0, len(all))
|
||||
for _, d := range all {
|
||||
if d != nil && d.Kind == spider91.Kind {
|
||||
if d != nil && d.Kind == scriptcrawler.Kind {
|
||||
out = append(out, d.ID)
|
||||
}
|
||||
}
|
||||
@@ -2449,8 +2484,8 @@ func shouldScanDrive(d drives.Drive) bool {
|
||||
if d == nil || d.ID() == localupload.DriveID {
|
||||
return false
|
||||
}
|
||||
// spider91 由专用的 crawlerLoop 触发,不参与 scanLoop
|
||||
if d.Kind() == spider91.Kind {
|
||||
// 爬虫类 drive 由专用 crawl 阶段触发,不参与普通 scan
|
||||
if d.Kind() == spider91.Kind || d.Kind() == scriptcrawler.Kind {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
@@ -2481,65 +2516,96 @@ func (a *App) scheduleSpider91Crawl(ctx context.Context, driveID string) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (a *App) scheduleScriptCrawlerCrawl(ctx context.Context, driveID string) bool {
|
||||
if a.driveHasActiveWork(driveID) {
|
||||
log.Printf("[scriptcrawler] drive=%s has active work, skip duplicate crawl request", driveID)
|
||||
return false
|
||||
}
|
||||
if !a.beginDriveScanOrCrawl(driveID) {
|
||||
log.Printf("[scriptcrawler] drive=%s already queued or running, skip duplicate crawl request", driveID)
|
||||
return false
|
||||
}
|
||||
taskCtx, done := a.registerDriveTaskContext(ctx, driveID)
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
a.endDriveScanOrCrawl(driveID)
|
||||
done()
|
||||
}()
|
||||
a.runScriptCrawlerCrawlWithTaskContext(taskCtx, driveID)
|
||||
}()
|
||||
return true
|
||||
}
|
||||
|
||||
// runSpider91Crawl runs one complete crawl and writes last_crawl_at back to
// drive.credentials.
//
// last_crawl_at is updated even when the crawl fails, so a failing drive is
// not re-triggered over and over in an error loop; the next nightly pipeline
// run still retries it. The method blocks: nightly Phase 2 calls it serially,
// and the admin "crawl now" action calls it asynchronously for a single drive.
//
// The body simply delegates to runScriptCrawlerCrawl.
func (a *App) runSpider91Crawl(ctx context.Context, driveID string) {
	a.runScriptCrawlerCrawl(ctx, driveID)
}
|
||||
|
||||
func (a *App) runScriptCrawlerCrawl(ctx context.Context, driveID string) {
|
||||
if !a.beginDriveScanOrCrawl(driveID) {
|
||||
log.Printf("[spider91] drive=%s already queued or running, skip direct crawl", driveID)
|
||||
log.Printf("[scriptcrawler] drive=%s already queued or running, skip direct crawl", driveID)
|
||||
return
|
||||
}
|
||||
defer a.endDriveScanOrCrawl(driveID)
|
||||
taskCtx, done := a.registerDriveTaskContext(ctx, driveID)
|
||||
defer done()
|
||||
a.runSpider91CrawlWithTaskContext(taskCtx, driveID)
|
||||
a.runScriptCrawlerCrawlWithTaskContext(taskCtx, driveID)
|
||||
}
|
||||
|
||||
// runSpider91CrawlWithTaskContext delegates to
// runScriptCrawlerCrawlWithTaskContext and returns its result unchanged.
func (a *App) runSpider91CrawlWithTaskContext(ctx context.Context, driveID string) bool {
	return a.runScriptCrawlerCrawlWithTaskContext(ctx, driveID)
}
|
||||
|
||||
func (a *App) runScriptCrawlerCrawlWithTaskContext(ctx context.Context, driveID string) bool {
|
||||
if err := ctx.Err(); err != nil {
|
||||
log.Printf("[spider91] drive=%s crawl canceled before start: %v", driveID, err)
|
||||
log.Printf("[scriptcrawler] drive=%s crawl canceled before start: %v", driveID, err)
|
||||
return false
|
||||
}
|
||||
a.mu.Lock()
|
||||
c := a.spider91Crawlers[driveID]
|
||||
c := a.scriptCrawlers[driveID]
|
||||
a.mu.Unlock()
|
||||
if c == nil {
|
||||
if err := a.ensureDriveAttached(ctx, driveID); err != nil {
|
||||
log.Printf("[spider91] drive=%s attach failed: %v", driveID, err)
|
||||
log.Printf("[scriptcrawler] drive=%s attach failed: %v", driveID, err)
|
||||
return false
|
||||
}
|
||||
a.mu.Lock()
|
||||
c = a.spider91Crawlers[driveID]
|
||||
c = a.scriptCrawlers[driveID]
|
||||
a.mu.Unlock()
|
||||
if c == nil {
|
||||
log.Printf("[spider91] drive=%s crawler not attached", driveID)
|
||||
log.Printf("[scriptcrawler] drive=%s crawler not attached", driveID)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
d, err := a.cat.GetDrive(ctx, driveID)
|
||||
if err != nil || d == nil {
|
||||
log.Printf("[spider91] drive=%s lookup failed: %v", driveID, err)
|
||||
log.Printf("[scriptcrawler] drive=%s lookup failed: %v", driveID, err)
|
||||
return false
|
||||
}
|
||||
targetNew := spider91IntCred(d, "target_new", spider91.DefaultTargetNew)
|
||||
defaultTargetNew := scriptcrawler.DefaultTargetNew
|
||||
if scriptCrawlerSourceKindForDrive(d) == spider91.Kind {
|
||||
defaultTargetNew = spider91.DefaultTargetNew
|
||||
}
|
||||
targetNew := spider91IntCred(d, "target_new", defaultTargetNew)
|
||||
if targetNew <= 0 {
|
||||
targetNew = spider91.DefaultTargetNew
|
||||
targetNew = defaultTargetNew
|
||||
}
|
||||
|
||||
log.Printf("[spider91] drive=%s start crawl target_new=%d", driveID, targetNew)
|
||||
log.Printf("[scriptcrawler] drive=%s start crawl target_new=%d", driveID, targetNew)
|
||||
res, runErr := c.RunOnce(ctx, targetNew)
|
||||
if runErr != nil {
|
||||
log.Printf("[spider91] drive=%s crawl failed: %v", driveID, runErr)
|
||||
log.Printf("[scriptcrawler] drive=%s crawl failed: %v", driveID, runErr)
|
||||
} else if res != nil {
|
||||
log.Printf("[spider91] drive=%s crawl done target=%d total=%d new=%d skipped=%d failed=%d seen_snapshot=%d",
|
||||
log.Printf("[scriptcrawler] drive=%s crawl done target=%d total=%d new=%d skipped=%d failed=%d seen_snapshot=%d",
|
||||
driveID, res.TargetNew, res.TotalEntries, res.NewVideos, res.Skipped, res.Failed, res.SeenSnapshot)
|
||||
}
|
||||
|
||||
// 标记最后一次爬取时间。这字段已不再用于调度判定(nightly 流水线统一调度),
|
||||
// 留着仅作为 admin UI 显示"上次抓取 N 小时前"用。
|
||||
if d.Credentials == nil {
|
||||
d.Credentials = make(map[string]string)
|
||||
}
|
||||
@@ -2552,18 +2618,13 @@ func (a *App) runSpider91CrawlWithTaskContext(ctx context.Context, driveID strin
|
||||
d.LastError = ""
|
||||
}
|
||||
if err := a.cat.UpsertDrive(ctx, d); err != nil {
|
||||
log.Printf("[spider91] drive=%s update last_crawl_at: %v", driveID, err)
|
||||
log.Printf("[scriptcrawler] drive=%s update last_crawl_at: %v", driveID, err)
|
||||
}
|
||||
if err := ctx.Err(); err != nil {
|
||||
log.Printf("[spider91] drive=%s crawl canceled after run: %v", driveID, err)
|
||||
log.Printf("[scriptcrawler] drive=%s crawl canceled after run: %v", driveID, err)
|
||||
return false
|
||||
}
|
||||
|
||||
// 爬取全部完成后,统一把所有还 pending 的预览视频入队。
|
||||
// 这是新流水线设计:crawler 自身不再每条入库就立即触发预览视频生成,
|
||||
// 让"下载阶段"和"预览视频阶段"在时间上分清楚(也跟 nightly Phase 2
|
||||
// 的"等预览视频队列 idle"语义对齐)。enqueueDriveGeneration 内部会读
|
||||
// 该 drive 当前的 teaser_enabled,关闭时是 noop。
|
||||
a.mu.Lock()
|
||||
worker := a.workers[driveID]
|
||||
thumbWorker := a.thumbWorkers[driveID]
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -13,6 +14,7 @@ import (
|
||||
"github.com/video-site/backend/internal/catalog"
|
||||
"github.com/video-site/backend/internal/config"
|
||||
"github.com/video-site/backend/internal/drives"
|
||||
"github.com/video-site/backend/internal/drives/scriptcrawler"
|
||||
"github.com/video-site/backend/internal/drives/spider91"
|
||||
"github.com/video-site/backend/internal/fingerprint"
|
||||
"github.com/video-site/backend/internal/preview"
|
||||
@@ -606,7 +608,8 @@ func TestNightlyTargetsComeFromCatalogBeforeDriveAttach(t *testing.T) {
|
||||
for _, d := range []*catalog.Drive{
|
||||
{ID: "115", Kind: "p115", Name: "115", RootID: "0", TeaserEnabled: true},
|
||||
{ID: "pikpak", Kind: "pikpak", Name: "PikPak", RootID: "0", TeaserEnabled: true},
|
||||
{ID: "91-spider", Kind: "spider91", Name: "91 Spider", RootID: "0", TeaserEnabled: true},
|
||||
{ID: "91-legacy", Kind: "spider91", Name: "91 Legacy", RootID: "0", TeaserEnabled: true},
|
||||
{ID: "91-crawler", Kind: scriptcrawler.Kind, Name: "91 Spider", RootID: "/", TeaserEnabled: true},
|
||||
} {
|
||||
if err := cat.UpsertDrive(ctx, d); err != nil {
|
||||
t.Fatalf("seed drive %s: %v", d.ID, err)
|
||||
@@ -619,8 +622,47 @@ func TestNightlyTargetsComeFromCatalogBeforeDriveAttach(t *testing.T) {
|
||||
t.Fatalf("scan target ids = %#v, want 115 and pikpak from catalog", scanIDs)
|
||||
}
|
||||
spiderIDs := app.listSpider91DriveIDs(ctx)
|
||||
if len(spiderIDs) != 1 || spiderIDs[0] != "91-spider" {
|
||||
t.Fatalf("spider91 ids = %#v, want catalog spider drive", spiderIDs)
|
||||
if len(spiderIDs) != 1 || spiderIDs[0] != "91-crawler" {
|
||||
t.Fatalf("spider91 ids = %#v, want crawler-page script drive", spiderIDs)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAttachDriveRejectsLegacySpider91Storage(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
if err := cat.Close(); err != nil {
|
||||
t.Fatalf("close catalog: %v", err)
|
||||
}
|
||||
})
|
||||
d := &catalog.Drive{
|
||||
ID: "91-legacy",
|
||||
Kind: spider91.Kind,
|
||||
Name: "91 Legacy",
|
||||
RootID: "/",
|
||||
TeaserEnabled: true,
|
||||
}
|
||||
if err := cat.UpsertDrive(ctx, d); err != nil {
|
||||
t.Fatalf("seed drive: %v", err)
|
||||
}
|
||||
|
||||
app := &App{cat: cat, registry: proxy.NewRegistry()}
|
||||
err = app.attachDrive(ctx, d)
|
||||
if err == nil || !strings.Contains(err.Error(), "爬虫管理") {
|
||||
t.Fatalf("attach err = %v, want crawler management guidance", err)
|
||||
}
|
||||
if _, ok := app.registry.Get(d.ID); ok {
|
||||
t.Fatal("legacy spider91 drive should not be registered")
|
||||
}
|
||||
got, err := cat.GetDrive(ctx, d.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("get drive: %v", err)
|
||||
}
|
||||
if got.Status != "error" || !strings.Contains(got.LastError, "爬虫管理") {
|
||||
t.Fatalf("status/error = %q/%q, want deprecated error", got.Status, got.LastError)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1033,7 +1075,6 @@ func TestCleanupDriveVideosForDeleteRemovesRowsAndGeneratedAssetsOnly(t *testing
|
||||
workers: make(map[string]*preview.Worker),
|
||||
thumbWorkers: make(map[string]*preview.ThumbWorker),
|
||||
fingerprintWorkers: make(map[string]*fingerprint.Worker),
|
||||
spider91Crawlers: make(map[string]*spider91.Crawler),
|
||||
}
|
||||
removed, err := app.cleanupDriveVideosForDelete(ctx, "local-main")
|
||||
if err != nil {
|
||||
@@ -1313,7 +1354,6 @@ func TestCleanupDriveVideosForDeleteSpider91RemovesCrawledDirAndOriginRecords(t
|
||||
workers: make(map[string]*preview.Worker),
|
||||
thumbWorkers: make(map[string]*preview.ThumbWorker),
|
||||
fingerprintWorkers: make(map[string]*fingerprint.Worker),
|
||||
spider91Crawlers: make(map[string]*spider91.Crawler),
|
||||
}
|
||||
removed, err := app.cleanupDriveVideosForDelete(ctx, driveID)
|
||||
if err != nil {
|
||||
|
||||
@@ -10,6 +10,8 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -19,6 +21,7 @@ import (
|
||||
"github.com/video-site/backend/internal/auth"
|
||||
"github.com/video-site/backend/internal/catalog"
|
||||
"github.com/video-site/backend/internal/drives/p123"
|
||||
"github.com/video-site/backend/internal/drives/scriptcrawler"
|
||||
)
|
||||
|
||||
type AdminServer struct {
|
||||
@@ -65,6 +68,9 @@ type AdminServer struct {
|
||||
// Spider91 → 115/123/PikPak/OneDrive 上传目标 drive ID 读写
|
||||
GetSpider91UploadDriveID func() string
|
||||
SetSpider91UploadDriveID func(driveID string) error
|
||||
// DefaultSpider91ScriptPath returns the built-in Spider91 crawler script
|
||||
// path for the independent crawler management UI.
|
||||
DefaultSpider91ScriptPath func() string
|
||||
// OnRunNightlyJob 触发一次完整的凌晨流水线(Phase1 扫盘 + Phase2 91 爬虫 +
|
||||
// Phase3 迁移)。立即返回 —— 实际任务在后台跑,admin 在日志或下次状态查询里
|
||||
// 看进度。若流水线正在跑或已排队,Runner 会拒绝重复触发。
|
||||
@@ -116,6 +122,8 @@ type NightlyJobStatus struct {
|
||||
LastFinishedAt string `json:"lastFinishedAt,omitempty"`
|
||||
}
|
||||
|
||||
// maxCrawlerScriptBytes caps the size of an imported crawler script at 2 MiB.
// It is passed as the byte limit to saveCrawlerScript by both import handlers
// and is compared against the Content-Length of URL downloads.
const maxCrawlerScriptBytes = 2 * 1024 * 1024
|
||||
|
||||
type DeleteVideoResult struct {
|
||||
OK bool `json:"ok"`
|
||||
DeletedSource bool `json:"deletedSource"`
|
||||
@@ -150,6 +158,15 @@ func (a *AdminServer) Register(r chi.Router) {
|
||||
r.Post("/drives/{id}/thumbnails/failed/regenerate", a.handleRegenFailedThumbnails)
|
||||
r.Post("/drives/{id}/fingerprints/failed/regenerate", a.handleRegenFailedFingerprints)
|
||||
|
||||
// 爬虫
|
||||
r.Get("/crawlers", a.handleListCrawlers)
|
||||
r.Post("/crawlers", a.handleUpsertCrawler)
|
||||
r.Post("/crawlers/import-file", a.handleImportCrawlerScriptFile)
|
||||
r.Post("/crawlers/import-url", a.handleImportCrawlerScriptURL)
|
||||
r.Delete("/crawlers/{id}", a.handleDeleteCrawler)
|
||||
r.Post("/crawlers/{id}/run", a.handleRunCrawler)
|
||||
r.Post("/crawlers/{id}/tasks/stop", a.handleStopCrawlerTasks)
|
||||
|
||||
// 视频
|
||||
r.Get("/videos", a.handleAdminListVideos)
|
||||
r.Put("/videos/{id}", a.handleUpdateVideo)
|
||||
@@ -424,6 +441,11 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) {
|
||||
// LastCrawlAt 是 spider91 上次成功爬取的 unix 秒(来自 credentials.last_crawl_at)。
|
||||
// 其它 kind 留 0;前端用它显示"上次抓取: N 小时前"。
|
||||
Spider91Proxy string `json:"spider91Proxy,omitempty"`
|
||||
ScriptCrawlerPythonPath string `json:"scriptCrawlerPythonPath,omitempty"`
|
||||
ScriptCrawlerScriptPath string `json:"scriptCrawlerScriptPath,omitempty"`
|
||||
ScriptCrawlerProxy string `json:"scriptCrawlerProxy,omitempty"`
|
||||
ScriptCrawlerTargetNew string `json:"scriptCrawlerTargetNew,omitempty"`
|
||||
ScriptCrawlerConfigJSON string `json:"scriptCrawlerConfigJson,omitempty"`
|
||||
LastCrawlAt int64 `json:"lastCrawlAt,omitempty"`
|
||||
GoogleDriveUseOnlineAPI *bool `json:"googleDriveUseOnlineAPI,omitempty"`
|
||||
ScanGenerationStatus GenerationStatus `json:"scanGenerationStatus"`
|
||||
@@ -443,6 +465,9 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
list := make([]out, 0, len(drives))
|
||||
for _, d := range drives {
|
||||
if isCrawlerDriveKind(d.Kind) {
|
||||
continue
|
||||
}
|
||||
counts := teaserCounts[d.ID]
|
||||
thumbCounts := thumbnailCounts[d.ID]
|
||||
fingerprintCount := fingerprintCounts[d.ID]
|
||||
@@ -488,6 +513,11 @@ func (a *AdminServer) handleListDrives(w http.ResponseWriter, r *http.Request) {
|
||||
TeaserEnabled: d.TeaserEnabled,
|
||||
SkipDirIDs: append([]string{}, d.SkipDirIDs...),
|
||||
Spider91Proxy: spider91ProxyForDrive(d),
|
||||
ScriptCrawlerPythonPath: scriptCrawlerCred(d, "python_path"),
|
||||
ScriptCrawlerScriptPath: scriptCrawlerCred(d, "script_path"),
|
||||
ScriptCrawlerProxy: scriptCrawlerCred(d, "proxy"),
|
||||
ScriptCrawlerTargetNew: scriptCrawlerCred(d, "target_new"),
|
||||
ScriptCrawlerConfigJSON: scriptCrawlerCred(d, "config_json"),
|
||||
LastCrawlAt: lastCrawlAt,
|
||||
GoogleDriveUseOnlineAPI: googleDriveUseOnlineAPIForDrive(d),
|
||||
ScanGenerationStatus: generation.Scan,
|
||||
@@ -543,7 +573,10 @@ func (a *AdminServer) handleUpsertDrive(w http.ResponseWriter, r *http.Request)
|
||||
existing = existingDrive
|
||||
}
|
||||
if body.Kind == "spider91" {
|
||||
credentials, err := mergeSpider91Credentials(existing, body.Credentials)
|
||||
http.Error(w, "91Spider 已不再支持通过网盘添加,请在爬虫管理页面添加爬虫脚本", http.StatusBadRequest)
|
||||
return
|
||||
} else if body.Kind == scriptcrawler.Kind {
|
||||
credentials, err := mergeScriptCrawlerCredentials(existing, body.Credentials)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
@@ -600,6 +633,421 @@ func (a *AdminServer) handleUpsertDrive(w http.ResponseWriter, r *http.Request)
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true})
|
||||
}
|
||||
|
||||
// crawlerDTO is the JSON object returned for each crawler by
// handleListCrawlers. It is built from a catalog.Drive row plus per-drive
// asset counts and generation statuses in crawlerDTOForDrive.
type crawlerDTO struct {
	// Identity and state copied from the drive row.
	ID        string `json:"id"`
	Name      string `json:"name"`
	Kind      string `json:"kind"`
	Builtin   string `json:"builtin,omitempty"` // credentials["builtin"], trimmed
	Status    string `json:"status"`
	LastError string `json:"lastError,omitempty"`

	// Script settings, each read (trimmed) from the drive credentials map.
	ScriptPath string `json:"scriptPath"`
	PythonPath string `json:"pythonPath,omitempty"`
	Proxy      string `json:"proxy,omitempty"`
	TargetNew  string `json:"targetNew,omitempty"`
	ConfigJSON string `json:"configJson,omitempty"`

	// LastCrawlAt is credentials["last_crawl_at"] parsed as a base-10 int64;
	// it stays 0 when the value is missing or not a number.
	// NOTE(review): presumably unix seconds — confirm against the writer.
	LastCrawlAt int64 `json:"lastCrawlAt,omitempty"`

	// Generation statuses; an empty State is reported as "idle".
	ScanGenerationStatus        GenerationStatus `json:"scanGenerationStatus"`
	ThumbnailGenerationStatus   GenerationStatus `json:"thumbnailGenerationStatus"`
	PreviewGenerationStatus     GenerationStatus `json:"previewGenerationStatus"`
	FingerprintGenerationStatus GenerationStatus `json:"fingerprintGenerationStatus"`

	// Per-drive asset counters taken from the catalog count queries.
	ThumbnailReadyCount     int `json:"thumbnailReadyCount"`
	ThumbnailPendingCount   int `json:"thumbnailPendingCount"`
	ThumbnailFailedCount    int `json:"thumbnailFailedCount"`
	TeaserReadyCount        int `json:"teaserReadyCount"`
	TeaserPendingCount      int `json:"teaserPendingCount"`
	TeaserFailedCount       int `json:"teaserFailedCount"`
	FingerprintReadyCount   int `json:"fingerprintReadyCount"`
	FingerprintPendingCount int `json:"fingerprintPendingCount"`
	FingerprintFailedCount  int `json:"fingerprintFailedCount"`
}
|
||||
|
||||
// upsertCrawlerReq is the JSON request body decoded by handleUpsertCrawler.
// ID and Name are required; the remaining fields are written into the drive's
// credentials map under the keys builtin, script_path, python_path, proxy,
// target_new and config_json.
type upsertCrawlerReq struct {
	ID         string `json:"id"`
	Name       string `json:"name"`
	Builtin    string `json:"builtin"`    // "spider91" enables the default script path fallback
	ScriptPath string `json:"scriptPath"` // path of the crawler script
	PythonPath string `json:"pythonPath"`
	Proxy      string `json:"proxy"`
	TargetNew  string `json:"targetNew"`  // must parse as a positive integer when set
	ConfigJSON string `json:"configJson"` // must be valid JSON when set
}
|
||||
|
||||
func (a *AdminServer) handleListCrawlers(w http.ResponseWriter, r *http.Request) {
|
||||
all, err := a.Catalog.ListDrives(r.Context())
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err)
|
||||
return
|
||||
}
|
||||
teaserCounts, err := a.Catalog.CountTeasersByDrive(r.Context())
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err)
|
||||
return
|
||||
}
|
||||
thumbnailCounts, err := a.Catalog.CountThumbnailsByDrive(r.Context())
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err)
|
||||
return
|
||||
}
|
||||
fingerprintCounts, err := a.Catalog.CountFingerprintsByDrive(r.Context())
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err)
|
||||
return
|
||||
}
|
||||
generationStatuses := map[string]DriveGenerationStatuses{}
|
||||
if a.GetDriveGenerationStatuses != nil {
|
||||
generationStatuses = a.GetDriveGenerationStatuses()
|
||||
}
|
||||
|
||||
out := []crawlerDTO{}
|
||||
for _, d := range all {
|
||||
if d == nil || !isCrawlerDriveKind(d.Kind) {
|
||||
continue
|
||||
}
|
||||
out = append(out, a.crawlerDTOForDrive(d, teaserCounts[d.ID], thumbnailCounts[d.ID], fingerprintCounts[d.ID], generationStatuses[d.ID]))
|
||||
}
|
||||
writeJSON(w, http.StatusOK, out)
|
||||
}
|
||||
|
||||
func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, teaser catalog.DriveTeaserCounts, thumb catalog.DriveThumbnailCounts, fp catalog.DriveFingerprintCounts, generation DriveGenerationStatuses) crawlerDTO {
|
||||
if generation.Scan.State == "" {
|
||||
generation.Scan.State = "idle"
|
||||
}
|
||||
if generation.Thumbnail.State == "" {
|
||||
generation.Thumbnail.State = "idle"
|
||||
}
|
||||
if generation.Preview.State == "" {
|
||||
generation.Preview.State = "idle"
|
||||
}
|
||||
if generation.Fingerprint.State == "" {
|
||||
generation.Fingerprint.State = "idle"
|
||||
}
|
||||
lastCrawlAt := int64(0)
|
||||
if raw := strings.TrimSpace(d.Credentials["last_crawl_at"]); raw != "" {
|
||||
if v, err := strconv.ParseInt(raw, 10, 64); err == nil {
|
||||
lastCrawlAt = v
|
||||
}
|
||||
}
|
||||
return crawlerDTO{
|
||||
ID: d.ID,
|
||||
Name: d.Name,
|
||||
Kind: d.Kind,
|
||||
Builtin: crawlerBuiltinForDrive(d),
|
||||
Status: d.Status,
|
||||
LastError: d.LastError,
|
||||
ScriptPath: strings.TrimSpace(d.Credentials["script_path"]),
|
||||
PythonPath: strings.TrimSpace(d.Credentials["python_path"]),
|
||||
Proxy: strings.TrimSpace(d.Credentials["proxy"]),
|
||||
TargetNew: strings.TrimSpace(d.Credentials["target_new"]),
|
||||
ConfigJSON: strings.TrimSpace(d.Credentials["config_json"]),
|
||||
LastCrawlAt: lastCrawlAt,
|
||||
ScanGenerationStatus: generation.Scan,
|
||||
ThumbnailGenerationStatus: generation.Thumbnail,
|
||||
PreviewGenerationStatus: generation.Preview,
|
||||
FingerprintGenerationStatus: generation.Fingerprint,
|
||||
ThumbnailReadyCount: thumb.Ready,
|
||||
ThumbnailPendingCount: thumb.Pending,
|
||||
ThumbnailFailedCount: thumb.Failed,
|
||||
TeaserReadyCount: teaser.Ready,
|
||||
TeaserPendingCount: teaser.Pending,
|
||||
TeaserFailedCount: teaser.Failed,
|
||||
FingerprintReadyCount: fp.Ready,
|
||||
FingerprintPendingCount: fp.Pending,
|
||||
FingerprintFailedCount: fp.Failed,
|
||||
}
|
||||
}
|
||||
|
||||
func crawlerBuiltinForDrive(d *catalog.Drive) string {
|
||||
if d == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(d.Credentials["builtin"])
|
||||
}
|
||||
|
||||
func (a *AdminServer) handleUpsertCrawler(w http.ResponseWriter, r *http.Request) {
|
||||
var body upsertCrawlerReq
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
writeErr(w, http.StatusBadRequest, err)
|
||||
return
|
||||
}
|
||||
id := strings.TrimSpace(body.ID)
|
||||
name := strings.TrimSpace(body.Name)
|
||||
if id == "" || name == "" {
|
||||
http.Error(w, "id and name are required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
existing, _ := a.Catalog.GetDrive(r.Context(), id)
|
||||
creds := map[string]string{}
|
||||
if existing != nil {
|
||||
for k, v := range existing.Credentials {
|
||||
creds[k] = v
|
||||
}
|
||||
}
|
||||
builtin := strings.TrimSpace(body.Builtin)
|
||||
if builtin != "" {
|
||||
creds["builtin"] = builtin
|
||||
}
|
||||
scriptPath := strings.TrimSpace(body.ScriptPath)
|
||||
if scriptPath == "" && builtin == "spider91" && a.DefaultSpider91ScriptPath != nil {
|
||||
scriptPath = strings.TrimSpace(a.DefaultSpider91ScriptPath())
|
||||
}
|
||||
incoming := map[string]string{
|
||||
"script_path": scriptPath,
|
||||
"python_path": strings.TrimSpace(body.PythonPath),
|
||||
"proxy": strings.TrimSpace(body.Proxy),
|
||||
"target_new": strings.TrimSpace(body.TargetNew),
|
||||
"config_json": strings.TrimSpace(body.ConfigJSON),
|
||||
}
|
||||
for k, v := range incoming {
|
||||
creds[k] = v
|
||||
}
|
||||
merged, err := mergeScriptCrawlerCredentials(existing, creds)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if builtin != "" {
|
||||
merged["builtin"] = builtin
|
||||
}
|
||||
d := &catalog.Drive{
|
||||
ID: id,
|
||||
Kind: scriptcrawler.Kind,
|
||||
Name: name,
|
||||
RootID: "/",
|
||||
Credentials: merged,
|
||||
Status: "disconnected",
|
||||
TeaserEnabled: true,
|
||||
}
|
||||
if existing != nil {
|
||||
d.TeaserEnabled = existing.TeaserEnabled
|
||||
}
|
||||
if err := a.Catalog.UpsertDrive(r.Context(), d); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err)
|
||||
return
|
||||
}
|
||||
if a.OnDriveSaved != nil {
|
||||
if err := a.OnDriveSaved(id); err != nil {
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "warning": err.Error()})
|
||||
return
|
||||
}
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true})
|
||||
}
|
||||
|
||||
// importCrawlerScriptURLReq is the JSON request body decoded by
// handleImportCrawlerScriptURL.
type importCrawlerScriptURLReq struct {
	// URL is the http:// or https:// address to download the script from.
	URL string `json:"url"`
	// FileName optionally overrides the file name; when empty the last path
	// segment of URL is used instead.
	FileName string `json:"fileName"`
}
|
||||
|
||||
func (a *AdminServer) handleImportCrawlerScriptFile(w http.ResponseWriter, r *http.Request) {
|
||||
r.Body = http.MaxBytesReader(w, r.Body, maxCrawlerScriptBytes+1024*1024)
|
||||
if err := r.ParseMultipartForm(maxCrawlerScriptBytes + 1024*1024); err != nil {
|
||||
writeErr(w, http.StatusBadRequest, err)
|
||||
return
|
||||
}
|
||||
file, header, err := r.FormFile("file")
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusBadRequest, errors.New("file is required"))
|
||||
return
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
name := "crawler.py"
|
||||
if header != nil && strings.TrimSpace(header.Filename) != "" {
|
||||
name = header.Filename
|
||||
}
|
||||
scriptPath, err := a.saveCrawlerScript(r.Context(), name, file, maxCrawlerScriptBytes)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusBadRequest, err)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath})
|
||||
}
|
||||
|
||||
func (a *AdminServer) handleImportCrawlerScriptURL(w http.ResponseWriter, r *http.Request) {
|
||||
var body importCrawlerScriptURLReq
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
writeErr(w, http.StatusBadRequest, err)
|
||||
return
|
||||
}
|
||||
rawURL := strings.TrimSpace(body.URL)
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil || u.Scheme == "" || u.Host == "" {
|
||||
writeErr(w, http.StatusBadRequest, errors.New("脚本链接格式无效"))
|
||||
return
|
||||
}
|
||||
if u.Scheme != "http" && u.Scheme != "https" {
|
||||
writeErr(w, http.StatusBadRequest, errors.New("脚本链接仅支持 http:// 或 https://"))
|
||||
return
|
||||
}
|
||||
|
||||
client := a.HTTPClient
|
||||
if client == nil {
|
||||
client = &http.Client{Timeout: 30 * time.Second}
|
||||
}
|
||||
req, err := http.NewRequestWithContext(r.Context(), http.MethodGet, u.String(), nil)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusBadRequest, err)
|
||||
return
|
||||
}
|
||||
req.Header.Set("User-Agent", "video-site-crawler-import/1.0")
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusBadGateway, err)
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
writeErr(w, http.StatusBadGateway, fmt.Errorf("下载脚本失败: HTTP %d", resp.StatusCode))
|
||||
return
|
||||
}
|
||||
if resp.ContentLength > maxCrawlerScriptBytes {
|
||||
writeErr(w, http.StatusBadRequest, fmt.Errorf("脚本文件不能超过 %d KiB", maxCrawlerScriptBytes/1024))
|
||||
return
|
||||
}
|
||||
|
||||
name := strings.TrimSpace(body.FileName)
|
||||
if name == "" {
|
||||
name = path.Base(u.Path)
|
||||
}
|
||||
if name == "." || name == "/" || name == "" {
|
||||
name = "crawler.py"
|
||||
}
|
||||
scriptPath, err := a.saveCrawlerScript(r.Context(), name, resp.Body, maxCrawlerScriptBytes)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusBadRequest, err)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"scriptPath": scriptPath})
|
||||
}
|
||||
|
||||
func (a *AdminServer) saveCrawlerScript(ctx context.Context, name string, r io.Reader, maxBytes int64) (string, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
fileName, err := safeCrawlerScriptFileName(name)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
root, err := a.crawlerScriptImportDir()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := os.MkdirAll(root, 0o755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
dst := filepath.Join(root, time.Now().UTC().Format("20060102T150405.000000000Z")+"-"+fileName)
|
||||
dstAbs, err := filepath.Abs(dst)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
rootAbs, err := filepath.Abs(root)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if dstAbs != rootAbs && !strings.HasPrefix(dstAbs, rootAbs+string(os.PathSeparator)) {
|
||||
return "", errors.New("invalid crawler script path")
|
||||
}
|
||||
|
||||
tmp := dstAbs + ".part"
|
||||
out, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o644)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
limited := io.LimitReader(r, maxBytes+1)
|
||||
written, copyErr := io.Copy(out, limited)
|
||||
closeErr := out.Close()
|
||||
if copyErr != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return "", copyErr
|
||||
}
|
||||
if closeErr != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return "", closeErr
|
||||
}
|
||||
if written <= 0 {
|
||||
_ = os.Remove(tmp)
|
||||
return "", errors.New("脚本文件为空")
|
||||
}
|
||||
if written > maxBytes {
|
||||
_ = os.Remove(tmp)
|
||||
return "", fmt.Errorf("脚本文件不能超过 %d KiB", maxBytes/1024)
|
||||
}
|
||||
if err := os.Rename(tmp, dstAbs); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return "", err
|
||||
}
|
||||
return dstAbs, nil
|
||||
}
|
||||
|
||||
func (a *AdminServer) crawlerScriptImportDir() (string, error) {
|
||||
base := strings.TrimSpace(a.LocalPreviewDir)
|
||||
if base == "" {
|
||||
base = filepath.Join(".", "data", "previews")
|
||||
}
|
||||
root := filepath.Join(filepath.Dir(base), "crawler-scripts")
|
||||
return filepath.Abs(root)
|
||||
}
|
||||
|
||||
// safeCrawlerScriptFileName turns a caller-supplied file name into a safe
// "<stem>.py" name.
//
// Only the base name of raw is considered; a blank name becomes "crawler.py".
// The extension must be ".py" (case-insensitive), otherwise an error is
// returned. In the stem, every character outside [A-Za-z0-9._-] is replaced by
// '_', leading and trailing '.', '_' and '-' are stripped, and an empty result
// falls back to "crawler". The returned extension is always lower-case ".py".
func safeCrawlerScriptFileName(raw string) (string, error) {
	base := strings.TrimSpace(filepath.Base(raw))
	switch base {
	case "", ".", string(os.PathSeparator):
		base = "crawler.py"
	}

	ext := filepath.Ext(base)
	if strings.ToLower(ext) != ".py" {
		return "", errors.New("目前只支持导入 .py 爬虫脚本")
	}

	// Map every disallowed rune to a single underscore.
	sanitize := func(c rune) rune {
		switch {
		case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z', c >= '0' && c <= '9':
			return c
		case c == '-', c == '_', c == '.':
			return c
		}
		return '_'
	}
	stem := strings.Map(sanitize, strings.TrimSuffix(base, ext))
	stem = strings.Trim(stem, "._-")
	if stem == "" {
		stem = "crawler"
	}
	return stem + ".py", nil
}
|
||||
|
||||
func (a *AdminServer) handleRunCrawler(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
status := a.nightlyJobStatus()
|
||||
if status.Running || status.Queued {
|
||||
writeJSON(w, http.StatusAccepted, map[string]any{
|
||||
"ok": true,
|
||||
"accepted": false,
|
||||
"message": fullScanBusyMessage,
|
||||
"status": status,
|
||||
})
|
||||
return
|
||||
}
|
||||
accepted := true
|
||||
if a.OnScanRequested != nil {
|
||||
accepted = a.OnScanRequested(id)
|
||||
}
|
||||
resp := map[string]any{"ok": true, "accepted": accepted}
|
||||
if !accepted {
|
||||
resp["message"] = driveTaskBusyMessage
|
||||
}
|
||||
writeJSON(w, http.StatusAccepted, resp)
|
||||
}
|
||||
|
||||
// handleStopCrawlerTasks serves POST /crawlers/{id}/tasks/stop by delegating
// unchanged to handleStopDriveTasks.
func (a *AdminServer) handleStopCrawlerTasks(w http.ResponseWriter, r *http.Request) {
	a.handleStopDriveTasks(w, r)
}
|
||||
|
||||
// handleDeleteCrawler serves DELETE /crawlers/{id} by delegating unchanged to
// handleDeleteDrive.
func (a *AdminServer) handleDeleteCrawler(w http.ResponseWriter, r *http.Request) {
	a.handleDeleteDrive(w, r)
}
|
||||
|
||||
// isCrawlerDriveKind reports whether kind is scriptcrawler.Kind, the only
// drive kind treated as a crawler by the crawler endpoints.
func isCrawlerDriveKind(kind string) bool {
	return kind == scriptcrawler.Kind
}
|
||||
|
||||
func spider91ProxyForDrive(d *catalog.Drive) string {
|
||||
if d == nil || d.Kind != "spider91" || d.Credentials == nil {
|
||||
return ""
|
||||
@@ -607,6 +1055,13 @@ func spider91ProxyForDrive(d *catalog.Drive) string {
|
||||
return strings.TrimSpace(d.Credentials["proxy"])
|
||||
}
|
||||
|
||||
func scriptCrawlerCred(d *catalog.Drive, key string) string {
|
||||
if d == nil || d.Kind != scriptcrawler.Kind || d.Credentials == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(d.Credentials[key])
|
||||
}
|
||||
|
||||
func googleDriveUseOnlineAPIForDrive(d *catalog.Drive) *bool {
|
||||
if d == nil || d.Kind != "googledrive" {
|
||||
return nil
|
||||
@@ -676,20 +1131,89 @@ func mergeSpider91Credentials(existing *catalog.Drive, incoming map[string]strin
|
||||
return merged, nil
|
||||
}
|
||||
|
||||
func mergeScriptCrawlerCredentials(existing *catalog.Drive, incoming map[string]string) (map[string]string, error) {
|
||||
merged := map[string]string{}
|
||||
if existing != nil {
|
||||
for k, v := range existing.Credentials {
|
||||
merged[k] = v
|
||||
}
|
||||
}
|
||||
for k, v := range incoming {
|
||||
key := strings.TrimSpace(k)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(v)
|
||||
switch key {
|
||||
case "proxy":
|
||||
proxy, err := normalizeCrawlerProxyURL(value, "脚本爬虫")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if proxy == "" {
|
||||
delete(merged, key)
|
||||
} else {
|
||||
merged[key] = proxy
|
||||
}
|
||||
case "target_new":
|
||||
if value == "" {
|
||||
delete(merged, key)
|
||||
continue
|
||||
}
|
||||
n, err := strconv.Atoi(value)
|
||||
if err != nil || n <= 0 {
|
||||
return nil, fmt.Errorf("脚本爬虫 target_new 必须是正整数")
|
||||
}
|
||||
merged[key] = strconv.Itoa(n)
|
||||
case "config_json":
|
||||
if value == "" {
|
||||
delete(merged, key)
|
||||
continue
|
||||
}
|
||||
if !json.Valid([]byte(value)) {
|
||||
return nil, fmt.Errorf("脚本爬虫自定义配置必须是合法 JSON")
|
||||
}
|
||||
merged[key] = value
|
||||
case "python_path", "script_path":
|
||||
if value == "" {
|
||||
if existing == nil || key == "script_path" {
|
||||
delete(merged, key)
|
||||
}
|
||||
continue
|
||||
}
|
||||
merged[key] = value
|
||||
default:
|
||||
if value == "" {
|
||||
delete(merged, key)
|
||||
} else {
|
||||
merged[key] = value
|
||||
}
|
||||
}
|
||||
}
|
||||
if strings.TrimSpace(merged["script_path"]) == "" && !strings.EqualFold(strings.TrimSpace(merged["builtin"]), "spider91") {
|
||||
return nil, fmt.Errorf("脚本爬虫必须填写 script_path")
|
||||
}
|
||||
return merged, nil
|
||||
}
|
||||
|
||||
// normalizeSpider91ProxyURL validates a proxy URL through
// normalizeCrawlerProxyURL, using "91Spider" as the label in error messages.
func normalizeSpider91ProxyURL(raw string) (string, error) {
	return normalizeCrawlerProxyURL(raw, "91Spider")
}
|
||||
|
||||
// normalizeCrawlerProxyURL validates a crawler proxy address.
//
// raw is trimmed first; an empty value means "no proxy" and yields ("", nil).
// Otherwise the value must parse as a URL with both a scheme and a host, and
// the scheme must be http, https, socks5 or socks5h (case-insensitive). On
// success the trimmed input is returned unchanged.
//
// label names the crawler in the error messages, so every caller reports its
// own product name instead of a hard-coded "91Spider".
func normalizeCrawlerProxyURL(raw, label string) (string, error) {
	proxy := strings.TrimSpace(raw)
	if proxy == "" {
		return "", nil
	}
	u, err := url.Parse(proxy)
	if err != nil || u.Scheme == "" || u.Host == "" {
		return "", fmt.Errorf("%s 代理地址格式无效,请填写类似 http://127.0.0.1:7890 的地址", label)
	}
	switch strings.ToLower(u.Scheme) {
	case "http", "https", "socks5", "socks5h":
		return proxy, nil
	default:
		return "", fmt.Errorf("%s 代理地址仅支持 http://、https://、socks5:// 或 socks5h://", label)
	}
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
@@ -672,7 +673,7 @@ func TestHandleUpsertGoogleDriveMergesOAuthCredentials(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleUpsertSpider91ProxyPreservesRuntimeCredentials(t *testing.T) {
|
||||
func TestHandleUpsertSpider91DriveIsRejected(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
|
||||
if err != nil {
|
||||
@@ -708,16 +709,19 @@ func TestHandleUpsertSpider91ProxyPreservesRuntimeCredentials(t *testing.T) {
|
||||
}`))
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{Catalog: cat}).handleUpsertDrive(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
if rr.Code != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, want 400; body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
if !strings.Contains(rr.Body.String(), "爬虫管理") {
|
||||
t.Fatalf("body = %q, want crawler management guidance", rr.Body.String())
|
||||
}
|
||||
|
||||
got, err := cat.GetDrive(ctx, "spider91-main")
|
||||
if err != nil {
|
||||
t.Fatalf("get drive: %v", err)
|
||||
}
|
||||
if got.Credentials["proxy"] != "socks5h://proxy-user:proxy-pass@127.0.0.1:7891" {
|
||||
t.Fatalf("proxy = %q, want trimmed new proxy", got.Credentials["proxy"])
|
||||
if got.Credentials["proxy"] != "http://old-proxy.local:7890" {
|
||||
t.Fatalf("proxy = %q, want unchanged old proxy", got.Credentials["proxy"])
|
||||
}
|
||||
if got.Credentials["last_crawl_at"] != "1800000000" {
|
||||
t.Fatalf("last_crawl_at = %q, want preserved", got.Credentials["last_crawl_at"])
|
||||
@@ -725,59 +729,6 @@ func TestHandleUpsertSpider91ProxyPreservesRuntimeCredentials(t *testing.T) {
|
||||
if got.Credentials["script_path"] == "" {
|
||||
t.Fatalf("script_path should be preserved")
|
||||
}
|
||||
|
||||
req = httptest.NewRequest(http.MethodPost, "/admin/api/drives", strings.NewReader(`{
|
||||
"id": "spider91-main",
|
||||
"kind": "spider91",
|
||||
"name": "91 Spider",
|
||||
"rootId": "/",
|
||||
"credentials": {"proxy": " "}
|
||||
}`))
|
||||
rr = httptest.NewRecorder()
|
||||
(&AdminServer{Catalog: cat}).handleUpsertDrive(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("clear status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
|
||||
got, err = cat.GetDrive(ctx, "spider91-main")
|
||||
if err != nil {
|
||||
t.Fatalf("get cleared drive: %v", err)
|
||||
}
|
||||
if _, ok := got.Credentials["proxy"]; ok {
|
||||
t.Fatalf("proxy should be removed after empty save, got %q", got.Credentials["proxy"])
|
||||
}
|
||||
if got.Credentials["last_crawl_at"] != "1800000000" {
|
||||
t.Fatalf("last_crawl_at after clear = %q, want preserved", got.Credentials["last_crawl_at"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleUpsertSpider91RejectsUnsupportedProxyScheme(t *testing.T) {
|
||||
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
if err := cat.Close(); err != nil {
|
||||
t.Fatalf("close catalog: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/admin/api/drives", strings.NewReader(`{
|
||||
"id": "spider91-main",
|
||||
"kind": "spider91",
|
||||
"name": "91 Spider",
|
||||
"rootId": "/",
|
||||
"credentials": {"proxy": "ftp://127.0.0.1:21"}
|
||||
}`))
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{Catalog: cat}).handleUpsertDrive(rr, req)
|
||||
|
||||
if rr.Code != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, want 400; body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
if !strings.Contains(rr.Body.String(), "socks5:// 或 socks5h://") {
|
||||
t.Fatalf("body = %q, want supported schemes message", rr.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleDeleteDriveRunsRequestedCleanupBeforeDeletingDrive(t *testing.T) {
|
||||
@@ -890,7 +841,7 @@ func TestHandleDeleteDriveRequiresCleanupConfirmation(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleListDrivesIncludesSpider91Proxy(t *testing.T) {
|
||||
func TestHandleListCrawlersOnlyIncludesCrawlerPageScripts(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
|
||||
if err != nil {
|
||||
@@ -911,6 +862,20 @@ func TestHandleListDrivesIncludesSpider91Proxy(t *testing.T) {
|
||||
Credentials: map[string]string{
|
||||
"last_crawl_at": "1800000000",
|
||||
"proxy": " http://127.0.0.1:7890 ",
|
||||
"script_path": "/opt/video-site-91/91VideoSpider/spider_91porn.py",
|
||||
},
|
||||
Status: "ok",
|
||||
},
|
||||
{
|
||||
ID: "crawler-spider91",
|
||||
Kind: "scriptcrawler",
|
||||
Name: "91 Spider",
|
||||
RootID: "/",
|
||||
Credentials: map[string]string{
|
||||
"builtin": "spider91",
|
||||
"last_crawl_at": "1800000000",
|
||||
"proxy": " http://127.0.0.1:7890 ",
|
||||
"script_path": "/opt/video-site-91/91VideoSpider/spider_91porn.py",
|
||||
},
|
||||
Status: "ok",
|
||||
},
|
||||
@@ -930,39 +895,251 @@ func TestHandleListDrivesIncludesSpider91Proxy(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/admin/api/drives", nil)
|
||||
req := httptest.NewRequest(http.MethodGet, "/admin/api/crawlers", nil)
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{Catalog: cat}).handleListDrives(rr, req)
|
||||
srv := &AdminServer{Catalog: cat}
|
||||
srv.handleListCrawlers(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
|
||||
var got []struct {
|
||||
ID string `json:"id"`
|
||||
Spider91Proxy string `json:"spider91Proxy"`
|
||||
LastCrawlAt int64 `json:"lastCrawlAt"`
|
||||
ID string `json:"id"`
|
||||
Kind string `json:"kind"`
|
||||
Builtin string `json:"builtin"`
|
||||
Proxy string `json:"proxy"`
|
||||
LastCrawlAt int64 `json:"lastCrawlAt"`
|
||||
}
|
||||
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
byID := map[string]struct {
|
||||
Spider91Proxy string
|
||||
LastCrawlAt int64
|
||||
Kind string
|
||||
Builtin string
|
||||
Proxy string
|
||||
LastCrawlAt int64
|
||||
}{}
|
||||
for _, d := range got {
|
||||
byID[d.ID] = struct {
|
||||
Spider91Proxy string
|
||||
LastCrawlAt int64
|
||||
}{Spider91Proxy: d.Spider91Proxy, LastCrawlAt: d.LastCrawlAt}
|
||||
Kind string
|
||||
Builtin string
|
||||
Proxy string
|
||||
LastCrawlAt int64
|
||||
}{Kind: d.Kind, Builtin: d.Builtin, Proxy: d.Proxy, LastCrawlAt: d.LastCrawlAt}
|
||||
}
|
||||
if byID["spider91-main"].Spider91Proxy != "http://127.0.0.1:7890" {
|
||||
t.Fatalf("spider91 proxy = %q, want trimmed proxy", byID["spider91-main"].Spider91Proxy)
|
||||
if _, ok := byID["spider91-main"]; ok {
|
||||
t.Fatal("legacy spider91 drive should not be returned by crawler list")
|
||||
}
|
||||
if byID["spider91-main"].LastCrawlAt != 1800000000 {
|
||||
t.Fatalf("lastCrawlAt = %d, want 1800000000", byID["spider91-main"].LastCrawlAt)
|
||||
if byID["crawler-spider91"].Kind != "scriptcrawler" || byID["crawler-spider91"].Builtin != "spider91" {
|
||||
t.Fatalf("crawler kind/builtin = %q/%q, want scriptcrawler/spider91", byID["crawler-spider91"].Kind, byID["crawler-spider91"].Builtin)
|
||||
}
|
||||
if byID["onedrive-main"].Spider91Proxy != "" {
|
||||
t.Fatalf("onedrive spider91Proxy = %q, want empty", byID["onedrive-main"].Spider91Proxy)
|
||||
if byID["crawler-spider91"].Proxy != "http://127.0.0.1:7890" {
|
||||
t.Fatalf("crawler proxy = %q, want trimmed proxy", byID["crawler-spider91"].Proxy)
|
||||
}
|
||||
if byID["crawler-spider91"].LastCrawlAt != 1800000000 {
|
||||
t.Fatalf("lastCrawlAt = %d, want 1800000000", byID["crawler-spider91"].LastCrawlAt)
|
||||
}
|
||||
if _, ok := byID["onedrive-main"]; ok {
|
||||
t.Fatal("onedrive should not be returned by crawler list")
|
||||
}
|
||||
|
||||
driveReq := httptest.NewRequest(http.MethodGet, "/admin/api/drives", nil)
|
||||
driveRR := httptest.NewRecorder()
|
||||
srv.handleListDrives(driveRR, driveReq)
|
||||
if driveRR.Code != http.StatusOK {
|
||||
t.Fatalf("drive status = %d, body = %s", driveRR.Code, driveRR.Body.String())
|
||||
}
|
||||
var drives []struct {
|
||||
ID string `json:"id"`
|
||||
}
|
||||
if err := json.NewDecoder(driveRR.Body).Decode(&drives); err != nil {
|
||||
t.Fatalf("decode drives: %v", err)
|
||||
}
|
||||
driveIDs := map[string]bool{}
|
||||
for _, d := range drives {
|
||||
driveIDs[d.ID] = true
|
||||
}
|
||||
if !driveIDs["spider91-main"] {
|
||||
t.Fatal("legacy spider91 drive should remain visible in drive list for deletion")
|
||||
}
|
||||
if driveIDs["crawler-spider91"] {
|
||||
t.Fatal("scriptcrawler should not be returned by drive list")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleUpsertCrawlerAllowsBuiltinSpider91WithoutScriptPath(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cat, err := catalog.Open(t.TempDir() + "/catalog.db")
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
if err := cat.Close(); err != nil {
|
||||
t.Fatalf("close catalog: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers", strings.NewReader(`{
|
||||
"id": "spider91-main",
|
||||
"name": "91 Spider",
|
||||
"builtin": "spider91",
|
||||
"scriptPath": "",
|
||||
"pythonPath": "python3",
|
||||
"targetNew": "15"
|
||||
}`))
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{
|
||||
Catalog: cat,
|
||||
DefaultSpider91ScriptPath: func() string {
|
||||
return ""
|
||||
},
|
||||
}).handleUpsertCrawler(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
|
||||
got, err := cat.GetDrive(ctx, "spider91-main")
|
||||
if err != nil {
|
||||
t.Fatalf("get crawler drive: %v", err)
|
||||
}
|
||||
if got.Kind != "scriptcrawler" || got.Credentials["builtin"] != "spider91" {
|
||||
t.Fatalf("kind/builtin = %q/%q, want scriptcrawler/spider91", got.Kind, got.Credentials["builtin"])
|
||||
}
|
||||
if got.Credentials["script_path"] != "" {
|
||||
t.Fatalf("script_path = %q, want empty when default is unavailable", got.Credentials["script_path"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleImportCrawlerScriptFile(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
var body bytes.Buffer
|
||||
mw := multipart.NewWriter(&body)
|
||||
part, err := mw.CreateFormFile("file", "../demo crawler.py")
|
||||
if err != nil {
|
||||
t.Fatalf("create form file: %v", err)
|
||||
}
|
||||
if _, err := part.Write([]byte("print('crawler')\n")); err != nil {
|
||||
t.Fatalf("write part: %v", err)
|
||||
}
|
||||
if err := mw.Close(); err != nil {
|
||||
t.Fatalf("close multipart: %v", err)
|
||||
}
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-file", &body)
|
||||
req.Header.Set("Content-Type", mw.FormDataContentType())
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{LocalPreviewDir: filepath.Join(tmp, "previews")}).handleImportCrawlerScriptFile(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
var got struct {
|
||||
ScriptPath string `json:"scriptPath"`
|
||||
}
|
||||
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
wantRoot := filepath.Join(tmp, "crawler-scripts")
|
||||
if !strings.HasPrefix(got.ScriptPath, wantRoot+string(os.PathSeparator)) {
|
||||
t.Fatalf("script path = %q, want under %q", got.ScriptPath, wantRoot)
|
||||
}
|
||||
if filepath.Ext(got.ScriptPath) != ".py" {
|
||||
t.Fatalf("script path = %q, want .py", got.ScriptPath)
|
||||
}
|
||||
data, err := os.ReadFile(got.ScriptPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read imported script: %v", err)
|
||||
}
|
||||
if string(data) != "print('crawler')\n" {
|
||||
t.Fatalf("script content = %q", string(data))
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleImportCrawlerScriptFileRejectsNonPython(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
var body bytes.Buffer
|
||||
mw := multipart.NewWriter(&body)
|
||||
part, err := mw.CreateFormFile("file", "crawler.txt")
|
||||
if err != nil {
|
||||
t.Fatalf("create form file: %v", err)
|
||||
}
|
||||
if _, err := part.Write([]byte("print('crawler')\n")); err != nil {
|
||||
t.Fatalf("write part: %v", err)
|
||||
}
|
||||
if err := mw.Close(); err != nil {
|
||||
t.Fatalf("close multipart: %v", err)
|
||||
}
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-file", &body)
|
||||
req.Header.Set("Content-Type", mw.FormDataContentType())
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{LocalPreviewDir: filepath.Join(tmp, "previews")}).handleImportCrawlerScriptFile(rr, req)
|
||||
if rr.Code != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, want 400; body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
if !strings.Contains(rr.Body.String(), ".py") {
|
||||
t.Fatalf("body = %s, want .py error", rr.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleImportCrawlerScriptURL(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/crawler.py" {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
_, _ = w.Write([]byte("# crawler from url\n"))
|
||||
}))
|
||||
defer upstream.Close()
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-url", strings.NewReader(`{
|
||||
"url": "`+upstream.URL+`/crawler.py"
|
||||
}`))
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{LocalPreviewDir: filepath.Join(tmp, "previews")}).handleImportCrawlerScriptURL(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
var got struct {
|
||||
ScriptPath string `json:"scriptPath"`
|
||||
}
|
||||
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
wantRoot := filepath.Join(tmp, "crawler-scripts")
|
||||
if !strings.HasPrefix(got.ScriptPath, wantRoot+string(os.PathSeparator)) {
|
||||
t.Fatalf("script path = %q, want under %q", got.ScriptPath, wantRoot)
|
||||
}
|
||||
data, err := os.ReadFile(got.ScriptPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read imported script: %v", err)
|
||||
}
|
||||
if string(data) != "# crawler from url\n" {
|
||||
t.Fatalf("script content = %q", string(data))
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleImportCrawlerScriptURLRejectsNonPython(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/crawler.txt" {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
_, _ = w.Write([]byte("# crawler from url\n"))
|
||||
}))
|
||||
defer upstream.Close()
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/admin/api/crawlers/import-url", strings.NewReader(`{
|
||||
"url": "`+upstream.URL+`/crawler.txt"
|
||||
}`))
|
||||
rr := httptest.NewRecorder()
|
||||
(&AdminServer{LocalPreviewDir: filepath.Join(tmp, "previews")}).handleImportCrawlerScriptURL(rr, req)
|
||||
if rr.Code != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, want 400; body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
if !strings.Contains(rr.Body.String(), ".py") {
|
||||
t.Fatalf("body = %s, want .py error", rr.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -706,7 +706,19 @@ func (c *Catalog) ListVideoFileIDsByDrive(ctx context.Context, driveID string) (
|
||||
// 用途:crawler 把这个集合写到 seen 文件,让 Python/Go 跳过已爬过的视频,
|
||||
// 配合 --target-new 真正凑出 N 个未爬过的视频。
|
||||
func (c *Catalog) ListSpider91Viewkeys(ctx context.Context, driveID string) ([]string, error) {
|
||||
prefix := "spider91-" + driveID + "-"
|
||||
return c.ListCrawlerSourceIDs(ctx, "spider91", driveID)
|
||||
}
|
||||
|
||||
// ListCrawlerSourceIDs lists source IDs that were already imported by a
|
||||
// crawler-like drive. It reads both videos and deleted_videos so explicit admin
|
||||
// deletions remain tombstoned for future crawler runs.
|
||||
func (c *Catalog) ListCrawlerSourceIDs(ctx context.Context, kind, driveID string) ([]string, error) {
|
||||
kind = strings.TrimSpace(kind)
|
||||
driveID = strings.TrimSpace(driveID)
|
||||
if kind == "" || driveID == "" {
|
||||
return nil, nil
|
||||
}
|
||||
prefix := kind + "-" + driveID + "-"
|
||||
rows, err := c.db.QueryContext(ctx,
|
||||
`SELECT SUBSTR(id, ?) FROM videos WHERE id LIKE ? || '%'
|
||||
UNION
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,268 @@
|
||||
package scriptcrawler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/video-site/backend/internal/catalog"
|
||||
)
|
||||
|
||||
func TestCrawlerRunOnceImportsLocalFileAndSkipsExisting(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
tmp := t.TempDir()
|
||||
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
if err := cat.Close(); err != nil {
|
||||
t.Fatalf("close catalog: %v", err)
|
||||
}
|
||||
})
|
||||
drv := New(Config{ID: "demo", RootDir: filepath.Join(tmp, "crawler")})
|
||||
if err := drv.Init(ctx); err != nil {
|
||||
t.Fatalf("driver init: %v", err)
|
||||
}
|
||||
dummyScript := filepath.Join(tmp, "helper-script")
|
||||
if err := os.WriteFile(dummyScript, []byte("helper"), 0o755); err != nil {
|
||||
t.Fatalf("write dummy script: %v", err)
|
||||
}
|
||||
wrapper := filepath.Join(tmp, "helper-wrapper.sh")
|
||||
wrapperScript := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0])
|
||||
if err := os.WriteFile(wrapper, []byte(wrapperScript), 0o755); err != nil {
|
||||
t.Fatalf("write helper wrapper: %v", err)
|
||||
}
|
||||
|
||||
t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1")
|
||||
c := NewCrawler(CrawlerConfig{
|
||||
Driver: drv,
|
||||
Catalog: cat,
|
||||
PythonPath: wrapper,
|
||||
ScriptPath: dummyScript,
|
||||
})
|
||||
res, err := c.RunOnce(ctx, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("run once: %v", err)
|
||||
}
|
||||
if res.NewVideos != 1 || res.Skipped != 0 || res.Failed != 0 {
|
||||
t.Fatalf("result = new:%d skipped:%d failed:%d, want 1/0/0", res.NewVideos, res.Skipped, res.Failed)
|
||||
}
|
||||
v, err := cat.GetVideo(ctx, BuildVideoID("demo", "abc-123"))
|
||||
if err != nil {
|
||||
t.Fatalf("get video: %v", err)
|
||||
}
|
||||
if v.Title != "Imported From Helper" || v.FileID != "abc-123.mp4" || v.Size == 0 {
|
||||
t.Fatalf("video = title:%q file:%q size:%d", v.Title, v.FileID, v.Size)
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(drv.VideosDir(), "abc-123.mp4")); err != nil {
|
||||
t.Fatalf("video file not copied: %v", err)
|
||||
}
|
||||
|
||||
res, err = c.RunOnce(ctx, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("second run: %v", err)
|
||||
}
|
||||
if res.NewVideos != 0 || res.Skipped != 1 {
|
||||
t.Fatalf("second result = new:%d skipped:%d, want 0/1", res.NewVideos, res.Skipped)
|
||||
}
|
||||
if res.SeenSnapshot != 1 {
|
||||
t.Fatalf("seen snapshot = %d, want 1", res.SeenSnapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawlerRunOnceUsesSourceKindNamespace(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
tmp := t.TempDir()
|
||||
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
if err := cat.Close(); err != nil {
|
||||
t.Fatalf("close catalog: %v", err)
|
||||
}
|
||||
})
|
||||
drv := New(Config{ID: "demo", RootDir: filepath.Join(tmp, "crawler")})
|
||||
if err := drv.Init(ctx); err != nil {
|
||||
t.Fatalf("driver init: %v", err)
|
||||
}
|
||||
dummyScript := filepath.Join(tmp, "helper-script")
|
||||
if err := os.WriteFile(dummyScript, []byte("helper"), 0o755); err != nil {
|
||||
t.Fatalf("write dummy script: %v", err)
|
||||
}
|
||||
wrapper := filepath.Join(tmp, "helper-wrapper.sh")
|
||||
wrapperScript := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0])
|
||||
if err := os.WriteFile(wrapper, []byte(wrapperScript), 0o755); err != nil {
|
||||
t.Fatalf("write helper wrapper: %v", err)
|
||||
}
|
||||
|
||||
t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1")
|
||||
c := NewCrawler(CrawlerConfig{
|
||||
Driver: drv,
|
||||
Catalog: cat,
|
||||
SourceKind: "spider91",
|
||||
PythonPath: wrapper,
|
||||
ScriptPath: dummyScript,
|
||||
})
|
||||
res, err := c.RunOnce(ctx, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("run once: %v", err)
|
||||
}
|
||||
if res.NewVideos != 1 || res.SeenSnapshot != 0 {
|
||||
t.Fatalf("result = new:%d seen:%d, want 1/0", res.NewVideos, res.SeenSnapshot)
|
||||
}
|
||||
videoID := BuildVideoIDForKind("spider91", "demo", "abc-123")
|
||||
if _, err := cat.GetVideo(ctx, videoID); err != nil {
|
||||
t.Fatalf("get source-kind video: %v", err)
|
||||
}
|
||||
if _, err := cat.GetVideo(ctx, BuildVideoID("demo", "abc-123")); err == nil {
|
||||
t.Fatalf("default namespace video unexpectedly exists")
|
||||
}
|
||||
|
||||
res, err = c.RunOnce(ctx, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("second run: %v", err)
|
||||
}
|
||||
if res.NewVideos != 0 || res.Skipped != 1 || res.SeenSnapshot != 1 {
|
||||
t.Fatalf("second result = new:%d skipped:%d seen:%d, want 0/1/1", res.NewVideos, res.Skipped, res.SeenSnapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawlerRunOnceImportsSimpleMediaURLWithoutSourceID(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
tmp := t.TempDir()
|
||||
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open catalog: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
if err := cat.Close(); err != nil {
|
||||
t.Fatalf("close catalog: %v", err)
|
||||
}
|
||||
})
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/video.mp4" {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
_, _ = w.Write([]byte("simple-video-bytes"))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
drv := New(Config{ID: "demo", RootDir: filepath.Join(tmp, "crawler")})
|
||||
if err := drv.Init(ctx); err != nil {
|
||||
t.Fatalf("driver init: %v", err)
|
||||
}
|
||||
dummyScript := filepath.Join(tmp, "helper-script")
|
||||
if err := os.WriteFile(dummyScript, []byte("helper"), 0o755); err != nil {
|
||||
t.Fatalf("write dummy script: %v", err)
|
||||
}
|
||||
wrapper := filepath.Join(tmp, "helper-wrapper.sh")
|
||||
wrapperScript := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0])
|
||||
if err := os.WriteFile(wrapper, []byte(wrapperScript), 0o755); err != nil {
|
||||
t.Fatalf("write helper wrapper: %v", err)
|
||||
}
|
||||
|
||||
t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1")
|
||||
t.Setenv("GO_WANT_SCRIPTCRAWLER_SIMPLE", "1")
|
||||
t.Setenv("GO_SCRIPTCRAWLER_MEDIA_URL", srv.URL+"/video.mp4?token=first")
|
||||
c := NewCrawler(CrawlerConfig{
|
||||
Driver: drv,
|
||||
Catalog: cat,
|
||||
PythonPath: wrapper,
|
||||
ScriptPath: dummyScript,
|
||||
HTTPClient: srv.Client(),
|
||||
})
|
||||
res, err := c.RunOnce(ctx, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("run once: %v", err)
|
||||
}
|
||||
if res.NewVideos != 1 || res.Skipped != 0 || res.Failed != 0 {
|
||||
t.Fatalf("result = new:%d skipped:%d failed:%d, want 1/0/0", res.NewVideos, res.Skipped, res.Failed)
|
||||
}
|
||||
videos, err := cat.ListVideosByDrive(ctx, "demo")
|
||||
if err != nil {
|
||||
t.Fatalf("list videos: %v", err)
|
||||
}
|
||||
if len(videos) != 1 {
|
||||
t.Fatalf("videos = %d, want 1", len(videos))
|
||||
}
|
||||
v := videos[0]
|
||||
if !strings.HasPrefix(v.ID, BuildVideoID("demo", "auto-")) {
|
||||
t.Fatalf("video id = %q, want generated auto source id", v.ID)
|
||||
}
|
||||
if v.Title != "Simple Protocol Video" || v.Ext != "mp4" || v.ThumbnailURL != "" || v.Size == 0 {
|
||||
t.Fatalf("video = title:%q ext:%q thumb:%q size:%d", v.Title, v.Ext, v.ThumbnailURL, v.Size)
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(drv.VideosDir(), v.FileID)); err != nil {
|
||||
t.Fatalf("video file not downloaded: %v", err)
|
||||
}
|
||||
|
||||
t.Setenv("GO_SCRIPTCRAWLER_MEDIA_URL", srv.URL+"/video.mp4?token=second")
|
||||
res, err = c.RunOnce(ctx, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("second run: %v", err)
|
||||
}
|
||||
if res.NewVideos != 0 || res.Skipped != 1 {
|
||||
t.Fatalf("second result = new:%d skipped:%d, want 0/1", res.NewVideos, res.Skipped)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScriptCrawlerHelperProcess(t *testing.T) {
|
||||
if os.Getenv("GO_WANT_SCRIPTCRAWLER_HELPER") != "1" {
|
||||
return
|
||||
}
|
||||
args := os.Args
|
||||
jobPath := ""
|
||||
for i := 0; i < len(args)-1; i++ {
|
||||
if args[i] == "--job" {
|
||||
jobPath = args[i+1]
|
||||
break
|
||||
}
|
||||
}
|
||||
if jobPath == "" {
|
||||
fmt.Fprintln(os.Stderr, "missing --job")
|
||||
os.Exit(2)
|
||||
}
|
||||
data, err := os.ReadFile(jobPath)
|
||||
if err != nil {
|
||||
fmt.Fprintln(os.Stderr, err)
|
||||
os.Exit(2)
|
||||
}
|
||||
var job Job
|
||||
if err := json.Unmarshal(data, &job); err != nil {
|
||||
fmt.Fprintln(os.Stderr, err)
|
||||
os.Exit(2)
|
||||
}
|
||||
if os.Getenv("GO_WANT_SCRIPTCRAWLER_SIMPLE") == "1" {
|
||||
event := map[string]any{
|
||||
"title": "Simple Protocol Video",
|
||||
"media_url": os.Getenv("GO_SCRIPTCRAWLER_MEDIA_URL"),
|
||||
}
|
||||
_ = json.NewEncoder(os.Stdout).Encode(event)
|
||||
os.Exit(0)
|
||||
}
|
||||
localFile := filepath.Join(job.OutputDir, "helper.mp4")
|
||||
if err := os.WriteFile(localFile, []byte("helper-video"), 0o644); err != nil {
|
||||
fmt.Fprintln(os.Stderr, err)
|
||||
os.Exit(2)
|
||||
}
|
||||
event := Event{
|
||||
Type: "item",
|
||||
Item: Item{
|
||||
SourceID: "abc-123",
|
||||
Title: "Imported From Helper",
|
||||
Author: "helper",
|
||||
Media: MediaRef{LocalFile: localFile},
|
||||
},
|
||||
}
|
||||
_ = json.NewEncoder(os.Stdout).Encode(event)
|
||||
os.Exit(0)
|
||||
}
|
||||
@@ -0,0 +1,172 @@
|
||||
// Package scriptcrawler provides a generic local drive for script-based
|
||||
// crawlers. A crawler script discovers videos; the Go runner downloads them
|
||||
// into this drive and the existing preview/fingerprint workers consume them
|
||||
// through the normal drives.Drive interface.
|
||||
package scriptcrawler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/video-site/backend/internal/drives"
|
||||
)
|
||||
|
||||
// Kind is the drive kind identifier that Driver.Kind reports.
const Kind = "scriptcrawler"
|
||||
|
||||
// Config carries the settings New copies into a Driver.
type Config struct {
	// ID is the drive identifier; Init rejects a blank value.
	ID string
	// RootDir is the directory under which Init creates the driver's
	// subdirectories; Init rejects a blank value.
	RootDir string
}
|
||||
|
||||
// Driver is a drive backed by a local directory tree rooted at rootDir.
type Driver struct {
	// id is the drive identifier returned by ID.
	id string
	// rootDir is the base directory that holds the videos, thumbs, output
	// and .crawl subdirectories.
	rootDir string
}
|
||||
|
||||
func New(c Config) *Driver {
|
||||
return &Driver{id: c.ID, rootDir: c.RootDir}
|
||||
}
|
||||
|
||||
// Kind returns the package-level Kind constant.
func (d *Driver) Kind() string {
	return Kind
}
|
||||
|
||||
// ID returns the drive identifier the driver was constructed with.
func (d *Driver) ID() string {
	return d.id
}
|
||||
|
||||
// RootID returns the fixed root identifier "/".
func (d *Driver) RootID() string {
	return "/"
}
|
||||
|
||||
func (d *Driver) Init(context.Context) error {
|
||||
if strings.TrimSpace(d.id) == "" {
|
||||
return errors.New("scriptcrawler: empty drive id")
|
||||
}
|
||||
if strings.TrimSpace(d.rootDir) == "" {
|
||||
return errors.New("scriptcrawler: empty root dir")
|
||||
}
|
||||
for _, sub := range []string{"videos", "thumbs", "output", ".crawl"} {
|
||||
if err := os.MkdirAll(filepath.Join(d.rootDir, sub), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// RootDir returns the base directory the driver was constructed with.
func (d *Driver) RootDir() string {
	return d.rootDir
}
|
||||
|
||||
// VideosDir returns the "videos" subdirectory of the root directory.
func (d *Driver) VideosDir() string {
	return filepath.Join(d.rootDir, "videos")
}
|
||||
|
||||
// ThumbsDir returns the "thumbs" subdirectory of the root directory.
func (d *Driver) ThumbsDir() string {
	return filepath.Join(d.rootDir, "thumbs")
}
|
||||
|
||||
// OutputDir returns the "output" subdirectory of the root directory.
func (d *Driver) OutputDir() string {
	return filepath.Join(d.rootDir, "output")
}
|
||||
|
||||
// CrawlDir returns the ".crawl" subdirectory of the root directory.
func (d *Driver) CrawlDir() string {
	return filepath.Join(d.rootDir, ".crawl")
}
|
||||
|
||||
func (d *Driver) VideoPath(fileID string) (string, error) {
|
||||
return safeJoin(d.VideosDir(), fileID)
|
||||
}
|
||||
|
||||
func (d *Driver) ThumbPath(fileID string) (string, error) {
|
||||
return safeJoin(d.ThumbsDir(), fileID)
|
||||
}
|
||||
|
||||
func (d *Driver) OutputPath(fileName string) (string, error) {
|
||||
return safeJoin(d.OutputDir(), fileName)
|
||||
}
|
||||
|
||||
func (d *Driver) List(context.Context, string) ([]drives.Entry, error) {
|
||||
entries, err := os.ReadDir(d.VideosDir())
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
out := make([]drives.Entry, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
if e.IsDir() {
|
||||
continue
|
||||
}
|
||||
info, err := e.Info()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, drives.Entry{
|
||||
ID: e.Name(),
|
||||
Name: e.Name(),
|
||||
Size: info.Size(),
|
||||
IsDir: false,
|
||||
ModTime: info.ModTime(),
|
||||
})
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (d *Driver) Stat(ctx context.Context, fileID string) (*drives.Entry, error) {
|
||||
path, err := d.VideoPath(fileID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &drives.Entry{
|
||||
ID: fileID,
|
||||
Name: fileID,
|
||||
Size: info.Size(),
|
||||
IsDir: info.IsDir(),
|
||||
ModTime: info.ModTime(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (d *Driver) StreamURL(ctx context.Context, fileID string) (*drives.StreamLink, error) {
|
||||
path, err := d.VideoPath(fileID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if info.IsDir() || info.Size() == 0 {
|
||||
return nil, os.ErrNotExist
|
||||
}
|
||||
return &drives.StreamLink{
|
||||
URL: path,
|
||||
Expires: time.Now().Add(24 * time.Hour),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (d *Driver) Upload(context.Context, string, string, io.Reader, int64) (string, error) {
|
||||
return "", drives.ErrNotSupported
|
||||
}
|
||||
|
||||
func (d *Driver) EnsureDir(context.Context, string) (string, error) {
|
||||
return "", drives.ErrNotSupported
|
||||
}
|
||||
|
||||
// safeJoin resolves fileID (with surrounding whitespace trimmed) to an
// absolute path under root.
//
// The ID must be non-empty and must equal its own filepath.Base, which rules
// out IDs made of several path components. The resolved path must be root
// itself or lie beneath it; anything else is reported as escaping the root. A
// blank root is rejected as well.
func safeJoin(root, fileID string) (string, error) {
	name := strings.TrimSpace(fileID)
	if name == "" || name != filepath.Base(name) {
		return "", errors.New("scriptcrawler: invalid file id")
	}
	if strings.TrimSpace(root) == "" {
		return "", errors.New("scriptcrawler: empty root")
	}

	absRoot, err := filepath.Abs(root)
	if err != nil {
		return "", err
	}
	resolved, err := filepath.Abs(filepath.Join(absRoot, name))
	if err != nil {
		return "", err
	}

	// Comparing against root plus a trailing separator keeps sibling
	// directories that merely share the prefix (e.g. "root2") outside.
	withinRoot := resolved == absRoot ||
		strings.HasPrefix(resolved, absRoot+string(os.PathSeparator))
	if !withinRoot {
		return "", errors.New("scriptcrawler: file id escapes root")
	}
	return resolved, nil
}
|
||||
|
||||
// Compile-time assertion that *Driver implements drives.Drive.
var _ drives.Drive = (*Driver)(nil)
|
||||
@@ -34,6 +34,7 @@ import (
|
||||
"github.com/video-site/backend/internal/drives/p115"
|
||||
"github.com/video-site/backend/internal/drives/p123"
|
||||
"github.com/video-site/backend/internal/drives/pikpak"
|
||||
"github.com/video-site/backend/internal/drives/scriptcrawler"
|
||||
"github.com/video-site/backend/internal/drives/spider91"
|
||||
"github.com/video-site/backend/internal/mediaasset"
|
||||
)
|
||||
@@ -58,6 +59,17 @@ type uploadTarget interface {
|
||||
Rename(ctx context.Context, fileID, newName string) error
|
||||
}
|
||||
|
||||
// Spider91LocalSource is the local source interface used by the migration
|
||||
// worker. Legacy spider91.Driver and the new scriptcrawler.Driver both satisfy
|
||||
// it when they are mounted for the Spider91 built-in crawler.
|
||||
type Spider91LocalSource interface {
|
||||
drives.Drive
|
||||
VideosDir() string
|
||||
ThumbsDir() string
|
||||
VideoPath(fileID string) (string, error)
|
||||
ThumbPath(fileID string) (string, error)
|
||||
}
|
||||
|
||||
// UploadResult 是 uploadTarget.UploadAndReportHash 的归一返回。
|
||||
//
|
||||
// FileID 目标盘上的新文件 ID;
|
||||
@@ -364,7 +376,7 @@ func (m *Migrator) runOnce(ctx context.Context) {
|
||||
}
|
||||
|
||||
migrated := 0
|
||||
for _, src := range m.spider91Drives() {
|
||||
for _, src := range m.spider91Drives(ctx) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return
|
||||
}
|
||||
@@ -387,7 +399,7 @@ func (m *Migrator) runOnce(ctx context.Context) {
|
||||
// 收尾:扫每个 spider91 drive 的本地目录,把 catalog 已经迁到别处但本地
|
||||
// 仍有残留的孤儿文件清掉。这是纯防御性兜底——正常路径下 migrateDrive
|
||||
// 已经在迁移成功后立刻 CleanupSpider91Local,不会留孤儿。
|
||||
for _, src := range m.spider91Drives() {
|
||||
for _, src := range m.spider91Drives(ctx) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return
|
||||
}
|
||||
@@ -448,21 +460,41 @@ func (m *Migrator) resolveTarget() (string, uploadTarget, error) {
|
||||
return id, t, nil
|
||||
}
|
||||
|
||||
// spider91Drives 返回当前注册的所有 spider91 driver。
|
||||
func (m *Migrator) spider91Drives() []*spider91.Driver {
|
||||
// spider91Drives 返回当前注册的所有 Spider91 来源本地爬虫 driver。
|
||||
func (m *Migrator) spider91Drives(ctx context.Context) []Spider91LocalSource {
|
||||
all := m.cfg.Registry.All()
|
||||
out := make([]*spider91.Driver, 0, len(all))
|
||||
out := make([]Spider91LocalSource, 0, len(all))
|
||||
for _, d := range all {
|
||||
if d.Kind() != spider91.Kind {
|
||||
if !m.isSpider91SourceDrive(ctx, d) {
|
||||
continue
|
||||
}
|
||||
if sd, ok := d.(*spider91.Driver); ok {
|
||||
if sd, ok := d.(Spider91LocalSource); ok {
|
||||
out = append(out, sd)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (m *Migrator) isSpider91SourceDrive(ctx context.Context, d drives.Drive) bool {
|
||||
if d == nil {
|
||||
return false
|
||||
}
|
||||
if d.Kind() == spider91.Kind {
|
||||
return true
|
||||
}
|
||||
if d.Kind() != scriptcrawler.Kind || m.cfg.Catalog == nil {
|
||||
return false
|
||||
}
|
||||
row, err := m.cfg.Catalog.GetDrive(ctx, d.ID())
|
||||
if err != nil || row == nil {
|
||||
return false
|
||||
}
|
||||
if row.Kind == spider91.Kind {
|
||||
return true
|
||||
}
|
||||
return row.Kind == scriptcrawler.Kind && strings.EqualFold(strings.TrimSpace(row.Credentials["builtin"]), spider91.Kind)
|
||||
}
|
||||
|
||||
// migrateDrive 对单个 spider91 drive 跑一批迁移;返回成功迁移的条数。
|
||||
//
|
||||
// 策略(与"本地缓存最新 N 个"语义一致):
|
||||
@@ -473,7 +505,7 @@ func (m *Migrator) spider91Drives() []*spider91.Driver {
|
||||
// - 已经迁移过但本地还有残留 → 仅删本地(兜底)
|
||||
//
|
||||
// KeepLatestN < 0 时不保护任何本地文件,全部尝试迁移(旧行为,主要给测试用)。
|
||||
func (m *Migrator) migrateDrive(ctx context.Context, src *spider91.Driver, targetDriveID string, pp uploadTarget) (int, error) {
|
||||
func (m *Migrator) migrateDrive(ctx context.Context, src Spider91LocalSource, targetDriveID string, pp uploadTarget) (int, error) {
|
||||
keepN := m.cfg.KeepLatestN
|
||||
if keepN < 0 {
|
||||
keepN = 0
|
||||
@@ -574,7 +606,7 @@ func (m *Migrator) migrateDrive(ctx context.Context, src *spider91.Driver, targe
|
||||
// migrateOne 把单条 spider91 视频上传到目标盘并改写 catalog。
|
||||
// 返回 (true, nil) 表示真的迁了一条;(false, nil) 表示跳过(本地文件已不在等);
|
||||
// (false, err) 表示真出错。
|
||||
func (m *Migrator) migrateOne(ctx context.Context, v *catalog.Video, src *spider91.Driver, targetDriveID string, pp uploadTarget) (bool, error) {
|
||||
func (m *Migrator) migrateOne(ctx context.Context, v *catalog.Video, src Spider91LocalSource, targetDriveID string, pp uploadTarget) (bool, error) {
|
||||
path, err := src.VideoPath(v.FileID)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("resolve local path: %w", err)
|
||||
@@ -637,7 +669,7 @@ func (m *Migrator) migrateOne(ctx context.Context, v *catalog.Video, src *spider
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func (m *Migrator) preserveCrawledThumbnail(ctx context.Context, src *spider91.Driver, v *catalog.Video) {
|
||||
func (m *Migrator) preserveCrawledThumbnail(ctx context.Context, src Spider91LocalSource, v *catalog.Video) {
|
||||
if m == nil || m.cfg.Catalog == nil || src == nil || v == nil || v.ID == "" || v.FileID == "" {
|
||||
return
|
||||
}
|
||||
@@ -676,7 +708,7 @@ func (m *Migrator) preserveCrawledThumbnail(ctx context.Context, src *spider91.D
|
||||
v.ThumbnailURL = "/p/thumb/" + v.ID
|
||||
}
|
||||
|
||||
func findSpider91ThumbPath(src *spider91.Driver, fileID string) (string, bool) {
|
||||
func findSpider91ThumbPath(src Spider91LocalSource, fileID string) (string, bool) {
|
||||
thumbBase := stripExt(fileID)
|
||||
for _, ext := range []string{".jpg", ".jpeg", ".png", ".webp"} {
|
||||
thumbPath, err := src.ThumbPath(thumbBase + ext)
|
||||
@@ -722,7 +754,7 @@ func copyFileAtomic(src, dst string) error {
|
||||
// 我们不知道具体是 .jpg 还是别的,逐个尝试常见后缀)。
|
||||
//
|
||||
// 暴露成包级函数方便 cleanup 模块复用(任务 6)。
|
||||
func CleanupSpider91Local(src *spider91.Driver, fileID string) {
|
||||
func CleanupSpider91Local(src Spider91LocalSource, fileID string) {
|
||||
videoPath, err := src.VideoPath(fileID)
|
||||
if err == nil {
|
||||
if err := os.Remove(videoPath); err != nil && !os.IsNotExist(err) {
|
||||
@@ -759,7 +791,7 @@ func stripExt(name string) string {
|
||||
// 找到孤儿。
|
||||
//
|
||||
// 返回实际删除的文件个数。
|
||||
func (m *Migrator) cleanupOldLocalVideos(ctx context.Context, src *spider91.Driver) (int, error) {
|
||||
func (m *Migrator) cleanupOldLocalVideos(ctx context.Context, src Spider91LocalSource) (int, error) {
|
||||
entries, err := os.ReadDir(src.VideosDir())
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"github.com/video-site/backend/internal/drives/googledrive"
|
||||
"github.com/video-site/backend/internal/drives/p123"
|
||||
"github.com/video-site/backend/internal/drives/pikpak"
|
||||
"github.com/video-site/backend/internal/drives/scriptcrawler"
|
||||
"github.com/video-site/backend/internal/drives/spider91"
|
||||
)
|
||||
|
||||
@@ -599,6 +600,88 @@ func TestCleanupRemovesAllAlreadyMigratedOrphans(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunOnceMigratesBuiltInSpider91ScriptCrawlerSource(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cat := setupCatalog(t)
|
||||
src := scriptcrawler.New(scriptcrawler.Config{ID: "spider-script", RootDir: t.TempDir()})
|
||||
if err := src.Init(ctx); err != nil {
|
||||
t.Fatalf("scriptcrawler init: %v", err)
|
||||
}
|
||||
if err := cat.UpsertDrive(ctx, &catalog.Drive{
|
||||
ID: src.ID(),
|
||||
Kind: scriptcrawler.Kind,
|
||||
Name: "Built-in Spider91",
|
||||
Credentials: map[string]string{"builtin": "spider91"},
|
||||
}); err != nil {
|
||||
t.Fatalf("upsert source drive: %v", err)
|
||||
}
|
||||
pp := newFakePikPak("pikpak-target", "pikpak-root-id")
|
||||
reg := newFakeRegistry()
|
||||
reg.Add(src)
|
||||
reg.Add(pp)
|
||||
|
||||
fileID := "vk-script.mp4"
|
||||
videoPath, err := src.VideoPath(fileID)
|
||||
if err != nil {
|
||||
t.Fatalf("video path: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(videoPath, []byte("scriptcrawler spider91 video"), 0o644); err != nil {
|
||||
t.Fatalf("write video: %v", err)
|
||||
}
|
||||
thumbPath, err := src.ThumbPath("vk-script.jpg")
|
||||
if err != nil {
|
||||
t.Fatalf("thumb path: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(thumbPath, []byte("thumb"), 0o644); err != nil {
|
||||
t.Fatalf("write thumb: %v", err)
|
||||
}
|
||||
now := time.Now()
|
||||
id := "spider91-" + src.ID() + "-vk-script"
|
||||
if err := cat.UpsertVideo(ctx, &catalog.Video{
|
||||
ID: id,
|
||||
DriveID: src.ID(),
|
||||
FileID: fileID,
|
||||
FileName: fileID,
|
||||
Title: "Scriptcrawler Spider91",
|
||||
Author: "91porn",
|
||||
Ext: "mp4",
|
||||
Quality: "HD",
|
||||
Size: int64(len("scriptcrawler spider91 video")),
|
||||
PreviewStatus: "pending",
|
||||
PublishedAt: now,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}); err != nil {
|
||||
t.Fatalf("upsert video: %v", err)
|
||||
}
|
||||
|
||||
m := New(Config{
|
||||
Catalog: cat,
|
||||
Registry: reg,
|
||||
GetTargetDriveID: func() string { return pp.ID() },
|
||||
KeepLatestN: -1,
|
||||
CommonThumbDir: t.TempDir(),
|
||||
})
|
||||
m.runOnce(ctx)
|
||||
|
||||
if pp.uploadCalls != 1 {
|
||||
t.Fatalf("upload calls = %d, want 1", pp.uploadCalls)
|
||||
}
|
||||
got, err := cat.GetVideo(ctx, id)
|
||||
if err != nil {
|
||||
t.Fatalf("get migrated video: %v", err)
|
||||
}
|
||||
if got.DriveID != pp.ID() {
|
||||
t.Fatalf("drive_id = %q, want %q", got.DriveID, pp.ID())
|
||||
}
|
||||
if _, err := os.Stat(videoPath); !os.IsNotExist(err) {
|
||||
t.Fatalf("local video stat err = %v, want not exist", err)
|
||||
}
|
||||
if _, err := os.Stat(thumbPath); !os.IsNotExist(err) {
|
||||
t.Fatalf("local thumb stat err = %v, want not exist", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunOnceKeepsAllLocalWhenWithinKeepWindow 验证:本地文件数 ≤ KeepLatestN 时
|
||||
// 一律不上传,全部留作"最新 N"缓存。这是用户的核心需求:刚爬下来的 15 个不要立即被传走。
|
||||
func TestRunOnceKeepsAllLocalWhenWithinKeepWindow(t *testing.T) {
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
# Crawler Script Protocol v1
|
||||
|
||||
Crawler scripts are external processes. The Go backend is the host: it handles
|
||||
dedupe, downloading, catalog writes, thumbnails, preview videos, fingerprints,
|
||||
task status and cancellation.
|
||||
|
||||
## Invocation
|
||||
|
||||
The backend runs:
|
||||
|
||||
```bash
|
||||
python3 /path/to/crawler.py --job /path/to/job.json
|
||||
```
|
||||
|
||||
`job.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"protocol": "crawler.v1",
|
||||
"mode": "crawl",
|
||||
"run_id": "20260609T120000Z",
|
||||
"crawler_id": "example",
|
||||
"target_new": 10,
|
||||
"seen_source_ids_file": "/data/scriptcrawlers/example/.crawl/seen.txt",
|
||||
"output_dir": "/data/scriptcrawlers/example/output",
|
||||
"config": {
|
||||
"category": "hot"
|
||||
},
|
||||
"network": {
|
||||
"proxy_url": "http://127.0.0.1:7890"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Importing Scripts
|
||||
|
||||
Crawler scripts are configured from the admin crawler page. A script can be
|
||||
entered as an existing server path, uploaded as a local file, or imported from
|
||||
an HTTP(S) URL.
|
||||
|
||||
Imported scripts are copied into `crawler-scripts/` next to the configured local
|
||||
preview data directory. The import API currently accepts Python files only
|
||||
(`.py`) and rejects empty files or files larger than 2 MiB.
|
||||
|
||||
## Output
|
||||
|
||||
stdout must contain only JSON Lines: one JSON object per line and nothing else. All logs must go to stderr.
|
||||
|
||||
Recommended item event:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "item",
|
||||
"title": "Video title",
|
||||
"media_url": "https://cdn.example.test/video.mp4",
|
||||
"thumbnail_url": "https://cdn.example.test/cover.jpg",
|
||||
"source_id": "site-native-id",
|
||||
"headers": {
|
||||
"Referer": "https://example.test/"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Minimum item event:
|
||||
|
||||
```json
|
||||
{"type":"item","title":"Video title","media_url":"https://cdn.example.test/video.mp4"}
|
||||
```
|
||||
|
||||
If a line contains item fields such as `title` and `media_url`, the backend also
|
||||
treats it as an item when `type` is omitted.
|
||||
|
||||
The item fields may also be wrapped inside `"item"` if that is more convenient:
|
||||
|
||||
```json
|
||||
{"type":"item","item":{"title":"Video title","media_url":"https://cdn.example.test/video.mp4"}}
|
||||
```
|
||||
|
||||
Optional progress/done events:
|
||||
|
||||
```json
|
||||
{"type":"progress","checked":20,"emitted":3}
|
||||
{"type":"done","stats":{"emitted":10}}
|
||||
```
|
||||
|
||||
## Simple Field Rules
|
||||
|
||||
- `title` is required.
|
||||
- `media_url` is required unless the script supplies a local file instead (see Advanced Fields). The backend downloads the video from this URL.
|
||||
- `thumbnail_url` is optional. If it is empty, the backend generates a thumbnail
|
||||
from the downloaded video.
|
||||
- `source_id` is optional but recommended. If present, it should be stable
|
||||
within one crawler and lets the backend skip known videos before downloading.
|
||||
If it is empty, the backend creates an internal `auto-...` ID and later relies
|
||||
on the existing video fingerprint dedupe path.
|
||||
- `headers` is optional and is applied to both video and thumbnail downloads.
|
||||
Use it for `Referer`, cookies or anti-hotlinking requirements.
|
||||
|
||||
## Advanced Fields
|
||||
|
||||
- `detail_url`, `author`, `tags`, `category`, `quality`, `duration_seconds`,
|
||||
`description` and `published_at` are optional metadata fields.
|
||||
- If video and thumbnail need different headers, use `media_headers` and
|
||||
`thumbnail_headers`.
|
||||
- Existing nested fields are still supported for compatibility:
|
||||
`media.url`, `media.local_file`, `media.headers`, `thumbnail.url`,
|
||||
`thumbnail.local_file`, `thumbnail.headers`.
|
||||
- Advanced scripts may download into `job.output_dir` and return
|
||||
`media_local_file` or `media.local_file`. The path must stay inside
|
||||
`output_dir`.
|
||||
- Scripts can read `seen_source_ids_file` and skip known IDs when they provide
|
||||
stable `source_id` values. The backend still dedupes every item.
|
||||
- The backend stops the process after `target_new` new videos are imported.
|
||||
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
|
||||
def load_seen(path):
    """Load the set of already-known source IDs from *path*.

    The file holds one source ID per line. Surrounding whitespace is stripped
    and blank lines are ignored.

    Args:
        path: Location of the seen-IDs file. May be empty or ``None`` when the
            job does not provide ``seen_source_ids_file``.

    Returns:
        A set of source ID strings; empty when no path is given or the file
        does not exist.
    """
    # A job without "seen_source_ids_file" (or with an explicit JSON null)
    # means nothing has been seen yet. Without this guard open(None) raises
    # TypeError and the whole crawl fails.
    if not path:
        return set()
    try:
        with open(path, "r", encoding="utf-8") as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()
|
||||
|
||||
|
||||
def main():
    """Run the demo crawler for one job file.

    Parses the required ``--job`` argument, reads the job JSON, and writes
    events to stdout as JSON Lines: a single ``item`` event followed by a
    ``done`` event, or only a ``done`` event with ``emitted: 0`` when the demo
    source ID is already listed in the job's seen-IDs file.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--job", required=True)
    options = cli.parse_args()

    with open(options.job, "r", encoding="utf-8") as job_file:
        job = json.load(job_file)

    known_ids = load_seen(job.get("seen_source_ids_file", ""))
    source_id = "demo-video-1"

    def emit_done(count):
        # Every event is flushed immediately so the host sees it right away.
        summary = {"type": "done", "stats": {"emitted": count}}
        print(json.dumps(summary), flush=True)

    if source_id in known_ids:
        emit_done(0)
        return

    item = {
        "type": "item",
        "source_id": source_id,
        "title": "Demo Video",
        "media_url": "https://example.test/video/demo-video-1.mp4",
        "thumbnail_url": "https://example.test/thumb/demo-video-1.jpg",
        "headers": {
            "Referer": "https://example.test/",
        },
    }
    print(json.dumps(item, ensure_ascii=False), flush=True)
    emit_done(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # stdout carries only JSON Lines events, so the failure summary goes to
    # stderr. The exception is re-raised afterwards so it still propagates.
    try:
        main()
    except Exception as err:
        print(f"crawler failed: {err}", file=sys.stderr, flush=True)
        raise
|
||||
@@ -8,6 +8,7 @@ import { AdminLayout } from "@/admin/AdminLayout";
|
||||
import { LoginPage } from "@/admin/LoginPage";
|
||||
import { RequireAuth } from "@/admin/RequireAuth";
|
||||
import { DrivesPage } from "@/admin/DrivesPage";
|
||||
import { CrawlersPage } from "@/admin/CrawlersPage";
|
||||
import { VideosPage } from "@/admin/VideosPage";
|
||||
import { TagsPage } from "@/admin/TagsPage";
|
||||
import { ThemePage } from "@/admin/ThemePage";
|
||||
@@ -70,6 +71,7 @@ export default function App() {
|
||||
>
|
||||
<Route index element={<Navigate to="/admin/drives" replace />} />
|
||||
<Route path="drives" element={<DrivesPage />} />
|
||||
<Route path="crawlers" element={<CrawlersPage />} />
|
||||
<Route path="videos" element={<VideosPage />} />
|
||||
<Route path="tags" element={<TagsPage />} />
|
||||
<Route path="theme" element={<ThemePage />} />
|
||||
|
||||
@@ -14,6 +14,7 @@ import {
|
||||
import * as api from "./api";
|
||||
import { useAuth } from "./AuthContext";
|
||||
import { useToast } from "./ToastContext";
|
||||
import { SpiderIcon } from "./icons/SpiderIcon";
|
||||
|
||||
export function AdminLayout() {
|
||||
const { logout } = useAuth();
|
||||
@@ -88,6 +89,14 @@ export function AdminLayout() {
|
||||
>
|
||||
<HardDrive size={16} /> 网盘管理
|
||||
</NavLink>
|
||||
<NavLink
|
||||
to="/admin/crawlers"
|
||||
className={({ isActive }) =>
|
||||
`admin-nav__link ${isActive ? "is-active" : ""}`
|
||||
}
|
||||
>
|
||||
<SpiderIcon size={16} /> 爬虫管理
|
||||
</NavLink>
|
||||
<NavLink
|
||||
to="/admin/videos"
|
||||
className={({ isActive }) =>
|
||||
|
||||
@@ -0,0 +1,402 @@
|
||||
import { useEffect, useMemo, useState } from "react";
|
||||
import { ArrowLeft, CircleStop, Download, Link as LinkIcon, Plus, Save, Trash2, Upload } from "lucide-react";
|
||||
import * as api from "./api";
|
||||
import { useToast } from "./ToastContext";
|
||||
import { driveKindAbbr, generationStateClass, generationStateLabel } from "./drive/constants";
|
||||
import { SpiderIcon } from "./icons/SpiderIcon";
|
||||
|
||||
/**
 * Editable draft of one crawler as held by the crawler admin page.
 * Every field is a string because the values are bound to text inputs and
 * sent to the API as strings.
 */
type CrawlerForm = {
  /** Crawler ID; the input is disabled once an existing crawler is selected. */
  id: string;
  /** Display name. */
  name: string;
  /** Built-in preset name ("spider91"), or "" for a custom script. */
  builtin: string;
  /** Path of the imported script as returned by the import API. */
  scriptPath: string;
  /** Python interpreter used to run the script. */
  pythonPath: string;
  /** Number of new videos to add per run, kept as entered. */
  targetNew: string;
  /** Proxy address for the crawler. */
  proxy: string;
  /** Extra crawler configuration as a JSON string. */
  configJson: string;
};
|
||||
|
||||
/** Blank draft used for a new custom crawler and whenever the form is reset. */
const emptyForm: CrawlerForm = {
  // Identity
  id: "",
  name: "",
  builtin: "",
  // Script execution
  scriptPath: "",
  pythonPath: "python3",
  // Run parameters
  targetNew: "10",
  proxy: "",
  configJson: "",
};
|
||||
|
||||
/**
 * Admin "爬虫管理" page.
 *
 * Renders either the list of configured crawlers or a detail view. The detail
 * view is used both to create a crawler (custom script or the built-in
 * spider91 preset) and to edit, run, stop or delete an existing one.
 */
export function CrawlersPage() {
  const [list, setList] = useState<api.AdminCrawler[]>([]);
  const [selectedId, setSelectedId] = useState("");
  const [form, setForm] = useState<CrawlerForm>(emptyForm);
  const [loading, setLoading] = useState(true);
  const [saving, setSaving] = useState(false);
  const [runningId, setRunningId] = useState("");
  const [stoppingId, setStoppingId] = useState("");
  const [scriptURL, setScriptURL] = useState("");
  const [importingScript, setImportingScript] = useState(false);
  const [mode, setMode] = useState<"list" | "detail">("list");
  const { show } = useToast();

  // The crawler currently being edited, or null while creating a new one.
  const selected = useMemo(() => {
    const match = list.find((item) => item.id === selectedId);
    return match ?? null;
  }, [list, selectedId]);

  // Show an error toast, preferring the thrown Error's own message.
  function reportError(err: unknown, fallback: string) {
    show(err instanceof Error ? err.message : fallback, "error");
  }

  // Reload the crawler list from the API.
  async function refresh() {
    setLoading(true);
    try {
      setList(await api.listCrawlers());
    } catch (err) {
      reportError(err, "加载爬虫失败");
    } finally {
      setLoading(false);
    }
  }

  // Initial load on mount.
  useEffect(() => {
    refresh();
  }, []);

  // Open the detail view pre-filled from an existing crawler.
  function selectCrawler(crawler: api.AdminCrawler) {
    const isSpider91 = crawler.builtin === "spider91" || crawler.kind === "spider91";
    setSelectedId(crawler.id);
    setMode("detail");
    setForm({
      id: crawler.id,
      name: crawler.name,
      builtin: crawler.builtin ?? "",
      scriptPath: crawler.scriptPath ?? "",
      pythonPath: crawler.pythonPath || "python3",
      targetNew: crawler.targetNew || (isSpider91 ? "15" : "10"),
      proxy: crawler.proxy ?? "",
      configJson: crawler.configJson ?? "",
    });
  }

  // Clear the selection and the pending script URL, load the given draft and
  // switch to the given view.
  function resetDraft(nextForm: CrawlerForm, nextMode: "list" | "detail") {
    setSelectedId("");
    setForm(nextForm);
    setScriptURL("");
    setMode(nextMode);
  }

  function createCustom() {
    resetDraft(emptyForm, "detail");
  }

  function createSpider91() {
    resetDraft(
      {
        ...emptyForm,
        id: "spider91",
        name: "91 爬虫",
        builtin: "spider91",
        scriptPath: "",
        targetNew: "15",
      },
      "detail"
    );
  }

  function backToList() {
    resetDraft(emptyForm, "list");
  }

  // Update a single field of the draft.
  function updateField<K extends keyof CrawlerForm>(key: K, value: CrawlerForm[K]) {
    setForm((prev) => ({ ...prev, [key]: value }));
  }

  // Validate the draft, persist it, then return to the refreshed list.
  async function save() {
    const id = form.id.trim();
    const name = form.name.trim();
    if (!id || !name) {
      show("请填写爬虫 ID 和名称", "error");
      return;
    }
    // Custom crawlers need an imported script; built-in presets do not.
    if (!form.builtin && !form.scriptPath.trim()) {
      show("请先导入爬虫脚本", "error");
      return;
    }

    const payload: api.UpsertCrawlerInput = {
      id,
      name,
      builtin: form.builtin,
      scriptPath: form.scriptPath.trim(),
      pythonPath: form.pythonPath.trim(),
      targetNew: form.targetNew.trim(),
      proxy: form.proxy.trim(),
      configJson: form.configJson.trim(),
    };

    setSaving(true);
    try {
      const resp = await api.upsertCrawler(payload);
      if (resp.warning) {
        show(`已保存,但初始化失败:${resp.warning}`, "error");
      } else {
        show("已保存", "success");
      }
      setSelectedId(id);
      await refresh();
      setMode("list");
    } catch (err) {
      reportError(err, "保存失败");
    } finally {
      setSaving(false);
    }
  }

  // Shared import flow: run the loader, store the returned script path, run
  // the optional follow-up, and toast the outcome.
  async function runScriptImport(
    load: () => Promise<api.ImportCrawlerScriptResult>,
    onImported?: () => void
  ) {
    setImportingScript(true);
    try {
      const resp = await load();
      updateField("scriptPath", resp.scriptPath);
      onImported?.();
      show("脚本已导入", "success");
    } catch (err) {
      reportError(err, "导入失败");
    } finally {
      setImportingScript(false);
    }
  }

  async function importScriptFile(file: File | null | undefined) {
    if (!file) return;
    await runScriptImport(() => api.importCrawlerScriptFile(file));
  }

  async function importScriptURL() {
    const url = scriptURL.trim();
    if (!url) {
      show("请填写脚本链接", "error");
      return;
    }
    await runScriptImport(
      () => api.importCrawlerScriptURL(url),
      () => setScriptURL("")
    );
  }

  // Trigger a crawl; the list is refreshed only when the run was accepted.
  async function run(crawler: api.AdminCrawler) {
    setRunningId(crawler.id);
    try {
      const resp = await api.runCrawler(crawler.id);
      if (!resp.accepted) {
        show(resp.message || "当前爬虫有正在进行的任务", "info");
        return;
      }
      show("已触发抓取任务", "success");
      await refresh();
    } catch (err) {
      reportError(err, "触发失败");
    } finally {
      setRunningId("");
    }
  }

  async function stop(crawler: api.AdminCrawler) {
    setStoppingId(crawler.id);
    try {
      const resp = await api.stopCrawlerTasks(crawler.id);
      show(resp.stopped ? "已请求停止任务" : "当前没有可停止任务", "info");
      await refresh();
    } catch (err) {
      reportError(err, "停止失败");
    } finally {
      setStoppingId("");
    }
  }

  // Delete a crawler (and the videos it imported) after confirmation.
  async function remove(crawler: api.AdminCrawler) {
    const confirmed = window.confirm(`删除爬虫 ${crawler.name} 并清理它导入的视频?`);
    if (!confirmed) return;
    try {
      const resp = await api.deleteCrawler(crawler.id);
      show(`已删除,并清理 ${resp.deletedVideos ?? 0} 个视频`, "success");
      setSelectedId("");
      setForm(emptyForm);
      setMode("list");
      await refresh();
    } catch (err) {
      reportError(err, "删除失败");
    }
  }

  // One clickable row of the crawler list.
  function renderTeaser(crawler: api.AdminCrawler) {
    const tone = crawler.status === "ok" ? "ok" : crawler.status === "error" ? "error" : "pending";
    const toneLabel = crawler.status === "ok" ? "已就绪" : crawler.status === "error" ? "错误" : "未连接";
    return (
      <button
        key={crawler.id}
        type="button"
        className={`admin-drive-teaser ${crawler.id === selectedId ? "is-active" : ""}`}
        onClick={() => selectCrawler(crawler)}
      >
        <span className="admin-drive-teaser__name">
          <span className="admin-drive-card__brand-icon" data-kind={crawler.builtin || crawler.kind}>
            {crawler.builtin === "spider91" ? "91" : driveKindAbbr(crawler.kind)}
          </span>
          {crawler.name}
        </span>
        <span className={`admin-status is-${tone}`}>
          {toneLabel}
        </span>
      </button>
    );
  }

  // List view: loading indicator, empty state, or the configured crawlers.
  function renderList() {
    return (
      <div className="admin-card admin-crawler-list">
        <header className="admin-card__title">
          <SpiderIcon size={16} /> 已配置爬虫
        </header>
        {loading ? (
          <div className="admin-loading">加载中...</div>
        ) : list.length === 0 ? (
          <div className="admin-empty">暂无爬虫</div>
        ) : (
          <div className="admin-drive-teasers">
            {list.map(renderTeaser)}
          </div>
        )}
      </div>
    );
  }

  // Script import row; rendered only for custom (non built-in) crawlers.
  function renderImportRow() {
    return (
      <div className="admin-form__row">
        <label htmlFor="crawler-script-url">导入脚本</label>
        <div className="admin-crawler-import">
          <input
            id="crawler-script-file"
            className="admin-crawler-import__file"
            type="file"
            accept=".py,text/x-python"
            disabled={importingScript}
            onChange={(e) => {
              importScriptFile(e.target.files?.[0]);
              // Reset the input so picking the same file again fires onChange.
              e.currentTarget.value = "";
            }}
          />
          <label className="admin-btn" htmlFor="crawler-script-file" aria-disabled={importingScript}>
            <Upload size={13} /> 上传文件
          </label>
          <input
            id="crawler-script-url"
            value={scriptURL}
            onChange={(e) => setScriptURL(e.target.value)}
            placeholder="https://example.com/crawler.py"
            disabled={importingScript}
          />
          <button className="admin-btn" type="button" onClick={importScriptURL} disabled={importingScript}>
            <LinkIcon size={13} /> {importingScript ? "导入中..." : "链接导入"}
          </button>
        </div>
        {form.scriptPath && <div className="admin-form__help">脚本已导入</div>}
      </div>
    );
  }

  // Detail view: the form card plus, for an existing crawler, a status card.
  function renderDetail() {
    return (
      <div className="admin-crawler-detail">
        <div className="admin-card">
          <header className="admin-card__title">
            <SpiderIcon size={16} /> {selected ? "爬虫配置" : "添加爬虫"}
          </header>
          <div className="admin-form">
            {!selected && (
              <div className="admin-crawler-presets">
                <button className={`admin-btn ${form.builtin === "" ? "is-primary" : ""}`} type="button" onClick={createCustom}>
                  <Plus size={13} /> 自定义脚本
                </button>
                <button className={`admin-btn ${form.builtin === "spider91" ? "is-primary" : ""}`} type="button" onClick={createSpider91}>
                  <SpiderIcon size={13} /> 内置 91
                </button>
              </div>
            )}
            <div className="admin-form__row">
              <label htmlFor="crawler-id">爬虫 ID *</label>
              <input id="crawler-id" value={form.id} onChange={(e) => updateField("id", e.target.value)} disabled={!!selected} />
            </div>
            <div className="admin-form__row">
              <label htmlFor="crawler-name">名称 *</label>
              <input id="crawler-name" value={form.name} onChange={(e) => updateField("name", e.target.value)} />
            </div>
            {!form.builtin && renderImportRow()}
            <div className="admin-form__row">
              <label htmlFor="crawler-target">每次补充新视频数</label>
              <input id="crawler-target" value={form.targetNew} onChange={(e) => updateField("targetNew", e.target.value)} placeholder="10" />
            </div>
            <div className="admin-form__row">
              <label htmlFor="crawler-proxy">代理地址</label>
              <input id="crawler-proxy" value={form.proxy} onChange={(e) => updateField("proxy", e.target.value)} placeholder="http://127.0.0.1:7890" />
            </div>
            <div className="admin-detail-actions">
              <button className="admin-btn is-primary" onClick={save} disabled={saving}>
                <Save size={13} /> {saving ? "保存中..." : "保存"}
              </button>
              {selected && (
                <>
                  <button className="admin-btn" onClick={() => run(selected)} disabled={runningId === selected.id}>
                    <Download size={13} /> {runningId === selected.id ? "触发中..." : "立即抓取"}
                  </button>
                  <button className="admin-btn is-stop" onClick={() => stop(selected)} disabled={stoppingId === selected.id}>
                    <CircleStop size={13} /> {stoppingId === selected.id ? "停止中..." : "停止任务"}
                  </button>
                  <button className="admin-btn is-danger" onClick={() => remove(selected)}>
                    <Trash2 size={13} /> 删除
                  </button>
                </>
              )}
            </div>
          </div>
        </div>

        {selected && (
          <div className="admin-card admin-crawler-status">
            <header className="admin-card__title">
              <Download size={16} /> 状态
            </header>
            <div className="admin-gen-columns">
              <CrawlerStatus label="抓取" status={selected.scanGenerationStatus} />
              <CrawlerStatus label="封面" status={selected.thumbnailGenerationStatus} />
              <CrawlerStatus label="预览视频" status={selected.previewGenerationStatus} />
              <CrawlerStatus label="视频指纹" status={selected.fingerprintGenerationStatus} />
            </div>
            {selected.lastError && <div className="admin-detail-error">{selected.lastError}</div>}
          </div>
        )}
      </div>
    );
  }

  return (
    <section className="admin-page">
      <header className="admin-page__header">
        <div>
          <h1 className="admin-page__title">爬虫管理</h1>
        </div>
        <div className="admin-detail-actions-inline">
          {mode === "list" ? (
            <button className="admin-btn is-primary" onClick={createCustom}>
              <Plus size={14} /> 添加爬虫
            </button>
          ) : (
            <button className="admin-btn" onClick={backToList}>
              <ArrowLeft size={14} /> 返回列表
            </button>
          )}
        </div>
      </header>

      {mode === "list" ? renderList() : renderDetail()}
    </section>
  );
}
|
||||
|
||||
/**
 * One column of the crawler status card: the column label, a state pill and,
 * for the crawl ("抓取") column only, the scanned/added counters.
 * A missing status is treated as the "idle" state with zero counts.
 */
function CrawlerStatus({ label, status }: { label: string; status?: api.DriveGenerationStatus }) {
  const state = status?.state || "idle";
  const isCrawlColumn = label === "抓取";
  // The crawl column shows its own wording while a scan is in progress.
  const stateText = isCrawlColumn && state === "scanning" ? "抓取中" : generationStateLabel(state);
  const pillClass = `admin-status admin-generation-state is-${generationStateClass(state)}`;

  return (
    <div className="admin-gen-col">
      <div className="admin-gen-col__head">
        <span className="admin-gen-col__label">{label}</span>
        <span className={pillClass}>
          {stateText}
        </span>
      </div>
      {isCrawlColumn && (
        <div className="admin-gen-col__counts admin-gen-col__counts--scan">
          <div className="admin-gen-col__count">
            <span>已抓取</span>
            <strong>{status?.scannedCount ?? 0}</strong>
          </div>
          <div className="admin-gen-col__count">
            <span>预计新增</span>
            <strong>{status?.addedCount ?? 0}</strong>
          </div>
        </div>
      )}
    </div>
  );
}
|
||||
+18
-15
@@ -325,6 +325,10 @@ export function DrivesPage() {
|
||||
}
|
||||
|
||||
async function handleRescan(d: api.AdminDrive) {
|
||||
if (d.kind === "spider91") {
|
||||
show("91Spider 不再支持通过网盘运行,请到爬虫管理添加爬虫脚本", "info");
|
||||
return;
|
||||
}
|
||||
if (nightlyBusy) {
|
||||
show(nightlyBusyText(nightlyStatus) || NIGHTLY_BUSY_MESSAGE, "info");
|
||||
return;
|
||||
@@ -345,11 +349,7 @@ export function DrivesPage() {
|
||||
refreshDriveList();
|
||||
return;
|
||||
}
|
||||
if (d.kind === "spider91") {
|
||||
show("已触发抓取任务,需要 2-4 分钟,可稍后刷新视频列表查看", "success");
|
||||
} else {
|
||||
show("已触发扫描,可稍后刷新视频列表查看", "success");
|
||||
}
|
||||
show("已触发扫描,可稍后刷新视频列表查看", "success");
|
||||
refreshDriveList();
|
||||
} catch (e) {
|
||||
show(e instanceof Error ? e.message : "触发失败", "error");
|
||||
@@ -550,10 +550,8 @@ export function DrivesPage() {
|
||||
)}
|
||||
{d.kind === "spider91" && (
|
||||
<div className="admin-detail-row">
|
||||
<span className="admin-detail-label">上次抓取时间</span>
|
||||
<span className="admin-detail-value">
|
||||
{d.lastCrawlAt ? new Date(d.lastCrawlAt * 1000).toLocaleString() : "尚未抓取"}
|
||||
</span>
|
||||
<span className="admin-detail-label">配置状态</span>
|
||||
<span className="admin-detail-value">已废弃,请到爬虫管理添加</span>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
@@ -567,9 +565,12 @@ export function DrivesPage() {
|
||||
type="button"
|
||||
className="admin-btn is-primary"
|
||||
onClick={() => handleRescan(d)}
|
||||
aria-disabled={nightlyBusy || isDriveBusy(d) || !!scanningDriveIds[d.id]}
|
||||
disabled={d.kind === "spider91"}
|
||||
aria-disabled={d.kind === "spider91" || nightlyBusy || isDriveBusy(d) || !!scanningDriveIds[d.id]}
|
||||
title={
|
||||
nightlyBusy
|
||||
d.kind === "spider91"
|
||||
? "91Spider 不再支持通过网盘运行,请到爬虫管理添加爬虫脚本"
|
||||
: nightlyBusy
|
||||
? nightlyBusyText(nightlyStatus) || NIGHTLY_BUSY_MESSAGE
|
||||
: isDriveBusy(d) || scanningDriveIds[d.id]
|
||||
? DRIVE_BUSY_MESSAGE
|
||||
@@ -579,7 +580,7 @@ export function DrivesPage() {
|
||||
{d.kind === "spider91" ? (
|
||||
<>
|
||||
<Download size={13} className={scanningDriveIds[d.id] ? "admin-spin" : undefined} />
|
||||
{scanningDriveIds[d.id] ? "触发中..." : "立即抓取"}
|
||||
已废弃
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
@@ -599,9 +600,11 @@ export function DrivesPage() {
|
||||
{stoppingDriveId === d.id ? "停止中..." : "停止所有任务"}
|
||||
</button>
|
||||
</div>
|
||||
<button type="button" className="admin-btn" onClick={() => openEdit(d)}>
|
||||
{d.kind === "spider91" ? "编辑配置" : "编辑配置凭证"}
|
||||
</button>
|
||||
{d.kind !== "spider91" && (
|
||||
<button type="button" className="admin-btn" onClick={() => openEdit(d)}>
|
||||
编辑配置凭证
|
||||
</button>
|
||||
)}
|
||||
<button type="button" className="admin-btn is-danger admin-detail-actions__danger" onClick={() => setDeleteTarget(d)}>
|
||||
<Trash2 size={13} /> 删除网盘
|
||||
</button>
|
||||
|
||||
+98
-4
@@ -12,13 +12,14 @@ async function request<T>(
|
||||
path: string,
|
||||
init: RequestInit = {}
|
||||
): Promise<T> {
|
||||
const headers = new Headers(init.headers ?? {});
|
||||
if (!(init.body instanceof FormData) && !headers.has("Content-Type")) {
|
||||
headers.set("Content-Type", "application/json");
|
||||
}
|
||||
const res = await fetch(BASE + path, {
|
||||
credentials: "include",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
...(init.headers ?? {}),
|
||||
},
|
||||
...init,
|
||||
headers,
|
||||
});
|
||||
if (res.status === 401) {
|
||||
throw new UnauthorizedError();
|
||||
@@ -188,6 +189,99 @@ export function stopDriveTasks(id: string) {
|
||||
);
|
||||
}
|
||||
|
||||
// ---------- Crawlers ----------
|
||||
|
||||
/** A configured crawler as returned by the admin crawler list endpoint. */
export type AdminCrawler = {
  // --- Identity ---
  id: string;
  name: string;
  kind: "scriptcrawler" | "spider91";
  /** Built-in preset name (the admin page checks for "spider91"); absent for custom scripts. */
  builtin?: string;

  // --- Health ---
  /** The admin page distinguishes "ok" and "error"; any other value is shown as not connected. */
  status: string;
  lastError?: string;

  // --- Configuration ---
  scriptPath: string;
  pythonPath?: string;
  proxy?: string;
  targetNew?: string;
  configJson?: string;
  // NOTE(review): presumably a Unix timestamp in seconds — confirm against the backend.
  lastCrawlAt?: number;

  // --- Per-pipeline generation status ---
  scanGenerationStatus?: DriveGenerationStatus;
  thumbnailGenerationStatus?: DriveGenerationStatus;
  previewGenerationStatus?: DriveGenerationStatus;
  fingerprintGenerationStatus?: DriveGenerationStatus;

  // --- Counters ---
  thumbnailReadyCount: number;
  thumbnailPendingCount: number;
  thumbnailFailedCount: number;
  teaserReadyCount: number;
  teaserPendingCount: number;
  teaserFailedCount: number;
  fingerprintReadyCount: number;
  fingerprintPendingCount: number;
  fingerprintFailedCount: number;
};
|
||||
|
||||
/** Request body for creating or updating a crawler via `upsertCrawler`. */
export type UpsertCrawlerInput = {
  // Required
  id: string;
  name: string;
  scriptPath: string;

  // Optional
  /** Built-in preset name; the admin page sends "" for a custom script. */
  builtin?: string;
  pythonPath?: string;
  proxy?: string;
  targetNew?: string;
  configJson?: string;
};
|
||||
|
||||
/** Response of both script import endpoints (file upload and URL import). */
export type ImportCrawlerScriptResult = {
  /** Script path reported by the server; the admin page stores it as the crawler's scriptPath. */
  scriptPath: string;
};
|
||||
|
||||
/** Fetch all configured crawlers from `/crawlers` using the default request method. */
export function listCrawlers(): Promise<AdminCrawler[]> {
  const path = "/crawlers";
  return request<AdminCrawler[]>(path);
}
|
||||
|
||||
/**
 * Create or update a crawler by POSTing the JSON-encoded input to `/crawlers`.
 * The response may carry a `warning`, which the admin page reports as
 * "saved, but initialisation failed".
 */
export function upsertCrawler(body: UpsertCrawlerInput): Promise<{ ok: boolean; warning?: string }> {
  const init: RequestInit = {
    method: "POST",
    body: JSON.stringify(body),
  };
  return request<{ ok: boolean; warning?: string }>("/crawlers", init);
}
|
||||
|
||||
/**
 * Upload a local script file to `/crawlers/import-file` as multipart form
 * data under the field name "file".
 */
export function importCrawlerScriptFile(file: File): Promise<ImportCrawlerScriptResult> {
  const payload = new FormData();
  payload.append("file", file);

  const init: RequestInit = { method: "POST", body: payload };
  return request<ImportCrawlerScriptResult>("/crawlers/import-file", init);
}
|
||||
|
||||
/** Ask the server to import a crawler script from `url` via POST `/crawlers/import-url`. */
export function importCrawlerScriptURL(url: string): Promise<ImportCrawlerScriptResult> {
  const init: RequestInit = {
    method: "POST",
    body: JSON.stringify({ url }),
  };
  return request<ImportCrawlerScriptResult>("/crawlers/import-url", init);
}
|
||||
|
||||
/**
 * Trigger a crawl for the given crawler via POST `/crawlers/{id}/run`.
 * `accepted` is false when the run was not started; `message` may explain why.
 */
export function runCrawler(
  id: string
): Promise<{ ok: boolean; accepted: boolean; message?: string; status?: NightlyJobStatus }> {
  // The ID is user-supplied, so it is escaped before being placed in the path.
  const path = `/crawlers/${encodeURIComponent(id)}/run`;
  return request<{ ok: boolean; accepted: boolean; message?: string; status?: NightlyJobStatus }>(
    path,
    { method: "POST" }
  );
}
|
||||
|
||||
/**
 * Request that the given crawler's tasks be stopped via POST
 * `/crawlers/{id}/tasks/stop`. `stopped` is false when there was nothing to stop.
 */
export function stopCrawlerTasks(id: string): Promise<{ ok: boolean; stopped: boolean }> {
  // The ID is user-supplied, so it is escaped before being placed in the path.
  const path = `/crawlers/${encodeURIComponent(id)}/tasks/stop`;
  return request<{ ok: boolean; stopped: boolean }>(path, { method: "POST" });
}
|
||||
|
||||
/**
 * Delete a crawler via DELETE `/crawlers/{id}`. The request always sends
 * `deleteVideos: true`; the response reports the deleted count as `deletedVideos`.
 */
export function deleteCrawler(id: string): Promise<{ ok: boolean; deletedVideos: number }> {
  const path = `/crawlers/${encodeURIComponent(id)}`;
  const init: RequestInit = {
    method: "DELETE",
    body: JSON.stringify({ deleteVideos: true }),
  };
  return request<{ ok: boolean; deletedVideos: number }>(path, init);
}
|
||||
|
||||
export type P123QRSession = {
|
||||
loginUuid: string;
|
||||
uniID: string;
|
||||
|
||||
@@ -101,13 +101,17 @@ export function StatusTag({
|
||||
error?: string;
|
||||
hasCred: boolean;
|
||||
}) {
|
||||
if (kind === "spider91") {
|
||||
return (
|
||||
<span className="admin-status is-error" title={error || "请到爬虫管理添加爬虫脚本"}>
|
||||
已废弃
|
||||
</span>
|
||||
);
|
||||
}
|
||||
if (kind !== "spider91" && !hasCred) {
|
||||
return <span className="admin-status is-pending">未配置凭证</span>;
|
||||
}
|
||||
if (status === "ok") {
|
||||
if (kind === "spider91") {
|
||||
return <span className="admin-status is-ok">已就绪</span>;
|
||||
}
|
||||
return <span className="admin-status is-ok">已连接</span>;
|
||||
}
|
||||
if (status === "error")
|
||||
@@ -205,7 +209,7 @@ export function DriveGenerationPanel({
|
||||
|
||||
<div className="admin-gen-columns">
|
||||
<DriveGenCol
|
||||
label={d.kind === "spider91" ? "抓取" : "扫盘"}
|
||||
label={d.kind === "spider91" ? "已废弃" : "扫盘"}
|
||||
status={d.scanGenerationStatus}
|
||||
showCounts={false}
|
||||
/>
|
||||
|
||||
@@ -26,7 +26,6 @@ const DRIVE_OPTIONS: DriveOption[] = [
|
||||
{ kind: "onedrive", label: "OneDrive", abbr: "OD", desc: "302直链,微软网盘" },
|
||||
{ kind: "googledrive", label: "Google Drive", abbr: "GD", desc: "服务器中转模式" },
|
||||
{ kind: "localstorage", label: "本地存储", abbr: "Lo", desc: "本机文件目录" },
|
||||
{ kind: "spider91", label: "91 爬虫", abbr: "91", desc: "自动抓取热门视频" },
|
||||
{ kind: "quark", label: "夸克网盘", abbr: "Qk", desc: "302直链" },
|
||||
{ kind: "wopan", label: "联通沃盘", abbr: "Wo", desc: "302直链" },
|
||||
];
|
||||
|
||||
@@ -163,7 +163,7 @@ export function credentialHelp(kind: Kind, isEdit: boolean): string {
|
||||
case "localstorage":
|
||||
return `填写服务器可访问的本地目录绝对路径,例如 /mnt/videos。系统会扫描该目录及子目录中的视频文件和 .strm 文件;.strm 可指向 HTTP/HTTPS 直链,或指向本地存储根目录内的真实视频路径。Docker 部署时请填写容器内路径。${note}`;
|
||||
case "spider91":
|
||||
return "91 爬虫会把定时抓取到的视频和封面先保存到本机,并作为一个视频来源接入站点;可按服务器网络情况单独配置代理。后续流水线会把较早的视频上传到你选择的 115 / PikPak / OneDrive 目标盘。";
|
||||
return "91Spider 不再支持通过网盘添加或编辑。请到后台爬虫管理页面添加内置 91 或自定义爬虫脚本。";
|
||||
default:
|
||||
return "";
|
||||
}
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
/** Props accepted by `SpiderIcon`; both are optional. */
type SpiderIconProps = {
  // Used for both the width and the height of the rendered <svg>.
  size?: number;
  // Forwarded to the <svg> element unchanged.
  className?: string;
};
|
||||
|
||||
/**
 * Stroke-only spider glyph drawn on a 24x24 viewBox.
 *
 * The <svg> is rendered with `aria-hidden="true"` and `focusable="false"`,
 * and takes its stroke colour from `currentColor`.
 *
 * @param props.size      width and height of the icon; defaults to 16.
 * @param props.className optional class name placed on the <svg>.
 */
export function SpiderIcon(props: SpiderIconProps) {
  const { size = 16, className } = props;

  // Straight strokes, listed in the order they are painted.
  const strokes = [
    "M12 7v3",
    "M9 5.5 7.5 3",
    "M15 5.5 16.5 3",
    "M9 10.5 4.5 8",
    "M15 10.5 19.5 8",
    "M8.5 13.5 3 13",
    "M15.5 13.5 21 13",
    "M9 16 5 20",
    "M15 16 19 20",
  ];

  return (
    <svg
      className={className}
      width={size}
      height={size}
      viewBox="0 0 24 24"
      fill="none"
      stroke="currentColor"
      strokeWidth="2"
      strokeLinecap="round"
      strokeLinejoin="round"
      aria-hidden="true"
      focusable="false"
    >
      {strokes.map((d) => (
        <path key={d} d={d} />
      ))}
      {/* The two closed shapes are painted after the strokes. */}
      <ellipse cx="12" cy="14" rx="4" ry="5" />
      <circle cx="12" cy="8" r="2.5" />
    </svg>
  );
}
|
||||
@@ -308,6 +308,33 @@
|
||||
margin-bottom: var(--space-3);
|
||||
}
|
||||
|
||||
/* ----- Crawler management ----- */

/* Two-column grid: the first column is the narrower one (0.8fr vs 1.2fr). */
.admin-crawler-layout {
  display: grid;
  align-items: start;
  gap: var(--space-4);
  grid-template-columns: minmax(260px, 0.8fr) minmax(360px, 1.2fr);
}

/* Stacked grid with the same gap as the outer layout. */
.admin-crawler-detail {
  gap: var(--space-4);
  display: grid;
}

/* Spans every column of its parent grid. */
.admin-crawler-status {
  grid-column-start: 1;
  grid-column-end: -1;
}

/* Drive teaser cards fill the full width when used inside the crawler list. */
.admin-crawler-list .admin-drive-teaser {
  width: 100%;
}

/* Wrapping row of preset controls. */
.admin-crawler-presets {
  display: flex;
  flex-wrap: wrap;
  margin-bottom: var(--space-3);
  gap: var(--space-2);
}
|
||||
|
||||
/* ----- Storage summary ----- */
|
||||
.admin-storage-summary {
|
||||
display: grid;
|
||||
@@ -446,6 +473,21 @@
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
/* Script import row: auto-sized cell, flexible middle cell (min 180px), auto-sized cell. */
.admin-crawler-import {
  display: grid;
  align-items: center;
  gap: var(--space-2);
  grid-template-columns: auto minmax(180px, 1fr) auto;
}

/* Shrinks the element to a 1px, fully transparent box that ignores pointer events. */
.admin-crawler-import__file {
  position: absolute;
  height: 1px;
  width: 1px;
  pointer-events: none;
  opacity: 0;
}
|
||||
|
||||
.admin-form__help {
|
||||
font-size: var(--font-xs);
|
||||
color: var(--text-faint);
|
||||
@@ -1548,6 +1590,15 @@
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
/* Responsive override: collapse the import row to a single column. */
.admin-crawler-import {
  grid-template-columns: 1fr;
}

/* Buttons inside the import row become full-width with centred content. */
.admin-crawler-import .admin-btn {
  width: 100%;
  justify-content: center;
}
|
||||
|
||||
.admin-p123-qr__body {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
@@ -3332,6 +3383,7 @@
|
||||
}
|
||||
|
||||
@media (max-width: 900px) {
|
||||
.admin-crawler-layout,
|
||||
.admin-tags-layout {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
@@ -10,6 +10,18 @@ const driveComponentsSource = readFileSync(
|
||||
new URL("../src/admin/drive/DriveComponents.tsx", import.meta.url),
|
||||
"utf8"
|
||||
);
|
||||
// Raw UTF-8 source text of the crawler page, the admin layout and the app
// entry; the tests below assert on this text with regular expressions.
const crawlerPageSource = readFileSync(new URL("../src/admin/CrawlersPage.tsx", import.meta.url), "utf8");
const adminLayoutSource = readFileSync(new URL("../src/admin/AdminLayout.tsx", import.meta.url), "utf8");
const appSource = readFileSync(new URL("../src/App.tsx", import.meta.url), "utf8");
|
||||
const spider91UploadTargetSource = readFileSync(
|
||||
new URL("../src/admin/drive/Spider91UploadTargetField.tsx", import.meta.url),
|
||||
"utf8"
|
||||
@@ -51,14 +63,15 @@ function assertDriveTypeOption(value: string, label: string) {
|
||||
);
|
||||
}
|
||||
|
||||
test("spider91 drive form does not expose advanced crawler credentials", () => {
|
||||
assert.match(combinedSource, /key: "proxy"/);
|
||||
assert.match(combinedSource, /label: "代理地址(可选)"/);
|
||||
assert.match(combinedSource, /支持 http:\/\/、https:\/\/、socks5:\/\/、socks5h:\/\/代理/);
|
||||
assert.doesNotMatch(combinedSource, /target_new/);
|
||||
assert.doesNotMatch(combinedSource, /crawl_hour/);
|
||||
assert.doesNotMatch(combinedSource, /python_path/);
|
||||
assert.doesNotMatch(combinedSource, /script_path/);
|
||||
// Neither crawler kind may appear among the selectable storage drive types.
test("crawler sources are not selectable as storage drives", () => {
  // True when the drive type selector currently offers `kind`.
  const isOffered = (kind: string) => driveTypeOptions().some((option) => option.value === kind);

  assert.ok(!isOffered("spider91"), "spider91 should not be a storage drive option");
  assert.ok(!isOffered("scriptcrawler"), "scriptcrawler should not be a storage drive option");
});
|
||||
|
||||
test("spider91 upload target uses explicit local-save option instead of auto target", () => {
|
||||
@@ -185,12 +198,45 @@ test("drive type selector keeps primary source order", () => {
|
||||
{ value: "onedrive", label: "OneDrive" },
|
||||
{ value: "googledrive", label: "Google Drive" },
|
||||
{ value: "localstorage", label: "本地存储" },
|
||||
{ value: "spider91", label: "91 爬虫" },
|
||||
{ value: "quark", label: "夸克网盘" },
|
||||
{ value: "wopan", label: "联通沃盘" },
|
||||
]);
|
||||
});
|
||||
|
||||
// Source-text checks that crawler management has its own admin route and page,
// and that the API client exposes the crawler endpoints.
test("crawler management is a separate admin section", () => {
  // Navigation entry in the admin layout.
  for (const pattern of [
    /to="\/admin\/crawlers"/,
    /> 爬虫管理/,
    /SpiderIcon size=\{16\} \/> 爬虫管理/,
  ]) {
    assert.match(adminLayoutSource, pattern);
  }

  // Route registration.
  assert.match(appSource, /path="crawlers" element=\{<CrawlersPage \/>/);

  // Text the crawler page must contain.
  const requiredInPage = [
    /export function CrawlersPage/,
    /SpiderIcon/,
    /添加爬虫/,
    /返回列表/,
    /setMode\("detail"\)/,
    /setMode\("list"\)/,
    /api\.listCrawlers/,
    /api\.upsertCrawler/,
    /api\.runCrawler/,
    /api\.stopCrawlerTasks/,
    /api\.deleteCrawler/,
    /api\.importCrawlerScriptFile/,
    /api\.importCrawlerScriptURL/,
    /type="file"/,
    /链接导入/,
  ];
  for (const pattern of requiredInPage) {
    assert.match(crawlerPageSource, pattern);
  }

  // Text the crawler page must not contain.
  const forbiddenInPage = [/新建脚本/, /脚本路径/, /Python 解释器/, /自定义配置 JSON/, /Bot/];
  for (const pattern of forbiddenInPage) {
    assert.doesNotMatch(crawlerPageSource, pattern);
  }

  assert.match(crawlerPageSource, /builtin:\s*"spider91"/);

  // API client surface.
  for (const pattern of [
    /type AdminCrawler/,
    /"\/crawlers"/,
    /"\/crawlers\/import-file"/,
    /"\/crawlers\/import-url"/,
    /new FormData\(\)/,
  ]) {
    assert.match(apiSource, pattern);
  }

  // The drive form must not mention the script crawler kind.
  assert.doesNotMatch(driveFormSource, /scriptcrawler/);
});
|
||||
|
||||
test("drive cards use configured abbreviations and visible fallback icon colors", () => {
|
||||
assert.match(constantsSource, /googledrive:\s*"GD"/);
|
||||
assert.match(constantsSource, /function driveKindAbbr\(kind: string\)/);
|
||||
@@ -230,10 +276,9 @@ test("nightly scan duplicate trigger uses full-scan busy message", () => {
|
||||
});
|
||||
|
||||
test("drive generation panel shows scan or crawler status first", () => {
|
||||
assert.match(driveComponentsSource, /label=\{d\.kind === "spider91" \? "抓取" : "扫盘"\}/);
|
||||
assert.match(driveComponentsSource, /label=\{d\.kind === "spider91" \? "已废弃" : "扫盘"\}/);
|
||||
assert.match(driveComponentsSource, /status=\{d\.scanGenerationStatus\}/);
|
||||
assert.match(driveComponentsSource, /showCounts=\{false\}/);
|
||||
assert.match(driveComponentsSource, /label === "抓取" && state === "scanning" \? "抓取中"/);
|
||||
assert.match(driveComponentsSource, /status\?\.scannedCount/);
|
||||
assert.match(driveComponentsSource, /预计新增/);
|
||||
assert.match(apiSource, /scannedCount:\s*number/);
|
||||
@@ -241,6 +286,13 @@ test("drive generation panel shows scan or crawler status first", () => {
|
||||
assert.match(constantsSource, /if \(state === "scanning"\) return "扫盘中"/);
|
||||
});
|
||||
|
||||
// Source-text checks that the drives page and the constants module carry the
// spider91 deprecation strings and the disabled-state expression.
test("legacy spider91 storage is disabled in drive management", () => {
  const requiredInDrivesPage = [
    /91Spider 不再支持通过网盘运行,请到爬虫管理添加爬虫脚本/,
    /disabled=\{d\.kind === "spider91"\}/,
    /已废弃,请到爬虫管理添加/,
  ];
  for (const pattern of requiredInDrivesPage) {
    assert.match(drivesPageSource, pattern);
  }

  assert.match(constantsSource, /91Spider 不再支持通过网盘添加或编辑/);
});
|
||||
|
||||
test("drive detail selection is stored in the URL history", () => {
|
||||
assert.match(drivesPageSource, /useSearchParams/);
|
||||
assert.match(drivesPageSource, /searchParams\.get\("drive"\)/);
|
||||
|
||||
Reference in New Issue
Block a user