From 96e423b952df7628205c89e91fa7be5c2cd918d4 Mon Sep 17 00:00:00 2001 From: nianzhibai <177086871+nianzhibai@users.noreply.github.com> Date: Thu, 11 Jun 2026 22:41:24 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=AE=8C=E5=96=84=E7=88=AC=E8=99=AB?= =?UTF-8?q?=E5=8E=BB=E9=87=8D=E3=80=81=E4=B8=8A=E4=BC=A0=E8=BF=9B=E5=BA=A6?= =?UTF-8?q?=E5=92=8C=E6=BA=90=E6=96=87=E4=BB=B6=E5=88=A0=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 为脚本爬虫增加候选预算、重复 source 记录和默认爬虫标签,避免重复视频占满目标新增数量。 新增爬虫上传迁移进度上报和管理页上传卡片,让每个爬虫可以展示本轮上传处理情况。 为视频删除增加可选删除云盘源文件能力,补齐播放页、管理页交互,并为多个网盘驱动实现 Remove 接口。 补充相关测试并更新爬虫协议文档。 --- .gitignore | 1 + backend/cmd/server/main.go | 196 +++++++++++++++-- backend/cmd/server/main_test.go | 87 +++++++- backend/internal/api/admin.go | 25 ++- backend/internal/api/admin_test.go | 57 +++++ backend/internal/catalog/catalog.go | 91 +++++++- backend/internal/catalog/schema.sql | 18 ++ backend/internal/drives/googledrive/driver.go | 13 ++ backend/internal/drives/iface.go | 6 + .../internal/drives/localstorage/driver.go | 35 +++ backend/internal/drives/localupload/driver.go | 26 +++ backend/internal/drives/onedrive/driver.go | 12 + backend/internal/drives/p115/driver.go | 18 ++ backend/internal/drives/p123/driver.go | 34 +++ backend/internal/drives/pikpak/driver.go | 14 ++ backend/internal/drives/quark/driver.go | 41 +++- .../internal/drives/scriptcrawler/crawler.go | 145 ++++++++---- .../drives/scriptcrawler/crawler_test.go | 170 +++++++++++++- .../internal/drives/scriptcrawler/driver.go | 41 ++++ backend/internal/drives/spider91/driver.go | 41 ++++ backend/internal/drives/wopan/driver.go | 18 ++ backend/internal/spider91migrate/migrator.go | 97 +++++++- docs/crawler-protocol.md | 16 +- src/admin/ConfirmModal.tsx | 4 + src/admin/CrawlersPage.tsx | 64 +++++- src/admin/VideosPage.tsx | 63 +++++- src/admin/api.ts | 10 +- src/admin/drive/constants.ts | 5 +- src/components/VideoActions.tsx | 23 +- src/data/videos.ts | 13 ++ src/pages/VideoDetailPage.tsx | 90 ++++++++ src/styles/admin.css | 44 ++++ src/styles/video-detail.css | 208 +++++++++++++++++- 33 files changed, 1620 insertions(+), 106 deletions(-) diff --git a/.gitignore b/.gitignore index 17e1957..af1c246 100644 --- a/.gitignore +++ b/.gitignore @@ -41,4 +41,5 @@ __pycache__/ /image003.jpg /image004.jpg /image005.png +/image006.png /image02.png diff --git a/backend/cmd/server/main.go b/backend/cmd/server/main.go index 206ff5b..38c8cbf 100644 --- a/backend/cmd/server/main.go +++ b/backend/cmd/server/main.go @@ -86,6 +86,7 @@ func main() { Registry: app.registry, GetTargetDriveID: func() string { return app.Spider91UploadDriveID() }, CommonThumbDir: app.commonThumbsDir(), + OnUploadProgress: app.updateCrawlerUploadProgress, }) // 初始化本地内置盘;外部云盘放到 HTTP 服务启动后异步挂载,避免上游 @@ -217,8 +218,8 @@ func main() { OnRegenFailedFingerprints: func(driveID string) { go app.regenFailedFingerprints(ctx, driveID) }, - OnDeleteVideo: func(reqCtx context.Context, videoID string) (api.DeleteVideoResult, error) { - return app.deleteVideo(reqCtx, videoID) + OnDeleteVideo: func(reqCtx context.Context, videoID string, deleteSource bool) (api.DeleteVideoResult, error) { + return app.deleteVideo(reqCtx, videoID, deleteSource) }, GetDriveGenerationStatuses: func() map[string]api.DriveGenerationStatuses { return app.driveGenerationStatuses() @@ -363,6 +364,10 @@ type App struct { // crawlerUploadRunning 去重"保存上传目标后检查本地未上传文件"的后台任务。 crawlerUploadMu sync.Mutex crawlerUploadRunning map[string]bool + + // uploadProgress 跟踪脚本爬虫迁移到云盘时的实时上传状态。 + uploadProgressMu sync.Mutex + uploadProgress map[string]driveUploadProgress } type driveScanProgress struct { @@ -370,6 +375,14 @@ type driveScanProgress struct { Added int } +type driveUploadProgress struct { + State string + CurrentTitle string + QueueLength int + DoneCount int + TotalCount int +} + type spider91MigrationRunner interface { RunOnce(ctx context.Context) error } @@ -522,6 +535,13 @@ func (a *App) driveGenerationStatuses() map[string]api.DriveGenerationStatuses { } a.scanQueueMu.Unlock() + a.uploadProgressMu.Lock() + uploadProgresses := make(map[string]driveUploadProgress, len(a.uploadProgress)) + for id, progress := range a.uploadProgress { + uploadProgresses[id] = progress + } + a.uploadProgressMu.Unlock() + a.mu.Lock() previewWorkers := make(map[string]*preview.Worker, len(a.workers)) for id, worker := range a.workers { @@ -537,7 +557,7 @@ func (a *App) driveGenerationStatuses() map[string]api.DriveGenerationStatuses { } a.mu.Unlock() - out := make(map[string]api.DriveGenerationStatuses, len(scanningDrives)+len(previewWorkers)+len(thumbWorkers)+len(fingerprintWorkers)) + out := make(map[string]api.DriveGenerationStatuses, len(scanningDrives)+len(previewWorkers)+len(thumbWorkers)+len(fingerprintWorkers)+len(uploadProgresses)) for id, running := range scanningDrives { if !running { continue @@ -566,9 +586,75 @@ func (a *App) driveGenerationStatuses() map[string]api.DriveGenerationStatuses { status.Fingerprint = generationStatusFromFingerprint(worker.Status()) out[id] = status } + for id, progress := range uploadProgresses { + state := progress.State + if state == "" { + state = "idle" + } + status := out[id] + status.Upload = api.GenerationStatus{ + State: state, + CurrentTitle: progress.CurrentTitle, + QueueLength: progress.QueueLength, + DoneCount: progress.DoneCount, + TotalCount: progress.TotalCount, + } + out[id] = status + } return out } +func (a *App) updateCrawlerUploadProgress(progress spider91migrate.UploadProgress) { + driveID := strings.TrimSpace(progress.DriveID) + if driveID == "" { + return + } + state := strings.TrimSpace(progress.State) + if state == "" { + state = "idle" + } + a.uploadProgressMu.Lock() + if a.uploadProgress == nil { + a.uploadProgress = make(map[string]driveUploadProgress) + } + if state == "idle" { + delete(a.uploadProgress, driveID) + a.uploadProgressMu.Unlock() + return + } + a.uploadProgress[driveID] = driveUploadProgress{ + State: state, + CurrentTitle: strings.TrimSpace(progress.CurrentTitle), + QueueLength: progress.QueueLength, + DoneCount: progress.DoneCount, + TotalCount: progress.TotalCount, + } + a.uploadProgressMu.Unlock() +} + +func (a *App) clearCrawlerUploadProgress(driveID string) bool { + driveID = strings.TrimSpace(driveID) + if driveID == "" { + return false + } + a.uploadProgressMu.Lock() + _, ok := a.uploadProgress[driveID] + delete(a.uploadProgress, driveID) + a.uploadProgressMu.Unlock() + return ok +} + +func (a *App) clearAllCrawlerUploadProgress() []string { + a.uploadProgressMu.Lock() + ids := make([]string, 0, len(a.uploadProgress)) + for id := range a.uploadProgress { + ids = append(ids, id) + } + a.uploadProgress = nil + a.uploadProgressMu.Unlock() + return ids +} + func generationStatusFromPreview(status preview.TaskStatus) api.GenerationStatus { state := status.State if state == "" { @@ -905,6 +991,7 @@ func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) { c := scriptcrawler.NewCrawler(scriptcrawler.CrawlerConfig{ Driver: drv, Catalog: a.cat, + CrawlerName: d.Name, SourceKind: sourceKind, PythonPath: pythonPath, ScriptPath: scriptPath, @@ -929,6 +1016,7 @@ func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) { a.scriptCrawlers[driveID] = c a.mu.Unlock() + a.ensureScriptCrawlerNameTag(driveID, sourceKind, d.Name) if sourceKind == spider91.Kind { a.ensureSpider91SourceTag(driveID) } @@ -959,6 +1047,24 @@ func (a *App) ensureSpider91SourceTag(driveID string) { }() } +func (a *App) ensureScriptCrawlerNameTag(driveID, sourceKind, crawlerName string) { + tagName := strings.TrimSpace(crawlerName) + if tagName == "" { + tagName = strings.TrimSpace(driveID) + } + if tagName == "" { + return + } + bgCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + go func() { + defer cancel() + prefix := scriptcrawler.BuildVideoIDForKind(sourceKind, driveID, "") + if _, err := a.cat.EnsureTagForVideoIDPrefix(bgCtx, prefix, tagName, nil, "legacy"); err != nil { + log.Printf("[scriptcrawler] drive=%s ensure crawler tag %q: %v", driveID, tagName, err) + } + }() +} + func (a *App) registerPreviewWorkers(ctx context.Context, driveID string, worker *preview.Worker, thumbWorker *preview.ThumbWorker, fingerprintWorker *fingerprint.Worker, cancel context.CancelFunc) { a.registerPreviewWorkersWithOptions(ctx, driveID, worker, thumbWorker, fingerprintWorker, cancel, true) } @@ -1185,6 +1291,13 @@ func (a *App) driveHasActiveWork(driveID string) bool { return true } + a.uploadProgressMu.Lock() + uploading := a.uploadProgress[driveID].State != "" + a.uploadProgressMu.Unlock() + if uploading { + return true + } + a.mu.Lock() previewWorker := a.workers[driveID] thumbWorker := a.thumbWorkers[driveID] @@ -1304,10 +1417,11 @@ func (a *App) stopDriveTasks(ctx context.Context, driveID string) bool { canceled := a.cancelDriveTaskContexts(driveID) queued := a.clearQueuedDriveTask(driveID) fingerprintQueued := a.clearFingerprintQueueing(driveID) + uploading := a.clearCrawlerUploadProgress(driveID) hadWorkers := a.resetDriveGenerationWorkers(ctx, driveID) - stopped := canceled > 0 || queued || fingerprintQueued || hadWorkers - log.Printf("[tasks] stop drive=%s stopped=%v canceled_tasks=%d queued=%v fingerprint_queue=%v workers=%v", - driveID, stopped, canceled, queued, fingerprintQueued, hadWorkers) + stopped := canceled > 0 || queued || fingerprintQueued || uploading || hadWorkers + log.Printf("[tasks] stop drive=%s stopped=%v canceled_tasks=%d queued=%v fingerprint_queue=%v uploading=%v workers=%v", + driveID, stopped, canceled, queued, fingerprintQueued, uploading, hadWorkers) return stopped } @@ -1325,6 +1439,9 @@ func (a *App) stopAllDriveTasks(ctx context.Context) int { for _, id := range a.clearAllFingerprintQueueing() { stoppedIDs[id] = struct{}{} } + for _, id := range a.clearAllCrawlerUploadProgress() { + stoppedIDs[id] = struct{}{} + } for _, id := range a.resetAllDriveGenerationWorkers(ctx) { stoppedIDs[id] = struct{}{} } @@ -1679,7 +1796,7 @@ func (a *App) cleanupMissingDriveVideos(ctx context.Context, driveID string, liv return removed, nil } -func (a *App) deleteVideo(ctx context.Context, videoID string) (api.DeleteVideoResult, error) { +func (a *App) deleteVideo(ctx context.Context, videoID string, deleteSource bool) (api.DeleteVideoResult, error) { if a == nil || a.cat == nil { return api.DeleteVideoResult{}, sql.ErrNoRows } @@ -1688,6 +1805,14 @@ func (a *App) deleteVideo(ctx context.Context, videoID string) (api.DeleteVideoR return api.DeleteVideoResult{}, err } + deletedSource := false + if deleteSource { + deletedSource, err = a.removeVideoSourceFile(ctx, v) + if err != nil { + return api.DeleteVideoResult{}, err + } + } + localDir := "" if a.cfg != nil { localDir = a.cfg.Storage.LocalPreviewDir @@ -1695,16 +1820,61 @@ func (a *App) deleteVideo(ctx context.Context, videoID string) (api.DeleteVideoR if err := removeLocalVideoAssets(localDir, v); err != nil { return api.DeleteVideoResult{}, fmt.Errorf("remove local assets for %s: %w", v.ID, err) } - deletedSource, err := a.removeSpider91SourceFile(ctx, v) - if err != nil { - return api.DeleteVideoResult{}, err - } if err := a.cat.DeleteVideoWithTombstone(ctx, v.ID); err != nil { return api.DeleteVideoResult{}, err } return api.DeleteVideoResult{OK: true, DeletedSource: deletedSource}, nil } +func (a *App) removeVideoSourceFile(ctx context.Context, v *catalog.Video) (bool, error) { + if v == nil { + return false, errors.New("remove video source: empty video") + } + if a == nil { + return false, fmt.Errorf("remove video source %s: app unavailable: %w", v.ID, drives.ErrNotSupported) + } + if strings.HasPrefix(v.ID, "spider91-") { + deleted, err := a.removeSpider91SourceFile(ctx, v) + if err != nil || deleted { + return deleted, err + } + if a.cat != nil { + if drive, driveErr := a.cat.GetDrive(ctx, v.DriveID); driveErr == nil && drive.Kind == spider91.Kind { + return false, nil + } + } else if strings.HasPrefix(v.ID, "spider91-"+v.DriveID+"-") { + return false, nil + } + } + fileID := strings.TrimSpace(v.FileID) + if fileID == "" { + return false, fmt.Errorf("remove video source %s: empty file id", v.ID) + } + if a == nil || a.registry == nil { + return false, fmt.Errorf("remove video source %s: drive registry unavailable: %w", v.ID, drives.ErrNotSupported) + } + if _, ok := a.registry.Get(v.DriveID); !ok { + if a.cat == nil { + return false, fmt.Errorf("remove video source %s: drive %s not attached: %w", v.ID, v.DriveID, drives.ErrNotSupported) + } + if err := a.ensureDriveAttached(ctx, v.DriveID); err != nil { + return false, fmt.Errorf("remove video source %s: attach drive %s: %w", v.ID, v.DriveID, err) + } + } + drv, ok := a.registry.Get(v.DriveID) + if !ok { + return false, fmt.Errorf("remove video source %s: drive %s not attached: %w", v.ID, v.DriveID, drives.ErrNotSupported) + } + remover, ok := drv.(drives.Remover) + if !ok { + return false, fmt.Errorf("remove video source %s: drive %s (%s) does not support source deletion: %w", v.ID, v.DriveID, drv.Kind(), drives.ErrNotSupported) + } + if err := remover.Remove(ctx, fileID); err != nil { + return false, fmt.Errorf("remove video source %s from drive %s: %w", v.ID, v.DriveID, err) + } + return true, nil +} + func (a *App) removeSpider91SourceFile(ctx context.Context, v *catalog.Video) (bool, error) { if a == nil || a.cfg == nil || v == nil || !strings.HasPrefix(v.ID, "spider91-") { return false, nil @@ -2642,8 +2812,8 @@ func (a *App) runScriptCrawlerCrawlWithTaskContext(ctx context.Context, driveID if runErr != nil { log.Printf("[scriptcrawler] drive=%s crawl failed: %v", driveID, runErr) } else if res != nil { - log.Printf("[scriptcrawler] drive=%s crawl done target=%d total=%d new=%d skipped=%d failed=%d seen_snapshot=%d", - driveID, res.TargetNew, res.TotalEntries, res.NewVideos, res.Skipped, res.Failed, res.SeenSnapshot) + log.Printf("[scriptcrawler] drive=%s crawl done target=%d candidate_budget=%d total=%d new=%d skipped=%d failed=%d seen_snapshot=%d", + driveID, res.TargetNew, res.CandidateBudget, res.TotalEntries, res.NewVideos, res.Skipped, res.Failed, res.SeenSnapshot) } if d.Credentials == nil { diff --git a/backend/cmd/server/main_test.go b/backend/cmd/server/main_test.go index 2d04add..571f534 100644 --- a/backend/cmd/server/main_test.go +++ b/backend/cmd/server/main_test.go @@ -1243,7 +1243,7 @@ func TestDeleteVideoRemovesGeneratedAssetsKeepsLocalOriginalAndTombstones(t *tes cfg: &config.Config{Storage: config.Storage{LocalPreviewDir: localDir}}, cat: cat, } - result, err := app.deleteVideo(ctx, "localstorage-local-main-file") + result, err := app.deleteVideo(ctx, "localstorage-local-main-file", false) if err != nil { t.Fatalf("delete video: %v", err) } @@ -1270,6 +1270,73 @@ func TestDeleteVideoRemovesGeneratedAssetsKeepsLocalOriginalAndTombstones(t *tes } } +func TestDeleteVideoRemovesSourceFileWhenRequested(t *testing.T) { + ctx := context.Background() + root := t.TempDir() + localDir := filepath.Join(root, "previews") + cat, err := catalog.Open(filepath.Join(t.TempDir(), "catalog.db")) + if err != nil { + t.Fatalf("open catalog: %v", err) + } + t.Cleanup(func() { _ = cat.Close() }) + + previewPath := filepath.Join(localDir, "video-with-source.mp4") + thumbPath := filepath.Join(localDir, "thumbs", "video-with-source.jpg") + for _, path := range []string{previewPath, thumbPath} { + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatalf("mkdir %s: %v", path, err) + } + if err := os.WriteFile(path, []byte("file"), 0o644); err != nil { + t.Fatalf("write %s: %v", path, err) + } + } + + now := time.Now() + if err := cat.UpsertVideo(ctx, &catalog.Video{ + ID: "video-with-source", + DriveID: "source-drive", + FileID: "source-file", + FileName: "clip.mp4", + Title: "Source File", + PreviewLocal: previewPath, + PreviewStatus: "ready", + ThumbnailURL: "/p/thumb/video-with-source", + Size: 123, + PublishedAt: now, + CreatedAt: now, + UpdatedAt: now, + }); err != nil { + t.Fatalf("seed video: %v", err) + } + + registry := proxy.NewRegistry() + drv := &serverRemovableFakeDrive{id: "source-drive"} + registry.Set(drv.ID(), drv) + app := &App{ + cfg: &config.Config{Storage: config.Storage{LocalPreviewDir: localDir}}, + cat: cat, + registry: registry, + } + result, err := app.deleteVideo(ctx, "video-with-source", true) + if err != nil { + t.Fatalf("delete video: %v", err) + } + if !result.OK || !result.DeletedSource { + t.Fatalf("delete result = %#v, want source deleted", result) + } + if got, want := drv.removedFileID, "source-file"; got != want { + t.Fatalf("removed source fileID = %q, want %q", got, want) + } + if _, err := cat.GetVideo(ctx, "video-with-source"); err != sql.ErrNoRows { + t.Fatalf("deleted video lookup error = %v, want sql.ErrNoRows", err) + } + for _, path := range []string{previewPath, thumbPath} { + if _, err := os.Stat(path); !os.IsNotExist(err) { + t.Fatalf("generated asset %s still exists, stat err=%v", path, err) + } + } +} + func TestDeleteVideoRemovesSpider91SourceFile(t *testing.T) { ctx := context.Background() root := t.TempDir() @@ -1326,7 +1393,7 @@ func TestDeleteVideoRemovesSpider91SourceFile(t *testing.T) { t.Fatalf("seed video: %v", err) } - result, err := app.deleteVideo(ctx, "spider91-spider-main-source") + result, err := app.deleteVideo(ctx, "spider91-spider-main-source", true) if err != nil { t.Fatalf("delete spider video: %v", err) } @@ -1740,6 +1807,22 @@ type serverFakeKindDrive struct { func (d *serverFakeKindDrive) Kind() string { return d.kind } func (d *serverFakeKindDrive) ID() string { return d.id } +type serverRemovableFakeDrive struct { + serverFakeDrive + id string + removedFileID string +} + +func (d *serverRemovableFakeDrive) Kind() string { return "fake-removable" } +func (d *serverRemovableFakeDrive) ID() string { return d.id } +func (d *serverRemovableFakeDrive) Remove(ctx context.Context, fileID string) error { + if err := ctx.Err(); err != nil { + return err + } + d.removedFileID = fileID + return nil +} + type serverFakeSpider91MigrationRunner struct { called int } diff --git a/backend/internal/api/admin.go b/backend/internal/api/admin.go index c53467d..851a251 100644 --- a/backend/internal/api/admin.go +++ b/backend/internal/api/admin.go @@ -57,7 +57,7 @@ type AdminServer struct { OnRegenFailedPreviews func(driveID string) OnRegenFailedThumbnails func(driveID string) OnRegenFailedFingerprints func(driveID string) - OnDeleteVideo func(ctx context.Context, videoID string) (DeleteVideoResult, error) + OnDeleteVideo func(ctx context.Context, videoID string, deleteSource bool) (DeleteVideoResult, error) GetDriveGenerationStatuses func() map[string]DriveGenerationStatuses // OnTeaserEnabledChanged 在 per-drive 预览视频开关被切换后调用。 // enabled=true 时上层应该重新把 pending 预览视频入队(类似旧的全局开关从关到开); @@ -103,6 +103,8 @@ type GenerationStatus struct { CooldownUntil string `json:"cooldownUntil,omitempty"` ScannedCount int `json:"scannedCount"` AddedCount int `json:"addedCount"` + DoneCount int `json:"doneCount"` + TotalCount int `json:"totalCount"` } type DriveGenerationStatuses struct { @@ -110,6 +112,7 @@ type DriveGenerationStatuses struct { Thumbnail GenerationStatus `json:"thumbnail"` Preview GenerationStatus `json:"preview"` Fingerprint GenerationStatus `json:"fingerprint"` + Upload GenerationStatus `json:"upload"` } type NightlyJobStatus struct { @@ -127,6 +130,10 @@ type DeleteVideoResult struct { DeletedSource bool `json:"deletedSource"` } +type deleteVideoReq struct { + DeleteSource bool `json:"deleteSource"` +} + func (a *AdminServer) Register(r chi.Router) { r.Route("/admin/api", func(r chi.Router) { // 登录、登出和首次部署初始化不需要鉴权 @@ -637,6 +644,7 @@ type crawlerDTO struct { ThumbnailGenerationStatus GenerationStatus `json:"thumbnailGenerationStatus"` PreviewGenerationStatus GenerationStatus `json:"previewGenerationStatus"` FingerprintGenerationStatus GenerationStatus `json:"fingerprintGenerationStatus"` + UploadGenerationStatus GenerationStatus `json:"uploadGenerationStatus"` ThumbnailReadyCount int `json:"thumbnailReadyCount"` ThumbnailPendingCount int `json:"thumbnailPendingCount"` ThumbnailFailedCount int `json:"thumbnailFailedCount"` @@ -698,6 +706,9 @@ func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, assets catalog.Crawle if generation.Fingerprint.State == "" { generation.Fingerprint.State = "idle" } + if generation.Upload.State == "" { + generation.Upload.State = "idle" + } lastCrawlAt := int64(0) if raw := strings.TrimSpace(d.Credentials["last_crawl_at"]); raw != "" { if v, err := strconv.ParseInt(raw, 10, 64); err == nil { @@ -719,6 +730,7 @@ func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, assets catalog.Crawle ThumbnailGenerationStatus: generation.Thumbnail, PreviewGenerationStatus: generation.Preview, FingerprintGenerationStatus: generation.Fingerprint, + UploadGenerationStatus: generation.Upload, ThumbnailReadyCount: assets.Thumbnail.Ready, ThumbnailPendingCount: assets.Thumbnail.Pending, ThumbnailFailedCount: assets.Thumbnail.Failed, @@ -1824,12 +1836,21 @@ func (a *AdminServer) handleDeleteVideo(w http.ResponseWriter, r *http.Request) writeErr(w, http.StatusBadRequest, errors.New("invalid video id")) return } + var body deleteVideoReq + if r.Body != nil { + defer r.Body.Close() + decoder := json.NewDecoder(r.Body) + if err := decoder.Decode(&body); err != nil && !errors.Is(err, io.EOF) { + writeErr(w, http.StatusBadRequest, err) + return + } + } var ( result DeleteVideoResult err error ) if a.OnDeleteVideo != nil { - result, err = a.OnDeleteVideo(r.Context(), id) + result, err = a.OnDeleteVideo(r.Context(), id, body.DeleteSource) } else { err = a.Catalog.DeleteVideoWithTombstone(r.Context(), id) result = DeleteVideoResult{OK: err == nil} diff --git a/backend/internal/api/admin_test.go b/backend/internal/api/admin_test.go index fc1307b..712f12c 100644 --- a/backend/internal/api/admin_test.go +++ b/backend/internal/api/admin_test.go @@ -121,6 +121,63 @@ func TestHandleSetupStoresCredentialsAndCreatesSession(t *testing.T) { } } +func TestHandleDeleteVideoDefaultsDeleteSourceFalse(t *testing.T) { + req := httptest.NewRequest(http.MethodDelete, "/admin/api/videos/video-1", nil) + rctx := chi.NewRouteContext() + rctx.URLParams.Add("id", "video-1") + req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx)) + rr := httptest.NewRecorder() + + called := false + (&AdminServer{ + OnDeleteVideo: func(ctx context.Context, videoID string, deleteSource bool) (DeleteVideoResult, error) { + called = true + if videoID != "video-1" { + t.Fatalf("videoID = %q, want video-1", videoID) + } + if deleteSource { + t.Fatal("deleteSource defaulted to true") + } + return DeleteVideoResult{OK: true}, nil + }, + }).handleDeleteVideo(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, want 200; body = %s", rr.Code, rr.Body.String()) + } + if !called { + t.Fatal("OnDeleteVideo was not called") + } +} + +func TestHandleDeleteVideoPassesDeleteSourceOption(t *testing.T) { + req := httptest.NewRequest(http.MethodDelete, "/admin/api/videos/video-1", strings.NewReader(`{"deleteSource":true}`)) + rctx := chi.NewRouteContext() + rctx.URLParams.Add("id", "video-1") + req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx)) + rr := httptest.NewRecorder() + + (&AdminServer{ + OnDeleteVideo: func(ctx context.Context, videoID string, deleteSource bool) (DeleteVideoResult, error) { + if !deleteSource { + t.Fatal("deleteSource = false, want true") + } + return DeleteVideoResult{OK: true, DeletedSource: true}, nil + }, + }).handleDeleteVideo(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, want 200; body = %s", rr.Code, rr.Body.String()) + } + var got DeleteVideoResult + if err := json.Unmarshal(rr.Body.Bytes(), &got); err != nil { + t.Fatalf("decode response: %v", err) + } + if !got.DeletedSource { + t.Fatalf("DeletedSource = false, want true; response = %s", rr.Body.String()) + } +} + func TestHandleCheckUpdateReportsNewRelease(t *testing.T) { dir := t.TempDir() versionFile := filepath.Join(dir, ".version") diff --git a/backend/internal/catalog/catalog.go b/backend/internal/catalog/catalog.go index b993b00..f0f1786 100644 --- a/backend/internal/catalog/catalog.go +++ b/backend/internal/catalog/catalog.go @@ -88,6 +88,11 @@ type Video struct { func (c *Catalog) UpsertVideo(ctx context.Context, v *Video) error { existed := c.videoExists(ctx, v.ID) v.ContentHash = normalizeContentHash(v.ContentHash) + v.SampledSHA256 = normalizeContentHash(v.SampledSHA256) + fingerprintStatus := nullableStatus(v.FingerprintStatus) + if v.SampledSHA256 != "" && (v.FingerprintStatus == "" || v.FingerprintStatus == "pending") { + fingerprintStatus = "ready" + } tagsJSON, _ := json.Marshal(v.Tags) badgesJSON, _ := json.Marshal(v.Badges) now := time.Now().UnixMilli() @@ -98,13 +103,13 @@ func (c *Catalog) UpsertVideo(ctx context.Context, v *Video) error { _, err := c.db.ExecContext(ctx, ` INSERT INTO videos ( - id, drive_id, file_id, file_name, content_hash, parent_id, title, author, tags, + id, drive_id, file_id, file_name, content_hash, sampled_sha256, fingerprint_status, fingerprint_error, parent_id, title, author, tags, duration_seconds, size_bytes, ext, quality, thumbnail_url, thumbnail_status, preview_file_id, preview_local, preview_status, views, favorites, comments, likes, dislikes, category, hidden, badges, description, published_at, created_at, updated_at ) VALUES ( - ?, ?, ?, ?, ?, ?, ?, ?, ?, + ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CASE WHEN COALESCE(?, '') != '' THEN 'ready' ELSE 'pending' END, ?, ?, ?, ?, ?, ?, ?, ?, @@ -123,15 +128,18 @@ ON CONFLICT(id) DO UPDATE SET ELSE videos.content_hash END, sampled_sha256 = CASE - WHEN videos.size_bytes != excluded.size_bytes THEN '' + WHEN videos.size_bytes != excluded.size_bytes THEN excluded.sampled_sha256 + WHEN excluded.sampled_sha256 != '' THEN excluded.sampled_sha256 ELSE videos.sampled_sha256 END, fingerprint_status = CASE - WHEN videos.size_bytes != excluded.size_bytes THEN 'pending' + WHEN videos.size_bytes != excluded.size_bytes THEN COALESCE(excluded.fingerprint_status, 'pending') + WHEN excluded.sampled_sha256 != '' THEN COALESCE(excluded.fingerprint_status, 'ready') ELSE COALESCE(videos.fingerprint_status, 'pending') END, fingerprint_error = CASE - WHEN videos.size_bytes != excluded.size_bytes THEN '' + WHEN videos.size_bytes != excluded.size_bytes THEN COALESCE(excluded.fingerprint_error, '') + WHEN excluded.sampled_sha256 != '' THEN COALESCE(excluded.fingerprint_error, '') ELSE COALESCE(videos.fingerprint_error, '') END, duration_seconds= excluded.duration_seconds, @@ -152,7 +160,7 @@ ON CONFLICT(id) DO UPDATE SET description = excluded.description, updated_at = excluded.updated_at `, - v.ID, v.DriveID, v.FileID, v.FileName, v.ContentHash, v.ParentID, v.Title, v.Author, string(tagsJSON), + v.ID, v.DriveID, v.FileID, v.FileName, v.ContentHash, v.SampledSHA256, fingerprintStatus, v.FingerprintError, v.ParentID, v.Title, v.Author, string(tagsJSON), v.DurationSeconds, v.Size, v.Ext, v.Quality, v.ThumbnailURL, v.ThumbnailURL, v.PreviewFileID, v.PreviewLocal, nullableStatus(v.PreviewStatus), v.Views, v.Favorites, v.Comments, v.Likes, v.Dislikes, @@ -731,8 +739,11 @@ func (c *Catalog) ListCrawlerSourceIDs(ctx context.Context, kind, driveID string rows, err := c.db.QueryContext(ctx, `SELECT SUBSTR(id, ?) FROM videos WHERE id LIKE ? || '%' UNION - SELECT SUBSTR(id, ?) FROM deleted_videos WHERE id LIKE ? || '%'`, - len(prefix)+1, prefix, len(prefix)+1, prefix) + SELECT SUBSTR(id, ?) FROM deleted_videos WHERE id LIKE ? || '%' + UNION + SELECT source_id FROM crawler_seen_sources + WHERE kind = ? AND drive_id = ? AND status IN ('imported', 'duplicate')`, + len(prefix)+1, prefix, len(prefix)+1, prefix, kind, driveID) if err != nil { return nil, err } @@ -750,6 +761,47 @@ func (c *Catalog) ListCrawlerSourceIDs(ctx context.Context, kind, driveID string return out, rows.Err() } +// MarkCrawlerSourceSeen records the outcome for a crawler source item. Duplicate +// source IDs are included in future seen files so scripts can skip them before +// the backend downloads the same duplicate content again. +func (c *Catalog) MarkCrawlerSourceSeen(ctx context.Context, kind, driveID, sourceID, status, canonicalVideoID, sampledSHA256 string, size int64) error { + kind = strings.TrimSpace(kind) + driveID = strings.TrimSpace(driveID) + sourceID = strings.TrimSpace(sourceID) + status = strings.TrimSpace(status) + if kind == "" || driveID == "" || sourceID == "" { + return nil + } + switch status { + case "imported", "duplicate": + default: + return fmt.Errorf("catalog: unsupported crawler source status %q", status) + } + sampledSHA256 = normalizeContentHash(sampledSHA256) + if size < 0 { + size = 0 + } + now := time.Now().UnixMilli() + _, err := c.db.ExecContext(ctx, ` +INSERT INTO crawler_seen_sources ( + kind, drive_id, source_id, status, canonical_video_id, sampled_sha256, size_bytes, first_seen_at, last_seen_at +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) +ON CONFLICT(kind, drive_id, source_id) DO UPDATE SET + status = excluded.status, + canonical_video_id = excluded.canonical_video_id, + sampled_sha256 = CASE + WHEN excluded.sampled_sha256 != '' THEN excluded.sampled_sha256 + ELSE crawler_seen_sources.sampled_sha256 + END, + size_bytes = CASE + WHEN excluded.size_bytes > 0 THEN excluded.size_bytes + ELSE crawler_seen_sources.size_bytes + END, + last_seen_at = excluded.last_seen_at`, + kind, driveID, sourceID, status, strings.TrimSpace(canonicalVideoID), sampledSHA256, size, now, now) + return err +} + // DeleteVideoWithTombstone records that an administrator explicitly deleted a // video, then removes the visible catalog row. The tombstone is used by // scanners/crawlers to avoid importing the same source file again. @@ -922,6 +974,29 @@ func (c *Catalog) FindVideoByFileSignature(ctx context.Context, fileName string, return scanVideo(row) } +// FindEquivalentVideo returns the earliest visible video that represents the +// same content as source by strong hash or sampled fingerprint, regardless of +// which drive currently owns it. +func (c *Catalog) FindEquivalentVideo(ctx context.Context, source *Video) (*Video, error) { + if source == nil { + return nil, sql.ErrNoRows + } + where, args, ok := equivalentVideoLookupWhere(source) + if !ok { + return nil, sql.ErrNoRows + } + args = append([]any{source.ID}, args...) + row := c.db.QueryRowContext(ctx, + `SELECT `+allVideoCols+` FROM videos + WHERE id != ? + AND COALESCE(hidden, 0) = 0 + AND COALESCE(file_id, '') != '' + AND (`+where+`) + ORDER BY created_at ASC, id ASC + LIMIT 1`, args...) + return scanVideo(row) +} + // FindEquivalentVideoOnDrive returns a visible video on driveID that represents // the same content as source by strong hash or sampled fingerprint. func (c *Catalog) FindEquivalentVideoOnDrive(ctx context.Context, source *Video, driveID string) (*Video, error) { diff --git a/backend/internal/catalog/schema.sql b/backend/internal/catalog/schema.sql index cd306f7..d85c35f 100644 --- a/backend/internal/catalog/schema.sql +++ b/backend/internal/catalog/schema.sql @@ -89,6 +89,24 @@ CREATE INDEX IF NOT EXISTS idx_deleted_videos_drive_hash CREATE INDEX IF NOT EXISTS idx_deleted_videos_drive_signature ON deleted_videos(drive_id, file_name, size_bytes); +-- 爬虫来源记录。用于把已确认重复的 source_id 写回 seen 列表, +-- 避免后续爬虫反复下载同一个候选视频。 +CREATE TABLE IF NOT EXISTS crawler_seen_sources ( + kind TEXT NOT NULL, + drive_id TEXT NOT NULL, + source_id TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'imported', -- imported / duplicate + canonical_video_id TEXT NOT NULL DEFAULT '', + sampled_sha256 TEXT NOT NULL DEFAULT '', + size_bytes INTEGER NOT NULL DEFAULT 0, + first_seen_at INTEGER NOT NULL, + last_seen_at INTEGER NOT NULL, + PRIMARY KEY (kind, drive_id, source_id) +); + +CREATE INDEX IF NOT EXISTS idx_crawler_seen_sources_drive + ON crawler_seen_sources(kind, drive_id, status); + -- 网盘账户 CREATE TABLE IF NOT EXISTS drives ( id TEXT PRIMARY KEY, diff --git a/backend/internal/drives/googledrive/driver.go b/backend/internal/drives/googledrive/driver.go index 456cd88..1454dfc 100644 --- a/backend/internal/drives/googledrive/driver.go +++ b/backend/internal/drives/googledrive/driver.go @@ -593,6 +593,17 @@ func (d *Driver) Rename(ctx context.Context, fileID, newName string) error { return nil } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + fileID = strings.TrimSpace(fileID) + if fileID == "" { + return errors.New("googledrive remove: empty file id") + } + if err := d.request(ctx, d.fileURL(fileID), http.MethodDelete, nil, nil); err != nil { + return fmt.Errorf("googledrive remove: %w", err) + } + return nil +} + func (d *Driver) findUploadedFileID(ctx context.Context, parentID, name, md5Hex string) (string, error) { entries, err := d.List(ctx, parentID) if err != nil { @@ -624,6 +635,8 @@ func (d *Driver) findUploadedFileID(ctx context.Context, parentID, name, md5Hex return "", fmt.Errorf("googledrive upload: uploaded file %q not found in parent %q", name, parentID) } +var _ drives.Remover = (*Driver)(nil) + func isGoogleUploadHTTPRateLimit(status int, header http.Header, body []byte, apiErr apiErrorBody) bool { if status == http.StatusTooManyRequests { return true diff --git a/backend/internal/drives/iface.go b/backend/internal/drives/iface.go index 55ae4ca..ff1f910 100644 --- a/backend/internal/drives/iface.go +++ b/backend/internal/drives/iface.go @@ -40,6 +40,12 @@ type Drive interface { RootID() string } +// Remover is an optional drive capability. It mirrors OpenList's optional +// Remove interface: callers must type-assert before deleting a source file. +type Remover interface { + Remove(ctx context.Context, fileID string) error +} + type Entry struct { ID string Name string diff --git a/backend/internal/drives/localstorage/driver.go b/backend/internal/drives/localstorage/driver.go index a37c1a9..a008ee4 100644 --- a/backend/internal/drives/localstorage/driver.go +++ b/backend/internal/drives/localstorage/driver.go @@ -257,6 +257,39 @@ func (d *Driver) EnsureDir(context.Context, string) (string, error) { return "", drives.ErrNotSupported } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + if err := ctx.Err(); err != nil { + return err + } + p, rel, err := d.pathForID(fileID) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + if rel == "" { + return errors.New("localstorage: refusing to remove root") + } + info, err := os.Stat(p) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + if info.IsDir() { + return errors.New("localstorage: refusing to remove directory") + } + if !info.Mode().IsRegular() { + return errors.New("localstorage: refusing to remove non-regular file") + } + if err := os.Remove(p); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + func (d *Driver) root() (string, error) { raw := strings.TrimSpace(d.rootPath) if raw == "" { @@ -276,6 +309,8 @@ func (d *Driver) root() (string, error) { return filepath.Abs(raw) } +var _ drives.Remover = (*Driver)(nil) + func (d *Driver) pathForID(id string) (string, string, error) { root, err := d.root() if err != nil { diff --git a/backend/internal/drives/localupload/driver.go b/backend/internal/drives/localupload/driver.go index bdbcb6d..3a545aa 100644 --- a/backend/internal/drives/localupload/driver.go +++ b/backend/internal/drives/localupload/driver.go @@ -78,12 +78,38 @@ func (d *Driver) EnsureDir(context.Context, string) (string, error) { return "", drives.ErrNotSupported } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + if err := ctx.Err(); err != nil { + return err + } + path, err := d.uploadPath(fileID) + if err != nil { + return err + } + info, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + if info.IsDir() { + return errors.New("localupload: refusing to remove directory") + } + if err := os.Remove(path); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + func (d *Driver) RootID() string { return d.uploadDir() } func (d *Driver) uploadDir() string { return d.uploadDirPath } +var _ drives.Remover = (*Driver)(nil) + func (d *Driver) uploadPath(fileID string) (string, error) { if strings.TrimSpace(fileID) == "" || filepath.Base(fileID) != fileID { return "", errors.New("invalid upload file id") diff --git a/backend/internal/drives/onedrive/driver.go b/backend/internal/drives/onedrive/driver.go index e01d3db..a722368 100644 --- a/backend/internal/drives/onedrive/driver.go +++ b/backend/internal/drives/onedrive/driver.go @@ -501,6 +501,17 @@ func (d *Driver) Rename(ctx context.Context, fileID, newName string) error { return nil } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + fileID = strings.TrimSpace(fileID) + if fileID == "" { + return errors.New("onedrive remove: empty file id") + } + if err := d.request(ctx, d.itemURL(fileID), http.MethodDelete, nil, nil); err != nil { + return fmt.Errorf("onedrive remove: %w", err) + } + return nil +} + func (d *Driver) request(ctx context.Context, rawURL, method string, configure func(*resty.Request), out any) error { return d.requestOnce(ctx, rawURL, method, configure, out, true) } @@ -741,3 +752,4 @@ func guessMime(name string) string { } var _ drives.Drive = (*Driver)(nil) +var _ drives.Remover = (*Driver)(nil) diff --git a/backend/internal/drives/p115/driver.go b/backend/internal/drives/p115/driver.go index 960f807..eb67129 100644 --- a/backend/internal/drives/p115/driver.go +++ b/backend/internal/drives/p115/driver.go @@ -461,6 +461,23 @@ func (d *Driver) Rename(ctx context.Context, fileID, newName string) error { return nil } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + if d.client == nil { + return errors.New("p115 remove: driver not initialized") + } + if err := ctx.Err(); err != nil { + return err + } + fileID = strings.TrimSpace(fileID) + if fileID == "" { + return errors.New("p115 remove: empty fileID") + } + if err := d.client.Delete(fileID); err != nil { + return fmt.Errorf("p115 remove: %w", err) + } + return nil +} + // bufferAndHashSha1 把 r 全量复制到一个临时文件,同时计算 SHA1。 // 返回临时文件(位置在末尾,需调用方 Seek 回 0)、SHA1 hex 大写、实际字节数。 // @@ -563,3 +580,4 @@ func guessMime(name string) string { } var _ drives.Drive = (*Driver)(nil) +var _ drives.Remover = (*Driver)(nil) diff --git a/backend/internal/drives/p123/driver.go b/backend/internal/drives/p123/driver.go index 683b629..db62d3d 100644 --- a/backend/internal/drives/p123/driver.go +++ b/backend/internal/drives/p123/driver.go @@ -42,6 +42,7 @@ const ( endpointDownloadInfo = "/file/download_info" endpointMkdir = "/file/upload_request" endpointRename = "/file/rename" + endpointTrash = "/file/trash" endpointUpload = "/file/upload_request" endpointS3Auth = "/file/s3_upload_object/auth" endpointS3Parts = "/file/s3_repare_upload_parts_batch" @@ -545,6 +546,32 @@ func (d *Driver) Rename(ctx context.Context, fileID, newName string) error { return nil } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + fileID = strings.TrimSpace(fileID) + if fileID == "" { + return errors.New("123pan remove: empty file id") + } + f, _, err := d.findFile(ctx, fileID) + if err != nil { + if strings.Contains(strings.ToLower(err.Error()), "not found") { + return nil + } + return fmt.Errorf("123pan remove metadata: %w", err) + } + body := map[string]any{ + "driveId": 0, + "operation": true, + "fileTrashInfoList": []panFile{f}, + } + if _, err := d.request(ctx, endpointTrash, http.MethodPost, func(req *resty.Request) { + req.SetBody(body) + }, nil); err != nil { + return fmt.Errorf("123pan remove: %w", err) + } + d.removeCachedFile(fileID) + return nil +} + func (d *Driver) EnsureDir(ctx context.Context, pathFromRoot string) (string, error) { parts := splitPath(pathFromRoot) currentID := d.rootID @@ -942,6 +969,12 @@ func (d *Driver) renameCachedFile(fileID, newName string) { } } +func (d *Driver) removeCachedFile(fileID string) { + d.fileMu.Lock() + delete(d.files, fileID) + d.fileMu.Unlock() +} + func (d *Driver) cachedFile(fileID string) (panFile, string, bool) { d.fileMu.RLock() defer d.fileMu.RUnlock() @@ -1111,3 +1144,4 @@ func guessMime(name string) string { } var _ drives.Drive = (*Driver)(nil) +var _ drives.Remover = (*Driver)(nil) diff --git a/backend/internal/drives/pikpak/driver.go b/backend/internal/drives/pikpak/driver.go index c7067b8..5486461 100644 --- a/backend/internal/drives/pikpak/driver.go +++ b/backend/internal/drives/pikpak/driver.go @@ -356,6 +356,19 @@ func (d *Driver) Rename(ctx context.Context, fileID, newName string) error { return nil } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + fileID = strings.TrimSpace(fileID) + if fileID == "" { + return errors.New("pikpak remove: empty file id") + } + if err := d.request(ctx, filesURL+":batchTrash", http.MethodPost, func(req *resty.Request) { + req.SetBody(map[string]any{"ids": []string{fileID}}) + }, nil); err != nil { + return fmt.Errorf("pikpak remove: %w", err) + } + return nil +} + func (d *Driver) EnsureDir(ctx context.Context, pathFromRoot string) (string, error) { currentID := d.rootID for _, name := range splitPath(pathFromRoot) { @@ -565,3 +578,4 @@ func ParseBoolDefault(raw string, def bool) bool { } var _ drives.Drive = (*Driver)(nil) +var _ drives.Remover = (*Driver)(nil) diff --git a/backend/internal/drives/quark/driver.go b/backend/internal/drives/quark/driver.go index 74ce22a..e9b6fca 100644 --- a/backend/internal/drives/quark/driver.go +++ b/backend/internal/drives/quark/driver.go @@ -16,23 +16,23 @@ import ( ) const ( - defaultUA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) quark-cloud-drive/2.5.20 Chrome/100.0.4896.160 Electron/18.3.5.4-b478491100 Safari/537.36 Channel/pckk_other_ch" + defaultUA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) quark-cloud-drive/2.5.20 Chrome/100.0.4896.160 Electron/18.3.5.4-b478491100 Safari/537.36 Channel/pckk_other_ch" defaultReferer = "https://pan.quark.cn" defaultAPI = "https://drive.quark.cn/1/clouddrive" defaultPR = "ucpro" ) type Driver struct { - id string - cookie string - rootID string - ua string - referer string - apiBase string - pr string - client *resty.Client - onCookieUpdate func(string) - useTranscodingAddress bool + id string + cookie string + rootID string + ua string + referer string + apiBase string + pr string + client *resty.Client + onCookieUpdate func(string) + useTranscodingAddress bool } type Config struct { @@ -60,7 +60,7 @@ func New(c Config) *Driver { onCookieUpdate: c.OnCookieUpdate, } d.client = resty.New(). - SetTimeout(30 * time.Second). + SetTimeout(30*time.Second). SetHeader("Accept", "application/json, text/plain, */*"). SetHeader("Referer", d.referer). SetHeader("User-Agent", d.ua) @@ -269,6 +269,22 @@ func (d *Driver) Upload(ctx context.Context, parentID, name string, r io.Reader, return "", drives.ErrNotSupported } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + fileID = strings.TrimSpace(fileID) + if fileID == "" { + return errors.New("quark remove: empty file id") + } + body := map[string]any{ + "action_type": 1, + "exclude_fids": []string{}, + "filelist": []string{fileID}, + } + if err := d.request(ctx, "/file/delete", http.MethodPost, nil, body, nil); err != nil { + return fmt.Errorf("quark remove: %w", err) + } + return nil +} + // ---------- helpers ---------- func fileToEntry(f *file, parentID string) drives.Entry { @@ -343,3 +359,4 @@ func setCookieValue(cookie, key, value string) string { } var _ drives.Drive = (*Driver)(nil) +var _ drives.Remover = (*Driver)(nil) diff --git a/backend/internal/drives/scriptcrawler/crawler.go b/backend/internal/drives/scriptcrawler/crawler.go index ba5efa4..f7a27d6 100644 --- a/backend/internal/drives/scriptcrawler/crawler.go +++ b/backend/internal/drives/scriptcrawler/crawler.go @@ -4,6 +4,7 @@ import ( "bufio" "context" "crypto/sha256" + "database/sql" "encoding/hex" "encoding/json" "errors" @@ -23,18 +24,23 @@ import ( "time" "github.com/video-site/backend/internal/catalog" + "github.com/video-site/backend/internal/fingerprint" "github.com/video-site/backend/internal/mediaasset" "golang.org/x/net/proxy" ) const ( - DefaultTargetNew = 10 - defaultUserAgent = "Mozilla/5.0 (compatible; video-site-91-scriptcrawler/1.0)" + DefaultTargetNew = 10 + defaultUserAgent = "Mozilla/5.0 (compatible; video-site-91-scriptcrawler/1.0)" + defaultCandidateMultiplier = 10 + defaultCandidateFloorExtra = 50 + defaultCandidateBudgetMax = 500 ) type CrawlerConfig struct { Driver *Driver Catalog *catalog.Catalog + CrawlerName string SourceKind string PythonPath string ScriptPath string @@ -75,16 +81,17 @@ func NewCrawler(cfg CrawlerConfig) *Crawler { } type CrawlResult struct { - TargetNew int - TotalEntries int - NewVideos int - Skipped int - Failed int - SeenSnapshot int - StartedAt time.Time - FinishedAt time.Time - JobFile string - SeenFile string + TargetNew int + CandidateBudget int + TotalEntries int + NewVideos int + Skipped int + Failed int + SeenSnapshot int + StartedAt time.Time + FinishedAt time.Time + JobFile string + SeenFile string } type CrawlProgress struct { @@ -105,6 +112,8 @@ type Job struct { RunID string `json:"run_id"` CrawlerID string `json:"crawler_id"` TargetNew int `json:"target_new"` + UniqueTarget int `json:"unique_target,omitempty"` + CandidateBudget int `json:"candidate_budget,omitempty"` SeenSourceIDsFile string `json:"seen_source_ids_file"` OutputDir string `json:"output_dir"` Config json.RawMessage `json:"config"` @@ -253,11 +262,12 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err if targetNew <= 0 { targetNew = DefaultTargetNew } + candidateBudget := candidateBudgetForTarget(targetNew) if err := c.cfg.Driver.Init(ctx); err != nil { return nil, fmt.Errorf("scriptcrawler: driver init: %w", err) } - result := &CrawlResult{TargetNew: targetNew, StartedAt: time.Now()} + result := &CrawlResult{TargetNew: targetNew, CandidateBudget: candidateBudget, StartedAt: time.Now()} defer func() { result.FinishedAt = time.Now() }() emit := func(p CrawlProgress) { if c.cfg.OnProgress == nil { @@ -293,11 +303,11 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err result.SeenSnapshot = seenCount emit(CrawlProgress{}) - if err := c.writeJobFile(jobPath, runID, targetNew, seenPath); err != nil { + if err := c.writeJobFile(jobPath, runID, targetNew, candidateBudget, seenPath); err != nil { return result, fmt.Errorf("scriptcrawler: write job: %w", err) } - cmd, stdout, err := c.startScript(ctx, jobPath, targetNew) + cmd, stdout, err := c.startScript(ctx, jobPath, targetNew, candidateBudget) if err != nil { return result, fmt.Errorf("scriptcrawler: start: %w", err) } @@ -408,7 +418,7 @@ func (c *Crawler) writeSeenSourceIDs(ctx context.Context, path string) (int, err return len(seen), nil } -func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath string) error { +func (c *Crawler) writeJobFile(path, runID string, targetNew, candidateBudget int, seenPath string) error { cfg := json.RawMessage([]byte("{}")) if raw := strings.TrimSpace(c.cfg.ConfigJSON); raw != "" { if !json.Valid([]byte(raw)) { @@ -425,7 +435,9 @@ func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath strin Mode: "crawl", RunID: runID, CrawlerID: c.cfg.Driver.ID(), - TargetNew: targetNew, + TargetNew: candidateBudget, + UniqueTarget: targetNew, + CandidateBudget: candidateBudget, SeenSourceIDsFile: seenPath, OutputDir: outputDir, Config: cfg, @@ -442,7 +454,7 @@ func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath strin return os.Rename(tmp, path) } -func (c *Crawler) startScript(ctx context.Context, jobPath string, targetNew int) (*exec.Cmd, io.ReadCloser, error) { +func (c *Crawler) startScript(ctx context.Context, jobPath string, targetNew, candidateBudget int) (*exec.Cmd, io.ReadCloser, error) { cmd := exec.CommandContext(ctx, c.cfg.PythonPath, c.cfg.ScriptPath, "--job", jobPath) if strings.TrimSpace(c.cfg.WorkDir) != "" { cmd.Dir = c.cfg.WorkDir @@ -466,7 +478,7 @@ func (c *Crawler) startScript(ctx context.Context, jobPath string, targetNew int _ = stdout.Close() return nil, nil, err } - log.Printf("[scriptcrawler] drive=%s exec %s --job=%s target_new=%d", c.cfg.Driver.ID(), c.cfg.ScriptPath, jobPath, targetNew) + log.Printf("[scriptcrawler] drive=%s exec %s --job=%s unique_target=%d candidate_budget=%d", c.cfg.Driver.ID(), c.cfg.ScriptPath, jobPath, targetNew, candidateBudget) if err := cmd.Start(); err != nil { _ = stdout.Close() _ = stderr.Close() @@ -493,7 +505,8 @@ func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) { if err != nil { return false, err } - videoID := BuildVideoIDForKind(c.sourceKind(), c.cfg.Driver.ID(), sourceID) + sourceKind := c.sourceKind() + videoID := BuildVideoIDForKind(sourceKind, c.cfg.Driver.ID(), sourceID) if deleted, err := c.cfg.Catalog.IsVideoDeleted(ctx, videoID); err != nil { return false, err } else if deleted { @@ -513,25 +526,6 @@ func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) { return false, fmt.Errorf("video: %w", err) } - thumbReady := false - if item.Thumbnail.URL != "" || item.Thumbnail.LocalFile != "" { - thumbFile := sourceID + detectThumbExt(item.Thumbnail.URL, item.Thumbnail.LocalFile) - thumbPath, err := c.cfg.Driver.ThumbPath(thumbFile) - if err == nil { - if _, err := c.materializeMedia(ctx, item.Thumbnail, thumbPath, item.DetailURL, false); err != nil { - log.Printf("[scriptcrawler] drive=%s source_id=%s thumbnail failed: %v", c.cfg.Driver.ID(), sourceID, err) - } else if c.cfg.CommonThumbDir != "" { - if err := os.MkdirAll(c.cfg.CommonThumbDir, 0o755); err != nil { - log.Printf("[scriptcrawler] drive=%s common thumbs mkdir: %v", c.cfg.Driver.ID(), err) - } else if err := copyFileAtomic(thumbPath, mediaasset.ThumbnailPathInDir(c.cfg.CommonThumbDir, videoID)); err != nil { - log.Printf("[scriptcrawler] drive=%s source_id=%s copy thumbnail: %v", c.cfg.Driver.ID(), sourceID, err) - } else { - thumbReady = true - } - } - } - } - now := time.Now() title := strings.TrimSpace(item.Title) if title == "" { @@ -545,6 +539,9 @@ func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) { if matched, err := c.cfg.Catalog.MatchTags(ctx, title+" "+author+" "+strings.Join(tags, " ")); err == nil { tags = mergeStringLists(tags, matched) } + if crawlerTag := c.crawlerTagName(); crawlerTag != "" { + tags = mergeStringLists(tags, []string{crawlerTag}) + } publishedAt := now if parsed := parsePublishedAt(item.PublishedAt); !parsed.IsZero() { publishedAt = parsed @@ -572,6 +569,43 @@ func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) { CreatedAt: now, UpdatedAt: now, } + sampled, err := fingerprint.Compute(ctx, c.cfg.Driver, v, fingerprint.Config{}, c.cfg.HTTPClient) + if err != nil { + _ = os.Remove(videoPath) + return false, fmt.Errorf("fingerprint: %w", err) + } + v.SampledSHA256 = sampled + v.FingerprintStatus = "ready" + if duplicate, err := c.cfg.Catalog.FindEquivalentVideo(ctx, v); err == nil && duplicate != nil { + _ = os.Remove(videoPath) + if markErr := c.cfg.Catalog.MarkCrawlerSourceSeen(ctx, sourceKind, c.cfg.Driver.ID(), sourceID, "duplicate", duplicate.ID, sampled, size); markErr != nil { + log.Printf("[scriptcrawler] drive=%s source_id=%s mark duplicate seen: %v", c.cfg.Driver.ID(), sourceID, markErr) + } + log.Printf("[scriptcrawler] drive=%s source_id=%s duplicate_of=%s title=%q size=%d", c.cfg.Driver.ID(), sourceID, duplicate.ID, title, size) + return false, nil + } else if err != nil && !errors.Is(err, sql.ErrNoRows) { + _ = os.Remove(videoPath) + return false, fmt.Errorf("duplicate lookup: %w", err) + } + + thumbReady := false + if item.Thumbnail.URL != "" || item.Thumbnail.LocalFile != "" { + thumbFile := sourceID + detectThumbExt(item.Thumbnail.URL, item.Thumbnail.LocalFile) + thumbPath, err := c.cfg.Driver.ThumbPath(thumbFile) + if err == nil { + if _, err := c.materializeMedia(ctx, item.Thumbnail, thumbPath, item.DetailURL, false); err != nil { + log.Printf("[scriptcrawler] drive=%s source_id=%s thumbnail failed: %v", c.cfg.Driver.ID(), sourceID, err) + } else if c.cfg.CommonThumbDir != "" { + if err := os.MkdirAll(c.cfg.CommonThumbDir, 0o755); err != nil { + log.Printf("[scriptcrawler] drive=%s common thumbs mkdir: %v", c.cfg.Driver.ID(), err) + } else if err := copyFileAtomic(thumbPath, mediaasset.ThumbnailPathInDir(c.cfg.CommonThumbDir, videoID)); err != nil { + log.Printf("[scriptcrawler] drive=%s source_id=%s copy thumbnail: %v", c.cfg.Driver.ID(), sourceID, err) + } else { + thumbReady = true + } + } + } + } if thumbReady { v.ThumbnailURL = "/p/thumb/" + v.ID } @@ -579,6 +613,9 @@ func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) { _ = os.Remove(videoPath) return false, err } + if err := c.cfg.Catalog.MarkCrawlerSourceSeen(ctx, sourceKind, c.cfg.Driver.ID(), sourceID, "imported", v.ID, sampled, size); err != nil { + log.Printf("[scriptcrawler] drive=%s source_id=%s mark imported seen: %v", c.cfg.Driver.ID(), sourceID, err) + } log.Printf("[scriptcrawler] drive=%s source_id=%s ok title=%q size=%d", c.cfg.Driver.ID(), sourceID, title, size) return true, nil } @@ -898,6 +935,36 @@ func (c *Crawler) sourceKind() string { return Kind } +func (c *Crawler) crawlerTagName() string { + if c == nil { + return "" + } + if v := strings.TrimSpace(c.cfg.CrawlerName); v != "" { + return v + } + if c.cfg.Driver != nil { + return strings.TrimSpace(c.cfg.Driver.ID()) + } + return "" +} + +func candidateBudgetForTarget(targetNew int) int { + if targetNew <= 0 { + targetNew = DefaultTargetNew + } + budget := targetNew * defaultCandidateMultiplier + if floor := targetNew + defaultCandidateFloorExtra; budget < floor { + budget = floor + } + if budget > defaultCandidateBudgetMax { + budget = defaultCandidateBudgetMax + } + if budget < targetNew { + return targetNew + } + return budget +} + func BuildVideoID(driveID, sourceID string) string { return BuildVideoIDForKind(Kind, driveID, sourceID) } diff --git a/backend/internal/drives/scriptcrawler/crawler_test.go b/backend/internal/drives/scriptcrawler/crawler_test.go index 440e286..23572c3 100644 --- a/backend/internal/drives/scriptcrawler/crawler_test.go +++ b/backend/internal/drives/scriptcrawler/crawler_test.go @@ -10,8 +10,15 @@ import ( "path/filepath" "strings" "testing" + "time" "github.com/video-site/backend/internal/catalog" + "github.com/video-site/backend/internal/fingerprint" +) + +const ( + scriptCrawlerDuplicateBytes = "duplicate-video-bytes" + scriptCrawlerUniqueBytes = "unique-video-bytes" ) func TestCrawlerRunOnceImportsLocalFileAndSkipsExisting(t *testing.T) { @@ -42,10 +49,11 @@ func TestCrawlerRunOnceImportsLocalFileAndSkipsExisting(t *testing.T) { t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1") c := NewCrawler(CrawlerConfig{ - Driver: drv, - Catalog: cat, - PythonPath: wrapper, - ScriptPath: dummyScript, + Driver: drv, + Catalog: cat, + CrawlerName: "Demo Crawler", + PythonPath: wrapper, + ScriptPath: dummyScript, }) res, err := c.RunOnce(ctx, 1) if err != nil { @@ -61,6 +69,9 @@ func TestCrawlerRunOnceImportsLocalFileAndSkipsExisting(t *testing.T) { if v.Title != "Imported From Helper" || v.FileID != "abc-123.mp4" || v.Size == 0 { t.Fatalf("video = title:%q file:%q size:%d", v.Title, v.FileID, v.Size) } + if !hasString(v.Tags, "Demo Crawler") { + t.Fatalf("video tags = %#v, want crawler name tag", v.Tags) + } if _, err := os.Stat(filepath.Join(drv.VideosDir(), "abc-123.mp4")); err != nil { t.Fatalf("video file not copied: %v", err) } @@ -267,6 +278,113 @@ func TestCrawlerRunOnceImportsSimpleMediaURLWithoutSourceID(t *testing.T) { } } +func TestCrawlerRunOnceSkipsFingerprintDuplicateAndContinues(t *testing.T) { + ctx := context.Background() + tmp := t.TempDir() + cat, err := catalog.Open(filepath.Join(tmp, "catalog.db")) + if err != nil { + t.Fatalf("open catalog: %v", err) + } + t.Cleanup(func() { + if err := cat.Close(); err != nil { + t.Fatalf("close catalog: %v", err) + } + }) + drv := New(Config{ID: "demo", RootDir: filepath.Join(tmp, "crawler")}) + if err := drv.Init(ctx); err != nil { + t.Fatalf("driver init: %v", err) + } + + seedFile := "seed-canonical.mp4" + if err := os.WriteFile(filepath.Join(drv.VideosDir(), seedFile), []byte(scriptCrawlerDuplicateBytes), 0o644); err != nil { + t.Fatalf("write seed video: %v", err) + } + seed := &catalog.Video{ + ID: "seed-for-hash", + DriveID: drv.ID(), + FileID: seedFile, + Title: "Seed", + Size: int64(len(scriptCrawlerDuplicateBytes)), + PublishedAt: time.Now(), + } + sampled, err := fingerprint.Compute(ctx, drv, seed, fingerprint.Config{}, nil) + if err != nil { + t.Fatalf("compute seed fingerprint: %v", err) + } + _ = os.Remove(filepath.Join(drv.VideosDir(), seedFile)) + + now := time.Now() + if err := cat.UpsertVideo(ctx, &catalog.Video{ + ID: "existing-canonical", + DriveID: "other-drive", + FileID: "existing.mp4", + FileName: "existing.mp4", + Title: "Existing Canonical", + Size: int64(len(scriptCrawlerDuplicateBytes)), + Ext: "mp4", + SampledSHA256: sampled, + FingerprintStatus: "ready", + PublishedAt: now, + CreatedAt: now, + UpdatedAt: now, + }); err != nil { + t.Fatalf("seed canonical video: %v", err) + } + + dummyScript := filepath.Join(tmp, "helper-script") + if err := os.WriteFile(dummyScript, []byte("helper"), 0o755); err != nil { + t.Fatalf("write dummy script: %v", err) + } + wrapper := filepath.Join(tmp, "helper-wrapper.sh") + wrapperScript := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0]) + if err := os.WriteFile(wrapper, []byte(wrapperScript), 0o755); err != nil { + t.Fatalf("write helper wrapper: %v", err) + } + + t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1") + t.Setenv("GO_WANT_SCRIPTCRAWLER_DUP_UNIQUE", "1") + c := NewCrawler(CrawlerConfig{ + Driver: drv, + Catalog: cat, + PythonPath: wrapper, + ScriptPath: dummyScript, + }) + res, err := c.RunOnce(ctx, 1) + if err != nil { + t.Fatalf("run once: %v", err) + } + if res.NewVideos != 1 || res.Skipped != 1 || res.Failed != 0 || res.TotalEntries != 2 { + t.Fatalf("result = total:%d new:%d skipped:%d failed:%d, want 2/1/1/0", res.TotalEntries, res.NewVideos, res.Skipped, res.Failed) + } + if res.CandidateBudget <= res.TargetNew { + t.Fatalf("candidate budget = %d, target = %d; want expanded budget", res.CandidateBudget, res.TargetNew) + } + if _, err := cat.GetVideo(ctx, BuildVideoID("demo", "dup-source")); err == nil { + t.Fatal("duplicate candidate should not be imported") + } + if _, err := os.Stat(filepath.Join(drv.VideosDir(), "dup-source.mp4")); !os.IsNotExist(err) { + t.Fatalf("duplicate local file stat = %v, want removed", err) + } + v, err := cat.GetVideo(ctx, BuildVideoID("demo", "unique-source")) + if err != nil { + t.Fatalf("unique video should be imported: %v", err) + } + if v.SampledSHA256 == "" || v.FingerprintStatus != "ready" { + t.Fatalf("unique fingerprint = %q status=%q, want ready sampled fingerprint", v.SampledSHA256, v.FingerprintStatus) + } + seen, err := cat.ListCrawlerSourceIDs(ctx, Kind, "demo") + if err != nil { + t.Fatalf("list seen source ids: %v", err) + } + seenSet := map[string]bool{} + for _, id := range seen { + seenSet[id] = true + } + if !seenSet["dup-source"] || !seenSet["unique-source"] { + t.Fatalf("seen ids = %#v, want duplicate and imported source ids", seen) + } +} + func TestScriptCrawlerHelperProcess(t *testing.T) { if os.Getenv("GO_WANT_SCRIPTCRAWLER_HELPER") != "1" { return @@ -307,6 +425,41 @@ func TestScriptCrawlerHelperProcess(t *testing.T) { _ = json.NewEncoder(os.Stdout).Encode(event) os.Exit(0) } + if os.Getenv("GO_WANT_SCRIPTCRAWLER_DUP_UNIQUE") == "1" { + duplicateFile := filepath.Join(job.OutputDir, "duplicate.mp4") + if err := os.WriteFile(duplicateFile, []byte(scriptCrawlerDuplicateBytes), 0o644); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(2) + } + uniqueFile := filepath.Join(job.OutputDir, "unique.mp4") + if err := os.WriteFile(uniqueFile, []byte(scriptCrawlerUniqueBytes), 0o644); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(2) + } + for _, event := range []Event{ + { + Type: "item", + Item: Item{ + SourceID: "dup-source", + Title: "Duplicate Candidate", + Author: "helper", + Media: MediaRef{LocalFile: duplicateFile}, + }, + }, + { + Type: "item", + Item: Item{ + SourceID: "unique-source", + Title: "Unique Candidate", + Author: "helper", + Media: MediaRef{LocalFile: uniqueFile}, + }, + }, + } { + _ = json.NewEncoder(os.Stdout).Encode(event) + } + os.Exit(0) + } localFile := filepath.Join(job.OutputDir, "helper.mp4") if err := os.WriteFile(localFile, []byte("helper-video"), 0o644); err != nil { fmt.Fprintln(os.Stderr, err) @@ -324,3 +477,12 @@ func TestScriptCrawlerHelperProcess(t *testing.T) { _ = json.NewEncoder(os.Stdout).Encode(event) os.Exit(0) } + +func hasString(values []string, want string) bool { + for _, value := range values { + if value == want { + return true + } + } + return false +} diff --git a/backend/internal/drives/scriptcrawler/driver.go b/backend/internal/drives/scriptcrawler/driver.go index 554ee7b..21d5804 100644 --- a/backend/internal/drives/scriptcrawler/driver.go +++ b/backend/internal/drives/scriptcrawler/driver.go @@ -147,6 +147,46 @@ func (d *Driver) EnsureDir(context.Context, string) (string, error) { return "", drives.ErrNotSupported } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + if err := ctx.Err(); err != nil { + return err + } + videoPath, err := d.VideoPath(fileID) + if err != nil { + return err + } + info, err := os.Stat(videoPath) + if err != nil { + if os.IsNotExist(err) { + removeThumbCandidates(d.ThumbPath, strings.TrimSuffix(fileID, filepath.Ext(fileID))) + return nil + } + return err + } + if info.IsDir() { + return errors.New("scriptcrawler: refusing to remove directory") + } + if err := os.Remove(videoPath); err != nil && !os.IsNotExist(err) { + return err + } + removeThumbCandidates(d.ThumbPath, strings.TrimSuffix(fileID, filepath.Ext(fileID))) + return nil +} + +func removeThumbCandidates(pathFor func(string) (string, error), stem string) { + stem = strings.TrimSpace(stem) + if stem == "" { + return + } + for _, ext := range []string{".jpg", ".jpeg", ".png", ".webp"} { + path, err := pathFor(stem + ext) + if err != nil { + continue + } + _ = os.Remove(path) + } +} + func safeJoin(root, fileID string) (string, error) { id := strings.TrimSpace(fileID) if id == "" || filepath.Base(id) != id { @@ -170,3 +210,4 @@ func safeJoin(root, fileID string) (string, error) { } var _ drives.Drive = (*Driver)(nil) +var _ drives.Remover = (*Driver)(nil) diff --git a/backend/internal/drives/spider91/driver.go b/backend/internal/drives/spider91/driver.go index a4251a5..beabddf 100644 --- a/backend/internal/drives/spider91/driver.go +++ b/backend/internal/drives/spider91/driver.go @@ -167,6 +167,46 @@ func (d *Driver) EnsureDir(ctx context.Context, pathFromRoot string) (string, er return "", drives.ErrNotSupported } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + if err := ctx.Err(); err != nil { + return err + } + videoPath, err := d.VideoPath(fileID) + if err != nil { + return err + } + info, err := os.Stat(videoPath) + if err != nil { + if os.IsNotExist(err) { + removeThumbCandidates(d.ThumbPath, strings.TrimSuffix(fileID, filepath.Ext(fileID))) + return nil + } + return err + } + if info.IsDir() { + return errors.New("spider91: refusing to remove directory") + } + if err := os.Remove(videoPath); err != nil && !os.IsNotExist(err) { + return err + } + removeThumbCandidates(d.ThumbPath, strings.TrimSuffix(fileID, filepath.Ext(fileID))) + return nil +} + +func removeThumbCandidates(pathFor func(string) (string, error), stem string) { + stem = strings.TrimSpace(stem) + if stem == "" { + return + } + for _, ext := range []string{".jpg", ".jpeg", ".png", ".webp"} { + path, err := pathFor(stem + ext) + if err != nil { + continue + } + _ = os.Remove(path) + } +} + // safeJoin 把 fileID 拼到 root 下,保证最终路径不会逃出 root。 // fileID 必须是单纯的文件名(不含 / 或 .. 等组件)。 func safeJoin(root, fileID string) (string, error) { @@ -192,3 +232,4 @@ func safeJoin(root, fileID string) (string, error) { } var _ drives.Drive = (*Driver)(nil) +var _ drives.Remover = (*Driver)(nil) diff --git a/backend/internal/drives/wopan/driver.go b/backend/internal/drives/wopan/driver.go index 3b003af..82a52fa 100644 --- a/backend/internal/drives/wopan/driver.go +++ b/backend/internal/drives/wopan/driver.go @@ -11,6 +11,7 @@ import ( "time" sdk "github.com/OpenListTeam/wopan-sdk-go" + "github.com/go-resty/resty/v2" "github.com/video-site/backend/internal/drives" ) @@ -145,6 +146,22 @@ func (d *Driver) Upload(ctx context.Context, parentID, name string, r io.Reader, return fid, nil } +func (d *Driver) Remove(ctx context.Context, fileID string) error { + if d.client == nil { + return fmt.Errorf("wopan remove: driver not initialized") + } + fileID = strings.TrimSpace(fileID) + if fileID == "" { + return fmt.Errorf("wopan remove: empty file id") + } + if err := d.client.DeleteFile(d.spaceType(), nil, []string{fileID}, func(req *resty.Request) { + req.SetContext(ctx) + }); err != nil { + return fmt.Errorf("wopan remove: %w", err) + } + return nil +} + func (d *Driver) EnsureDir(ctx context.Context, pathFromRoot string) (string, error) { parts := splitPath(pathFromRoot) currentID := d.rootID @@ -229,3 +246,4 @@ func guessMime(name string) string { // 确保实现接口 var _ drives.Drive = (*Driver)(nil) +var _ drives.Remover = (*Driver)(nil) diff --git a/backend/internal/spider91migrate/migrator.go b/backend/internal/spider91migrate/migrator.go index 67e40bd..6a0cbb6 100644 --- a/backend/internal/spider91migrate/migrator.go +++ b/backend/internal/spider91migrate/migrator.go @@ -82,6 +82,15 @@ type UploadResult struct { Size int64 } +type UploadProgress struct { + DriveID string + State string + CurrentTitle string + QueueLength int + DoneCount int + TotalCount int +} + const ( spider91UploadDirName = "91 Spider" scriptCrawlerUploadRootDirName = "Script Crawlers" @@ -254,9 +263,10 @@ type Config struct { // CaptchaCooldown 是迁移 worker 在遇到 PikPak captcha 错误(error_code // 4002 / 9)后整体进入冷却的时长。冷却期间 runOnce 直接返回,不再发起任何 // PikPak API 请求,避免被进一步风控。0 时默认 5 分钟;< 0 关闭冷却(仅用于测试)。 - CaptchaCooldown time.Duration - CommonThumbDir string - OnMigrated func(videoID string) + CaptchaCooldown time.Duration + CommonThumbDir string + OnMigrated func(videoID string) + OnUploadProgress func(UploadProgress) } type Migrator struct { @@ -444,6 +454,20 @@ func (m *Migrator) runOnce(ctx context.Context) { } } +func (m *Migrator) reportUploadProgress(progress UploadProgress) { + if m == nil || m.cfg.OnUploadProgress == nil { + return + } + progress.DriveID = strings.TrimSpace(progress.DriveID) + if progress.DriveID == "" { + return + } + if progress.State == "" { + progress.State = "idle" + } + m.cfg.OnUploadProgress(progress) +} + // targetKindForLog 把当前目标盘 kind 转成对人友好的简称,用于日志。 // 解析失败时回退 "target"。 func (m *Migrator) targetKindForLog() string { @@ -669,8 +693,17 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e if skip < len(files) { candidates = files[skip:] } else { + m.reportUploadProgress(UploadProgress{DriveID: src.ID(), State: "idle"}) return 0, nil } + totalCandidates := len(candidates) + m.reportUploadProgress(UploadProgress{ + DriveID: src.ID(), + State: "uploading", + QueueLength: totalCandidates, + TotalCount: totalCandidates, + }) + defer m.reportUploadProgress(UploadProgress{DriveID: src.ID(), State: "idle"}) localVideos, err := m.cfg.Catalog.ListVideosByDriveID(ctx, src.ID(), 100000) if err != nil { @@ -684,7 +717,8 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e } migrated := 0 - for _, f := range candidates { + processed := 0 + for index, f := range candidates { if err := ctx.Err(); err != nil { return migrated, err } @@ -694,11 +728,35 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e v := m.findVideoForLocalFile(ctx, plan, f.name, byFileID) if v == nil { + processed++ + m.reportUploadProgress(UploadProgress{ + DriveID: src.ID(), + State: "uploading", + QueueLength: maxInt(totalCandidates-processed, 0), + DoneCount: processed, + TotalCount: totalCandidates, + }) continue } + m.reportUploadProgress(UploadProgress{ + DriveID: src.ID(), + State: "uploading", + CurrentTitle: v.Title, + QueueLength: maxInt(totalCandidates-index-1, 0), + DoneCount: processed, + TotalCount: totalCandidates, + }) if v.DriveID != src.ID() { CleanupSpider91Local(src, f.name) + processed++ + m.reportUploadProgress(UploadProgress{ + DriveID: src.ID(), + State: "uploading", + QueueLength: maxInt(totalCandidates-processed, 0), + DoneCount: processed, + TotalCount: totalCandidates, + }) continue } @@ -718,6 +776,14 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e m.cfg.OnMigrated(v.ID) } } + processed++ + m.reportUploadProgress(UploadProgress{ + DriveID: src.ID(), + State: "uploading", + QueueLength: maxInt(totalCandidates-processed, 0), + DoneCount: processed, + TotalCount: totalCandidates, + }) continue } @@ -728,6 +794,14 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e continue } if !ready { + processed++ + m.reportUploadProgress(UploadProgress{ + DriveID: src.ID(), + State: "uploading", + QueueLength: maxInt(totalCandidates-processed, 0), + DoneCount: processed, + TotalCount: totalCandidates, + }) continue } } @@ -752,10 +826,25 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e m.cfg.OnMigrated(v.ID) } } + processed++ + m.reportUploadProgress(UploadProgress{ + DriveID: src.ID(), + State: "uploading", + QueueLength: maxInt(totalCandidates-processed, 0), + DoneCount: processed, + TotalCount: totalCandidates, + }) } return migrated, nil } +func maxInt(a, b int) int { + if a > b { + return a + } + return b +} + func (m *Migrator) findVideoForLocalFile(ctx context.Context, plan migrationPlan, localFile string, byFileID map[string]*catalog.Video) *catalog.Video { if v := byFileID[localFile]; v != nil { return v diff --git a/docs/crawler-protocol.md b/docs/crawler-protocol.md index e6abd3b..105e37c 100644 --- a/docs/crawler-protocol.md +++ b/docs/crawler-protocol.md @@ -28,7 +28,9 @@ python3 /path/to/crawler.py --job /path/to/job.json "mode": "crawl", "run_id": "20260609T120000Z", "crawler_id": "example", - "target_new": 10, + "target_new": 100, + "unique_target": 10, + "candidate_budget": 100, "seen_source_ids_file": "/data/scriptcrawlers/example/.crawl/seen.txt", "output_dir": "/data/scriptcrawlers/example/output", "config": { @@ -40,6 +42,14 @@ python3 /path/to/crawler.py --job /path/to/job.json } ``` +`unique_target` is the user's requested number of content-unique new videos. +`candidate_budget` is how many candidate items the script should emit at most. +For backward compatibility, `target_new` is set to the same value as +`candidate_budget`, because older scripts only read `target_new`. + +The backend may skip candidates that are already present by sampled content +fingerprint. Skipped duplicate candidates do not count toward `unique_target`. + ## Importing Scripts Crawler scripts are configured from the admin crawler page. A script can be @@ -118,4 +128,6 @@ Optional progress/done events: `output_dir`. - Scripts can read `seen_source_ids_file` and skip known IDs when they provide stable `source_id` values. The backend still dedupes every item. -- The backend stops the process after `target_new` new videos are imported. +- The backend stops the process after `unique_target` content-unique new videos + are imported. Scripts should stop after emitting `candidate_budget` candidates + even if the backend has not reached `unique_target`. diff --git a/src/admin/ConfirmModal.tsx b/src/admin/ConfirmModal.tsx index ec8bffc..6f73640 100644 --- a/src/admin/ConfirmModal.tsx +++ b/src/admin/ConfirmModal.tsx @@ -1,3 +1,4 @@ +import type { ReactNode } from "react"; import { AlertTriangle } from "lucide-react"; import { Modal } from "./Modal"; @@ -12,6 +13,7 @@ type ConfirmModalProps = { centerMessage?: boolean; modalClassName?: string; loading?: boolean; + children?: ReactNode; onCancel: () => void; onConfirm: () => void; }; @@ -27,6 +29,7 @@ export function ConfirmModal({ centerMessage = false, modalClassName = "", loading = false, + children, onCancel, onConfirm, }: ConfirmModalProps) { @@ -65,6 +68,7 @@ export function ConfirmModal({ ))} )} + {children} diff --git a/src/admin/CrawlersPage.tsx b/src/admin/CrawlersPage.tsx index ba8c4f1..00f26df 100644 --- a/src/admin/CrawlersPage.tsx +++ b/src/admin/CrawlersPage.tsx @@ -30,7 +30,7 @@ import { generationStateClass, generationStateLabel } from "./drive/constants"; import { Spider91UploadTargetField } from "./drive/Spider91UploadTargetField"; import { SpiderIcon } from "./icons/SpiderIcon"; -const BUSY_STATES = new Set(["scanning", "generating", "queued"]); +const BUSY_STATES = new Set(["scanning", "generating", "uploading", "queued"]); const POLL_INTERVAL_MS = 5000; const UPLOAD_TARGET_KINDS = new Set(["p115", "pikpak", "p123", "googledrive", "onedrive"]); @@ -43,7 +43,8 @@ function crawlerBusy(crawler: api.AdminCrawler) { statusBusy(crawler.scanGenerationStatus) || statusBusy(crawler.thumbnailGenerationStatus) || statusBusy(crawler.previewGenerationStatus) || - statusBusy(crawler.fingerprintGenerationStatus) + statusBusy(crawler.fingerprintGenerationStatus) || + statusBusy(crawler.uploadGenerationStatus) ); } @@ -273,12 +274,14 @@ function crawlerStages(crawler: api.AdminCrawler): StageInfo[] { { key: "thumbnail", label: "封面", status: crawler.thumbnailGenerationStatus }, { key: "preview", label: "预览", status: crawler.previewGenerationStatus }, { key: "fingerprint", label: "指纹", status: crawler.fingerprintGenerationStatus }, + { key: "upload", label: "上传", status: crawler.uploadGenerationStatus }, ]; } function stageStateLabel(stage: StageInfo): string { const state = stage.status?.state || "idle"; if (stage.key === "scan" && state === "scanning") return "抓取中"; + if (stage.key === "upload" && state === "uploading") return "上传中"; return generationStateLabel(state); } @@ -364,6 +367,7 @@ function CrawlerRow({ function CrawlerDetail({ crawler }: { crawler: api.AdminCrawler }) { const scan = crawler.scanGenerationStatus; + const upload = crawlerUploadDisplayStatus(crawler); return (
+ 确定删除「{detail.title}」吗?此操作会从管理库移除该视频。 +
+