feat: 完善爬虫去重、上传进度和源文件删除

为脚本爬虫增加候选预算、重复 source 记录和默认爬虫标签,避免重复视频占满目标新增数量。

新增爬虫上传迁移进度上报和管理页上传卡片,让每个爬虫可以展示本轮上传处理情况。

为视频删除增加可选删除云盘源文件能力,补齐播放页、管理页交互,并为多个网盘驱动实现 Remove 接口。

补充相关测试并更新爬虫协议文档。
This commit is contained in:
nianzhibai
2026-06-11 22:41:24 +08:00
parent a8ccc19e9e
commit 96e423b952
33 changed files with 1620 additions and 106 deletions
+1
View File
@@ -41,4 +41,5 @@ __pycache__/
/image003.jpg
/image004.jpg
/image005.png
/image006.png
/image02.png
+183 -13
View File
@@ -86,6 +86,7 @@ func main() {
Registry: app.registry,
GetTargetDriveID: func() string { return app.Spider91UploadDriveID() },
CommonThumbDir: app.commonThumbsDir(),
OnUploadProgress: app.updateCrawlerUploadProgress,
})
// 初始化本地内置盘;外部云盘放到 HTTP 服务启动后异步挂载,避免上游
@@ -217,8 +218,8 @@ func main() {
OnRegenFailedFingerprints: func(driveID string) {
go app.regenFailedFingerprints(ctx, driveID)
},
OnDeleteVideo: func(reqCtx context.Context, videoID string) (api.DeleteVideoResult, error) {
return app.deleteVideo(reqCtx, videoID)
OnDeleteVideo: func(reqCtx context.Context, videoID string, deleteSource bool) (api.DeleteVideoResult, error) {
return app.deleteVideo(reqCtx, videoID, deleteSource)
},
GetDriveGenerationStatuses: func() map[string]api.DriveGenerationStatuses {
return app.driveGenerationStatuses()
@@ -363,6 +364,10 @@ type App struct {
// crawlerUploadRunning 去重"保存上传目标后检查本地未上传文件"的后台任务。
crawlerUploadMu sync.Mutex
crawlerUploadRunning map[string]bool
// uploadProgress 跟踪脚本爬虫迁移到云盘时的实时上传状态。
uploadProgressMu sync.Mutex
uploadProgress map[string]driveUploadProgress
}
type driveScanProgress struct {
@@ -370,6 +375,14 @@ type driveScanProgress struct {
Added int
}
type driveUploadProgress struct {
State string
CurrentTitle string
QueueLength int
DoneCount int
TotalCount int
}
type spider91MigrationRunner interface {
RunOnce(ctx context.Context) error
}
@@ -522,6 +535,13 @@ func (a *App) driveGenerationStatuses() map[string]api.DriveGenerationStatuses {
}
a.scanQueueMu.Unlock()
a.uploadProgressMu.Lock()
uploadProgresses := make(map[string]driveUploadProgress, len(a.uploadProgress))
for id, progress := range a.uploadProgress {
uploadProgresses[id] = progress
}
a.uploadProgressMu.Unlock()
a.mu.Lock()
previewWorkers := make(map[string]*preview.Worker, len(a.workers))
for id, worker := range a.workers {
@@ -537,7 +557,7 @@ func (a *App) driveGenerationStatuses() map[string]api.DriveGenerationStatuses {
}
a.mu.Unlock()
out := make(map[string]api.DriveGenerationStatuses, len(scanningDrives)+len(previewWorkers)+len(thumbWorkers)+len(fingerprintWorkers))
out := make(map[string]api.DriveGenerationStatuses, len(scanningDrives)+len(previewWorkers)+len(thumbWorkers)+len(fingerprintWorkers)+len(uploadProgresses))
for id, running := range scanningDrives {
if !running {
continue
@@ -566,9 +586,75 @@ func (a *App) driveGenerationStatuses() map[string]api.DriveGenerationStatuses {
status.Fingerprint = generationStatusFromFingerprint(worker.Status())
out[id] = status
}
for id, progress := range uploadProgresses {
state := progress.State
if state == "" {
state = "idle"
}
status := out[id]
status.Upload = api.GenerationStatus{
State: state,
CurrentTitle: progress.CurrentTitle,
QueueLength: progress.QueueLength,
DoneCount: progress.DoneCount,
TotalCount: progress.TotalCount,
}
out[id] = status
}
return out
}
func (a *App) updateCrawlerUploadProgress(progress spider91migrate.UploadProgress) {
driveID := strings.TrimSpace(progress.DriveID)
if driveID == "" {
return
}
state := strings.TrimSpace(progress.State)
if state == "" {
state = "idle"
}
a.uploadProgressMu.Lock()
if a.uploadProgress == nil {
a.uploadProgress = make(map[string]driveUploadProgress)
}
if state == "idle" {
delete(a.uploadProgress, driveID)
a.uploadProgressMu.Unlock()
return
}
a.uploadProgress[driveID] = driveUploadProgress{
State: state,
CurrentTitle: strings.TrimSpace(progress.CurrentTitle),
QueueLength: progress.QueueLength,
DoneCount: progress.DoneCount,
TotalCount: progress.TotalCount,
}
a.uploadProgressMu.Unlock()
}
func (a *App) clearCrawlerUploadProgress(driveID string) bool {
driveID = strings.TrimSpace(driveID)
if driveID == "" {
return false
}
a.uploadProgressMu.Lock()
_, ok := a.uploadProgress[driveID]
delete(a.uploadProgress, driveID)
a.uploadProgressMu.Unlock()
return ok
}
func (a *App) clearAllCrawlerUploadProgress() []string {
a.uploadProgressMu.Lock()
ids := make([]string, 0, len(a.uploadProgress))
for id := range a.uploadProgress {
ids = append(ids, id)
}
a.uploadProgress = nil
a.uploadProgressMu.Unlock()
return ids
}
func generationStatusFromPreview(status preview.TaskStatus) api.GenerationStatus {
state := status.State
if state == "" {
@@ -905,6 +991,7 @@ func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) {
c := scriptcrawler.NewCrawler(scriptcrawler.CrawlerConfig{
Driver: drv,
Catalog: a.cat,
CrawlerName: d.Name,
SourceKind: sourceKind,
PythonPath: pythonPath,
ScriptPath: scriptPath,
@@ -929,6 +1016,7 @@ func (a *App) attachScriptCrawler(d *catalog.Drive, drv *scriptcrawler.Driver) {
a.scriptCrawlers[driveID] = c
a.mu.Unlock()
a.ensureScriptCrawlerNameTag(driveID, sourceKind, d.Name)
if sourceKind == spider91.Kind {
a.ensureSpider91SourceTag(driveID)
}
@@ -959,6 +1047,24 @@ func (a *App) ensureSpider91SourceTag(driveID string) {
}()
}
func (a *App) ensureScriptCrawlerNameTag(driveID, sourceKind, crawlerName string) {
tagName := strings.TrimSpace(crawlerName)
if tagName == "" {
tagName = strings.TrimSpace(driveID)
}
if tagName == "" {
return
}
bgCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
go func() {
defer cancel()
prefix := scriptcrawler.BuildVideoIDForKind(sourceKind, driveID, "")
if _, err := a.cat.EnsureTagForVideoIDPrefix(bgCtx, prefix, tagName, nil, "legacy"); err != nil {
log.Printf("[scriptcrawler] drive=%s ensure crawler tag %q: %v", driveID, tagName, err)
}
}()
}
func (a *App) registerPreviewWorkers(ctx context.Context, driveID string, worker *preview.Worker, thumbWorker *preview.ThumbWorker, fingerprintWorker *fingerprint.Worker, cancel context.CancelFunc) {
a.registerPreviewWorkersWithOptions(ctx, driveID, worker, thumbWorker, fingerprintWorker, cancel, true)
}
@@ -1185,6 +1291,13 @@ func (a *App) driveHasActiveWork(driveID string) bool {
return true
}
a.uploadProgressMu.Lock()
uploading := a.uploadProgress[driveID].State != ""
a.uploadProgressMu.Unlock()
if uploading {
return true
}
a.mu.Lock()
previewWorker := a.workers[driveID]
thumbWorker := a.thumbWorkers[driveID]
@@ -1304,10 +1417,11 @@ func (a *App) stopDriveTasks(ctx context.Context, driveID string) bool {
canceled := a.cancelDriveTaskContexts(driveID)
queued := a.clearQueuedDriveTask(driveID)
fingerprintQueued := a.clearFingerprintQueueing(driveID)
uploading := a.clearCrawlerUploadProgress(driveID)
hadWorkers := a.resetDriveGenerationWorkers(ctx, driveID)
stopped := canceled > 0 || queued || fingerprintQueued || hadWorkers
log.Printf("[tasks] stop drive=%s stopped=%v canceled_tasks=%d queued=%v fingerprint_queue=%v workers=%v",
driveID, stopped, canceled, queued, fingerprintQueued, hadWorkers)
stopped := canceled > 0 || queued || fingerprintQueued || uploading || hadWorkers
log.Printf("[tasks] stop drive=%s stopped=%v canceled_tasks=%d queued=%v fingerprint_queue=%v uploading=%v workers=%v",
driveID, stopped, canceled, queued, fingerprintQueued, uploading, hadWorkers)
return stopped
}
@@ -1325,6 +1439,9 @@ func (a *App) stopAllDriveTasks(ctx context.Context) int {
for _, id := range a.clearAllFingerprintQueueing() {
stoppedIDs[id] = struct{}{}
}
for _, id := range a.clearAllCrawlerUploadProgress() {
stoppedIDs[id] = struct{}{}
}
for _, id := range a.resetAllDriveGenerationWorkers(ctx) {
stoppedIDs[id] = struct{}{}
}
@@ -1679,7 +1796,7 @@ func (a *App) cleanupMissingDriveVideos(ctx context.Context, driveID string, liv
return removed, nil
}
func (a *App) deleteVideo(ctx context.Context, videoID string) (api.DeleteVideoResult, error) {
func (a *App) deleteVideo(ctx context.Context, videoID string, deleteSource bool) (api.DeleteVideoResult, error) {
if a == nil || a.cat == nil {
return api.DeleteVideoResult{}, sql.ErrNoRows
}
@@ -1688,6 +1805,14 @@ func (a *App) deleteVideo(ctx context.Context, videoID string) (api.DeleteVideoR
return api.DeleteVideoResult{}, err
}
deletedSource := false
if deleteSource {
deletedSource, err = a.removeVideoSourceFile(ctx, v)
if err != nil {
return api.DeleteVideoResult{}, err
}
}
localDir := ""
if a.cfg != nil {
localDir = a.cfg.Storage.LocalPreviewDir
@@ -1695,16 +1820,61 @@ func (a *App) deleteVideo(ctx context.Context, videoID string) (api.DeleteVideoR
if err := removeLocalVideoAssets(localDir, v); err != nil {
return api.DeleteVideoResult{}, fmt.Errorf("remove local assets for %s: %w", v.ID, err)
}
deletedSource, err := a.removeSpider91SourceFile(ctx, v)
if err != nil {
return api.DeleteVideoResult{}, err
}
if err := a.cat.DeleteVideoWithTombstone(ctx, v.ID); err != nil {
return api.DeleteVideoResult{}, err
}
return api.DeleteVideoResult{OK: true, DeletedSource: deletedSource}, nil
}
func (a *App) removeVideoSourceFile(ctx context.Context, v *catalog.Video) (bool, error) {
if v == nil {
return false, errors.New("remove video source: empty video")
}
if a == nil {
return false, fmt.Errorf("remove video source %s: app unavailable: %w", v.ID, drives.ErrNotSupported)
}
if strings.HasPrefix(v.ID, "spider91-") {
deleted, err := a.removeSpider91SourceFile(ctx, v)
if err != nil || deleted {
return deleted, err
}
if a.cat != nil {
if drive, driveErr := a.cat.GetDrive(ctx, v.DriveID); driveErr == nil && drive.Kind == spider91.Kind {
return false, nil
}
} else if strings.HasPrefix(v.ID, "spider91-"+v.DriveID+"-") {
return false, nil
}
}
fileID := strings.TrimSpace(v.FileID)
if fileID == "" {
return false, fmt.Errorf("remove video source %s: empty file id", v.ID)
}
if a == nil || a.registry == nil {
return false, fmt.Errorf("remove video source %s: drive registry unavailable: %w", v.ID, drives.ErrNotSupported)
}
if _, ok := a.registry.Get(v.DriveID); !ok {
if a.cat == nil {
return false, fmt.Errorf("remove video source %s: drive %s not attached: %w", v.ID, v.DriveID, drives.ErrNotSupported)
}
if err := a.ensureDriveAttached(ctx, v.DriveID); err != nil {
return false, fmt.Errorf("remove video source %s: attach drive %s: %w", v.ID, v.DriveID, err)
}
}
drv, ok := a.registry.Get(v.DriveID)
if !ok {
return false, fmt.Errorf("remove video source %s: drive %s not attached: %w", v.ID, v.DriveID, drives.ErrNotSupported)
}
remover, ok := drv.(drives.Remover)
if !ok {
return false, fmt.Errorf("remove video source %s: drive %s (%s) does not support source deletion: %w", v.ID, v.DriveID, drv.Kind(), drives.ErrNotSupported)
}
if err := remover.Remove(ctx, fileID); err != nil {
return false, fmt.Errorf("remove video source %s from drive %s: %w", v.ID, v.DriveID, err)
}
return true, nil
}
func (a *App) removeSpider91SourceFile(ctx context.Context, v *catalog.Video) (bool, error) {
if a == nil || a.cfg == nil || v == nil || !strings.HasPrefix(v.ID, "spider91-") {
return false, nil
@@ -2642,8 +2812,8 @@ func (a *App) runScriptCrawlerCrawlWithTaskContext(ctx context.Context, driveID
if runErr != nil {
log.Printf("[scriptcrawler] drive=%s crawl failed: %v", driveID, runErr)
} else if res != nil {
log.Printf("[scriptcrawler] drive=%s crawl done target=%d total=%d new=%d skipped=%d failed=%d seen_snapshot=%d",
driveID, res.TargetNew, res.TotalEntries, res.NewVideos, res.Skipped, res.Failed, res.SeenSnapshot)
log.Printf("[scriptcrawler] drive=%s crawl done target=%d candidate_budget=%d total=%d new=%d skipped=%d failed=%d seen_snapshot=%d",
driveID, res.TargetNew, res.CandidateBudget, res.TotalEntries, res.NewVideos, res.Skipped, res.Failed, res.SeenSnapshot)
}
if d.Credentials == nil {
+85 -2
View File
@@ -1243,7 +1243,7 @@ func TestDeleteVideoRemovesGeneratedAssetsKeepsLocalOriginalAndTombstones(t *tes
cfg: &config.Config{Storage: config.Storage{LocalPreviewDir: localDir}},
cat: cat,
}
result, err := app.deleteVideo(ctx, "localstorage-local-main-file")
result, err := app.deleteVideo(ctx, "localstorage-local-main-file", false)
if err != nil {
t.Fatalf("delete video: %v", err)
}
@@ -1270,6 +1270,73 @@ func TestDeleteVideoRemovesGeneratedAssetsKeepsLocalOriginalAndTombstones(t *tes
}
}
func TestDeleteVideoRemovesSourceFileWhenRequested(t *testing.T) {
ctx := context.Background()
root := t.TempDir()
localDir := filepath.Join(root, "previews")
cat, err := catalog.Open(filepath.Join(t.TempDir(), "catalog.db"))
if err != nil {
t.Fatalf("open catalog: %v", err)
}
t.Cleanup(func() { _ = cat.Close() })
previewPath := filepath.Join(localDir, "video-with-source.mp4")
thumbPath := filepath.Join(localDir, "thumbs", "video-with-source.jpg")
for _, path := range []string{previewPath, thumbPath} {
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
t.Fatalf("mkdir %s: %v", path, err)
}
if err := os.WriteFile(path, []byte("file"), 0o644); err != nil {
t.Fatalf("write %s: %v", path, err)
}
}
now := time.Now()
if err := cat.UpsertVideo(ctx, &catalog.Video{
ID: "video-with-source",
DriveID: "source-drive",
FileID: "source-file",
FileName: "clip.mp4",
Title: "Source File",
PreviewLocal: previewPath,
PreviewStatus: "ready",
ThumbnailURL: "/p/thumb/video-with-source",
Size: 123,
PublishedAt: now,
CreatedAt: now,
UpdatedAt: now,
}); err != nil {
t.Fatalf("seed video: %v", err)
}
registry := proxy.NewRegistry()
drv := &serverRemovableFakeDrive{id: "source-drive"}
registry.Set(drv.ID(), drv)
app := &App{
cfg: &config.Config{Storage: config.Storage{LocalPreviewDir: localDir}},
cat: cat,
registry: registry,
}
result, err := app.deleteVideo(ctx, "video-with-source", true)
if err != nil {
t.Fatalf("delete video: %v", err)
}
if !result.OK || !result.DeletedSource {
t.Fatalf("delete result = %#v, want source deleted", result)
}
if got, want := drv.removedFileID, "source-file"; got != want {
t.Fatalf("removed source fileID = %q, want %q", got, want)
}
if _, err := cat.GetVideo(ctx, "video-with-source"); err != sql.ErrNoRows {
t.Fatalf("deleted video lookup error = %v, want sql.ErrNoRows", err)
}
for _, path := range []string{previewPath, thumbPath} {
if _, err := os.Stat(path); !os.IsNotExist(err) {
t.Fatalf("generated asset %s still exists, stat err=%v", path, err)
}
}
}
func TestDeleteVideoRemovesSpider91SourceFile(t *testing.T) {
ctx := context.Background()
root := t.TempDir()
@@ -1326,7 +1393,7 @@ func TestDeleteVideoRemovesSpider91SourceFile(t *testing.T) {
t.Fatalf("seed video: %v", err)
}
result, err := app.deleteVideo(ctx, "spider91-spider-main-source")
result, err := app.deleteVideo(ctx, "spider91-spider-main-source", true)
if err != nil {
t.Fatalf("delete spider video: %v", err)
}
@@ -1740,6 +1807,22 @@ type serverFakeKindDrive struct {
func (d *serverFakeKindDrive) Kind() string { return d.kind }
func (d *serverFakeKindDrive) ID() string { return d.id }
type serverRemovableFakeDrive struct {
serverFakeDrive
id string
removedFileID string
}
func (d *serverRemovableFakeDrive) Kind() string { return "fake-removable" }
func (d *serverRemovableFakeDrive) ID() string { return d.id }
func (d *serverRemovableFakeDrive) Remove(ctx context.Context, fileID string) error {
if err := ctx.Err(); err != nil {
return err
}
d.removedFileID = fileID
return nil
}
type serverFakeSpider91MigrationRunner struct {
called int
}
+23 -2
View File
@@ -57,7 +57,7 @@ type AdminServer struct {
OnRegenFailedPreviews func(driveID string)
OnRegenFailedThumbnails func(driveID string)
OnRegenFailedFingerprints func(driveID string)
OnDeleteVideo func(ctx context.Context, videoID string) (DeleteVideoResult, error)
OnDeleteVideo func(ctx context.Context, videoID string, deleteSource bool) (DeleteVideoResult, error)
GetDriveGenerationStatuses func() map[string]DriveGenerationStatuses
// OnTeaserEnabledChanged 在 per-drive 预览视频开关被切换后调用。
// enabled=true 时上层应该重新把 pending 预览视频入队(类似旧的全局开关从关到开);
@@ -103,6 +103,8 @@ type GenerationStatus struct {
CooldownUntil string `json:"cooldownUntil,omitempty"`
ScannedCount int `json:"scannedCount"`
AddedCount int `json:"addedCount"`
DoneCount int `json:"doneCount"`
TotalCount int `json:"totalCount"`
}
type DriveGenerationStatuses struct {
@@ -110,6 +112,7 @@ type DriveGenerationStatuses struct {
Thumbnail GenerationStatus `json:"thumbnail"`
Preview GenerationStatus `json:"preview"`
Fingerprint GenerationStatus `json:"fingerprint"`
Upload GenerationStatus `json:"upload"`
}
type NightlyJobStatus struct {
@@ -127,6 +130,10 @@ type DeleteVideoResult struct {
DeletedSource bool `json:"deletedSource"`
}
type deleteVideoReq struct {
DeleteSource bool `json:"deleteSource"`
}
func (a *AdminServer) Register(r chi.Router) {
r.Route("/admin/api", func(r chi.Router) {
// 登录、登出和首次部署初始化不需要鉴权
@@ -637,6 +644,7 @@ type crawlerDTO struct {
ThumbnailGenerationStatus GenerationStatus `json:"thumbnailGenerationStatus"`
PreviewGenerationStatus GenerationStatus `json:"previewGenerationStatus"`
FingerprintGenerationStatus GenerationStatus `json:"fingerprintGenerationStatus"`
UploadGenerationStatus GenerationStatus `json:"uploadGenerationStatus"`
ThumbnailReadyCount int `json:"thumbnailReadyCount"`
ThumbnailPendingCount int `json:"thumbnailPendingCount"`
ThumbnailFailedCount int `json:"thumbnailFailedCount"`
@@ -698,6 +706,9 @@ func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, assets catalog.Crawle
if generation.Fingerprint.State == "" {
generation.Fingerprint.State = "idle"
}
if generation.Upload.State == "" {
generation.Upload.State = "idle"
}
lastCrawlAt := int64(0)
if raw := strings.TrimSpace(d.Credentials["last_crawl_at"]); raw != "" {
if v, err := strconv.ParseInt(raw, 10, 64); err == nil {
@@ -719,6 +730,7 @@ func (a *AdminServer) crawlerDTOForDrive(d *catalog.Drive, assets catalog.Crawle
ThumbnailGenerationStatus: generation.Thumbnail,
PreviewGenerationStatus: generation.Preview,
FingerprintGenerationStatus: generation.Fingerprint,
UploadGenerationStatus: generation.Upload,
ThumbnailReadyCount: assets.Thumbnail.Ready,
ThumbnailPendingCount: assets.Thumbnail.Pending,
ThumbnailFailedCount: assets.Thumbnail.Failed,
@@ -1824,12 +1836,21 @@ func (a *AdminServer) handleDeleteVideo(w http.ResponseWriter, r *http.Request)
writeErr(w, http.StatusBadRequest, errors.New("invalid video id"))
return
}
var body deleteVideoReq
if r.Body != nil {
defer r.Body.Close()
decoder := json.NewDecoder(r.Body)
if err := decoder.Decode(&body); err != nil && !errors.Is(err, io.EOF) {
writeErr(w, http.StatusBadRequest, err)
return
}
}
var (
result DeleteVideoResult
err error
)
if a.OnDeleteVideo != nil {
result, err = a.OnDeleteVideo(r.Context(), id)
result, err = a.OnDeleteVideo(r.Context(), id, body.DeleteSource)
} else {
err = a.Catalog.DeleteVideoWithTombstone(r.Context(), id)
result = DeleteVideoResult{OK: err == nil}
+57
View File
@@ -121,6 +121,63 @@ func TestHandleSetupStoresCredentialsAndCreatesSession(t *testing.T) {
}
}
func TestHandleDeleteVideoDefaultsDeleteSourceFalse(t *testing.T) {
req := httptest.NewRequest(http.MethodDelete, "/admin/api/videos/video-1", nil)
rctx := chi.NewRouteContext()
rctx.URLParams.Add("id", "video-1")
req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))
rr := httptest.NewRecorder()
called := false
(&AdminServer{
OnDeleteVideo: func(ctx context.Context, videoID string, deleteSource bool) (DeleteVideoResult, error) {
called = true
if videoID != "video-1" {
t.Fatalf("videoID = %q, want video-1", videoID)
}
if deleteSource {
t.Fatal("deleteSource defaulted to true")
}
return DeleteVideoResult{OK: true}, nil
},
}).handleDeleteVideo(rr, req)
if rr.Code != http.StatusOK {
t.Fatalf("status = %d, want 200; body = %s", rr.Code, rr.Body.String())
}
if !called {
t.Fatal("OnDeleteVideo was not called")
}
}
func TestHandleDeleteVideoPassesDeleteSourceOption(t *testing.T) {
req := httptest.NewRequest(http.MethodDelete, "/admin/api/videos/video-1", strings.NewReader(`{"deleteSource":true}`))
rctx := chi.NewRouteContext()
rctx.URLParams.Add("id", "video-1")
req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))
rr := httptest.NewRecorder()
(&AdminServer{
OnDeleteVideo: func(ctx context.Context, videoID string, deleteSource bool) (DeleteVideoResult, error) {
if !deleteSource {
t.Fatal("deleteSource = false, want true")
}
return DeleteVideoResult{OK: true, DeletedSource: true}, nil
},
}).handleDeleteVideo(rr, req)
if rr.Code != http.StatusOK {
t.Fatalf("status = %d, want 200; body = %s", rr.Code, rr.Body.String())
}
var got DeleteVideoResult
if err := json.Unmarshal(rr.Body.Bytes(), &got); err != nil {
t.Fatalf("decode response: %v", err)
}
if !got.DeletedSource {
t.Fatalf("DeletedSource = false, want true; response = %s", rr.Body.String())
}
}
func TestHandleCheckUpdateReportsNewRelease(t *testing.T) {
dir := t.TempDir()
versionFile := filepath.Join(dir, ".version")
+83 -8
View File
@@ -88,6 +88,11 @@ type Video struct {
func (c *Catalog) UpsertVideo(ctx context.Context, v *Video) error {
existed := c.videoExists(ctx, v.ID)
v.ContentHash = normalizeContentHash(v.ContentHash)
v.SampledSHA256 = normalizeContentHash(v.SampledSHA256)
fingerprintStatus := nullableStatus(v.FingerprintStatus)
if v.SampledSHA256 != "" && (v.FingerprintStatus == "" || v.FingerprintStatus == "pending") {
fingerprintStatus = "ready"
}
tagsJSON, _ := json.Marshal(v.Tags)
badgesJSON, _ := json.Marshal(v.Badges)
now := time.Now().UnixMilli()
@@ -98,13 +103,13 @@ func (c *Catalog) UpsertVideo(ctx context.Context, v *Video) error {
_, err := c.db.ExecContext(ctx, `
INSERT INTO videos (
id, drive_id, file_id, file_name, content_hash, parent_id, title, author, tags,
id, drive_id, file_id, file_name, content_hash, sampled_sha256, fingerprint_status, fingerprint_error, parent_id, title, author, tags,
duration_seconds, size_bytes, ext, quality, thumbnail_url, thumbnail_status,
preview_file_id, preview_local, preview_status,
views, favorites, comments, likes, dislikes,
category, hidden, badges, description, published_at, created_at, updated_at
) VALUES (
?, ?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, CASE WHEN COALESCE(?, '') != '' THEN 'ready' ELSE 'pending' END,
?, ?, ?,
?, ?, ?, ?, ?,
@@ -123,15 +128,18 @@ ON CONFLICT(id) DO UPDATE SET
ELSE videos.content_hash
END,
sampled_sha256 = CASE
WHEN videos.size_bytes != excluded.size_bytes THEN ''
WHEN videos.size_bytes != excluded.size_bytes THEN excluded.sampled_sha256
WHEN excluded.sampled_sha256 != '' THEN excluded.sampled_sha256
ELSE videos.sampled_sha256
END,
fingerprint_status = CASE
WHEN videos.size_bytes != excluded.size_bytes THEN 'pending'
WHEN videos.size_bytes != excluded.size_bytes THEN COALESCE(excluded.fingerprint_status, 'pending')
WHEN excluded.sampled_sha256 != '' THEN COALESCE(excluded.fingerprint_status, 'ready')
ELSE COALESCE(videos.fingerprint_status, 'pending')
END,
fingerprint_error = CASE
WHEN videos.size_bytes != excluded.size_bytes THEN ''
WHEN videos.size_bytes != excluded.size_bytes THEN COALESCE(excluded.fingerprint_error, '')
WHEN excluded.sampled_sha256 != '' THEN COALESCE(excluded.fingerprint_error, '')
ELSE COALESCE(videos.fingerprint_error, '')
END,
duration_seconds= excluded.duration_seconds,
@@ -152,7 +160,7 @@ ON CONFLICT(id) DO UPDATE SET
description = excluded.description,
updated_at = excluded.updated_at
`,
v.ID, v.DriveID, v.FileID, v.FileName, v.ContentHash, v.ParentID, v.Title, v.Author, string(tagsJSON),
v.ID, v.DriveID, v.FileID, v.FileName, v.ContentHash, v.SampledSHA256, fingerprintStatus, v.FingerprintError, v.ParentID, v.Title, v.Author, string(tagsJSON),
v.DurationSeconds, v.Size, v.Ext, v.Quality, v.ThumbnailURL, v.ThumbnailURL,
v.PreviewFileID, v.PreviewLocal, nullableStatus(v.PreviewStatus),
v.Views, v.Favorites, v.Comments, v.Likes, v.Dislikes,
@@ -731,8 +739,11 @@ func (c *Catalog) ListCrawlerSourceIDs(ctx context.Context, kind, driveID string
rows, err := c.db.QueryContext(ctx,
`SELECT SUBSTR(id, ?) FROM videos WHERE id LIKE ? || '%'
UNION
SELECT SUBSTR(id, ?) FROM deleted_videos WHERE id LIKE ? || '%'`,
len(prefix)+1, prefix, len(prefix)+1, prefix)
SELECT SUBSTR(id, ?) FROM deleted_videos WHERE id LIKE ? || '%'
UNION
SELECT source_id FROM crawler_seen_sources
WHERE kind = ? AND drive_id = ? AND status IN ('imported', 'duplicate')`,
len(prefix)+1, prefix, len(prefix)+1, prefix, kind, driveID)
if err != nil {
return nil, err
}
@@ -750,6 +761,47 @@ func (c *Catalog) ListCrawlerSourceIDs(ctx context.Context, kind, driveID string
return out, rows.Err()
}
// MarkCrawlerSourceSeen records the outcome for a crawler source item. Duplicate
// source IDs are included in future seen files so scripts can skip them before
// the backend downloads the same duplicate content again.
func (c *Catalog) MarkCrawlerSourceSeen(ctx context.Context, kind, driveID, sourceID, status, canonicalVideoID, sampledSHA256 string, size int64) error {
kind = strings.TrimSpace(kind)
driveID = strings.TrimSpace(driveID)
sourceID = strings.TrimSpace(sourceID)
status = strings.TrimSpace(status)
if kind == "" || driveID == "" || sourceID == "" {
return nil
}
switch status {
case "imported", "duplicate":
default:
return fmt.Errorf("catalog: unsupported crawler source status %q", status)
}
sampledSHA256 = normalizeContentHash(sampledSHA256)
if size < 0 {
size = 0
}
now := time.Now().UnixMilli()
_, err := c.db.ExecContext(ctx, `
INSERT INTO crawler_seen_sources (
kind, drive_id, source_id, status, canonical_video_id, sampled_sha256, size_bytes, first_seen_at, last_seen_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(kind, drive_id, source_id) DO UPDATE SET
status = excluded.status,
canonical_video_id = excluded.canonical_video_id,
sampled_sha256 = CASE
WHEN excluded.sampled_sha256 != '' THEN excluded.sampled_sha256
ELSE crawler_seen_sources.sampled_sha256
END,
size_bytes = CASE
WHEN excluded.size_bytes > 0 THEN excluded.size_bytes
ELSE crawler_seen_sources.size_bytes
END,
last_seen_at = excluded.last_seen_at`,
kind, driveID, sourceID, status, strings.TrimSpace(canonicalVideoID), sampledSHA256, size, now, now)
return err
}
// DeleteVideoWithTombstone records that an administrator explicitly deleted a
// video, then removes the visible catalog row. The tombstone is used by
// scanners/crawlers to avoid importing the same source file again.
@@ -922,6 +974,29 @@ func (c *Catalog) FindVideoByFileSignature(ctx context.Context, fileName string,
return scanVideo(row)
}
// FindEquivalentVideo returns the earliest visible video that represents the
// same content as source by strong hash or sampled fingerprint, regardless of
// which drive currently owns it.
func (c *Catalog) FindEquivalentVideo(ctx context.Context, source *Video) (*Video, error) {
if source == nil {
return nil, sql.ErrNoRows
}
where, args, ok := equivalentVideoLookupWhere(source)
if !ok {
return nil, sql.ErrNoRows
}
args = append([]any{source.ID}, args...)
row := c.db.QueryRowContext(ctx,
`SELECT `+allVideoCols+` FROM videos
WHERE id != ?
AND COALESCE(hidden, 0) = 0
AND COALESCE(file_id, '') != ''
AND (`+where+`)
ORDER BY created_at ASC, id ASC
LIMIT 1`, args...)
return scanVideo(row)
}
// FindEquivalentVideoOnDrive returns a visible video on driveID that represents
// the same content as source by strong hash or sampled fingerprint.
func (c *Catalog) FindEquivalentVideoOnDrive(ctx context.Context, source *Video, driveID string) (*Video, error) {
+18
View File
@@ -89,6 +89,24 @@ CREATE INDEX IF NOT EXISTS idx_deleted_videos_drive_hash
CREATE INDEX IF NOT EXISTS idx_deleted_videos_drive_signature
ON deleted_videos(drive_id, file_name, size_bytes);
-- 爬虫来源记录。用于把已确认重复的 source_id 写回 seen 列表,
-- 避免后续爬虫反复下载同一个候选视频。
CREATE TABLE IF NOT EXISTS crawler_seen_sources (
kind TEXT NOT NULL,
drive_id TEXT NOT NULL,
source_id TEXT NOT NULL,
status TEXT NOT NULL DEFAULT 'imported', -- imported / duplicate
canonical_video_id TEXT NOT NULL DEFAULT '',
sampled_sha256 TEXT NOT NULL DEFAULT '',
size_bytes INTEGER NOT NULL DEFAULT 0,
first_seen_at INTEGER NOT NULL,
last_seen_at INTEGER NOT NULL,
PRIMARY KEY (kind, drive_id, source_id)
);
CREATE INDEX IF NOT EXISTS idx_crawler_seen_sources_drive
ON crawler_seen_sources(kind, drive_id, status);
-- 网盘账户
CREATE TABLE IF NOT EXISTS drives (
id TEXT PRIMARY KEY,
@@ -593,6 +593,17 @@ func (d *Driver) Rename(ctx context.Context, fileID, newName string) error {
return nil
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
fileID = strings.TrimSpace(fileID)
if fileID == "" {
return errors.New("googledrive remove: empty file id")
}
if err := d.request(ctx, d.fileURL(fileID), http.MethodDelete, nil, nil); err != nil {
return fmt.Errorf("googledrive remove: %w", err)
}
return nil
}
func (d *Driver) findUploadedFileID(ctx context.Context, parentID, name, md5Hex string) (string, error) {
entries, err := d.List(ctx, parentID)
if err != nil {
@@ -624,6 +635,8 @@ func (d *Driver) findUploadedFileID(ctx context.Context, parentID, name, md5Hex
return "", fmt.Errorf("googledrive upload: uploaded file %q not found in parent %q", name, parentID)
}
var _ drives.Remover = (*Driver)(nil)
func isGoogleUploadHTTPRateLimit(status int, header http.Header, body []byte, apiErr apiErrorBody) bool {
if status == http.StatusTooManyRequests {
return true
+6
View File
@@ -40,6 +40,12 @@ type Drive interface {
RootID() string
}
// Remover is an optional drive capability. It mirrors OpenList's optional
// Remove interface: callers must type-assert before deleting a source file.
type Remover interface {
Remove(ctx context.Context, fileID string) error
}
type Entry struct {
ID string
Name string
@@ -257,6 +257,39 @@ func (d *Driver) EnsureDir(context.Context, string) (string, error) {
return "", drives.ErrNotSupported
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
if err := ctx.Err(); err != nil {
return err
}
p, rel, err := d.pathForID(fileID)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
if rel == "" {
return errors.New("localstorage: refusing to remove root")
}
info, err := os.Stat(p)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
if info.IsDir() {
return errors.New("localstorage: refusing to remove directory")
}
if !info.Mode().IsRegular() {
return errors.New("localstorage: refusing to remove non-regular file")
}
if err := os.Remove(p); err != nil && !os.IsNotExist(err) {
return err
}
return nil
}
func (d *Driver) root() (string, error) {
raw := strings.TrimSpace(d.rootPath)
if raw == "" {
@@ -276,6 +309,8 @@ func (d *Driver) root() (string, error) {
return filepath.Abs(raw)
}
var _ drives.Remover = (*Driver)(nil)
func (d *Driver) pathForID(id string) (string, string, error) {
root, err := d.root()
if err != nil {
@@ -78,12 +78,38 @@ func (d *Driver) EnsureDir(context.Context, string) (string, error) {
return "", drives.ErrNotSupported
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
if err := ctx.Err(); err != nil {
return err
}
path, err := d.uploadPath(fileID)
if err != nil {
return err
}
info, err := os.Stat(path)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
if info.IsDir() {
return errors.New("localupload: refusing to remove directory")
}
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
return err
}
return nil
}
func (d *Driver) RootID() string { return d.uploadDir() }
func (d *Driver) uploadDir() string {
return d.uploadDirPath
}
var _ drives.Remover = (*Driver)(nil)
func (d *Driver) uploadPath(fileID string) (string, error) {
if strings.TrimSpace(fileID) == "" || filepath.Base(fileID) != fileID {
return "", errors.New("invalid upload file id")
@@ -501,6 +501,17 @@ func (d *Driver) Rename(ctx context.Context, fileID, newName string) error {
return nil
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
fileID = strings.TrimSpace(fileID)
if fileID == "" {
return errors.New("onedrive remove: empty file id")
}
if err := d.request(ctx, d.itemURL(fileID), http.MethodDelete, nil, nil); err != nil {
return fmt.Errorf("onedrive remove: %w", err)
}
return nil
}
func (d *Driver) request(ctx context.Context, rawURL, method string, configure func(*resty.Request), out any) error {
return d.requestOnce(ctx, rawURL, method, configure, out, true)
}
@@ -741,3 +752,4 @@ func guessMime(name string) string {
}
var _ drives.Drive = (*Driver)(nil)
var _ drives.Remover = (*Driver)(nil)
+18
View File
@@ -461,6 +461,23 @@ func (d *Driver) Rename(ctx context.Context, fileID, newName string) error {
return nil
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
if d.client == nil {
return errors.New("p115 remove: driver not initialized")
}
if err := ctx.Err(); err != nil {
return err
}
fileID = strings.TrimSpace(fileID)
if fileID == "" {
return errors.New("p115 remove: empty fileID")
}
if err := d.client.Delete(fileID); err != nil {
return fmt.Errorf("p115 remove: %w", err)
}
return nil
}
// bufferAndHashSha1 把 r 全量复制到一个临时文件,同时计算 SHA1。
// 返回临时文件(位置在末尾,需调用方 Seek 回 0)、SHA1 hex 大写、实际字节数。
//
@@ -563,3 +580,4 @@ func guessMime(name string) string {
}
var _ drives.Drive = (*Driver)(nil)
var _ drives.Remover = (*Driver)(nil)
+34
View File
@@ -42,6 +42,7 @@ const (
endpointDownloadInfo = "/file/download_info"
endpointMkdir = "/file/upload_request"
endpointRename = "/file/rename"
endpointTrash = "/file/trash"
endpointUpload = "/file/upload_request"
endpointS3Auth = "/file/s3_upload_object/auth"
endpointS3Parts = "/file/s3_repare_upload_parts_batch"
@@ -545,6 +546,32 @@ func (d *Driver) Rename(ctx context.Context, fileID, newName string) error {
return nil
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
fileID = strings.TrimSpace(fileID)
if fileID == "" {
return errors.New("123pan remove: empty file id")
}
f, _, err := d.findFile(ctx, fileID)
if err != nil {
if strings.Contains(strings.ToLower(err.Error()), "not found") {
return nil
}
return fmt.Errorf("123pan remove metadata: %w", err)
}
body := map[string]any{
"driveId": 0,
"operation": true,
"fileTrashInfoList": []panFile{f},
}
if _, err := d.request(ctx, endpointTrash, http.MethodPost, func(req *resty.Request) {
req.SetBody(body)
}, nil); err != nil {
return fmt.Errorf("123pan remove: %w", err)
}
d.removeCachedFile(fileID)
return nil
}
func (d *Driver) EnsureDir(ctx context.Context, pathFromRoot string) (string, error) {
parts := splitPath(pathFromRoot)
currentID := d.rootID
@@ -942,6 +969,12 @@ func (d *Driver) renameCachedFile(fileID, newName string) {
}
}
func (d *Driver) removeCachedFile(fileID string) {
d.fileMu.Lock()
delete(d.files, fileID)
d.fileMu.Unlock()
}
func (d *Driver) cachedFile(fileID string) (panFile, string, bool) {
d.fileMu.RLock()
defer d.fileMu.RUnlock()
@@ -1111,3 +1144,4 @@ func guessMime(name string) string {
}
var _ drives.Drive = (*Driver)(nil)
var _ drives.Remover = (*Driver)(nil)
+14
View File
@@ -356,6 +356,19 @@ func (d *Driver) Rename(ctx context.Context, fileID, newName string) error {
return nil
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
fileID = strings.TrimSpace(fileID)
if fileID == "" {
return errors.New("pikpak remove: empty file id")
}
if err := d.request(ctx, filesURL+":batchTrash", http.MethodPost, func(req *resty.Request) {
req.SetBody(map[string]any{"ids": []string{fileID}})
}, nil); err != nil {
return fmt.Errorf("pikpak remove: %w", err)
}
return nil
}
func (d *Driver) EnsureDir(ctx context.Context, pathFromRoot string) (string, error) {
currentID := d.rootID
for _, name := range splitPath(pathFromRoot) {
@@ -565,3 +578,4 @@ func ParseBoolDefault(raw string, def bool) bool {
}
var _ drives.Drive = (*Driver)(nil)
var _ drives.Remover = (*Driver)(nil)
+17
View File
@@ -269,6 +269,22 @@ func (d *Driver) Upload(ctx context.Context, parentID, name string, r io.Reader,
return "", drives.ErrNotSupported
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
fileID = strings.TrimSpace(fileID)
if fileID == "" {
return errors.New("quark remove: empty file id")
}
body := map[string]any{
"action_type": 1,
"exclude_fids": []string{},
"filelist": []string{fileID},
}
if err := d.request(ctx, "/file/delete", http.MethodPost, nil, body, nil); err != nil {
return fmt.Errorf("quark remove: %w", err)
}
return nil
}
// ---------- helpers ----------
func fileToEntry(f *file, parentID string) drives.Entry {
@@ -343,3 +359,4 @@ func setCookieValue(cookie, key, value string) string {
}
var _ drives.Drive = (*Driver)(nil)
var _ drives.Remover = (*Driver)(nil)
@@ -4,6 +4,7 @@ import (
"bufio"
"context"
"crypto/sha256"
"database/sql"
"encoding/hex"
"encoding/json"
"errors"
@@ -23,6 +24,7 @@ import (
"time"
"github.com/video-site/backend/internal/catalog"
"github.com/video-site/backend/internal/fingerprint"
"github.com/video-site/backend/internal/mediaasset"
"golang.org/x/net/proxy"
)
@@ -30,11 +32,15 @@ import (
const (
DefaultTargetNew = 10
defaultUserAgent = "Mozilla/5.0 (compatible; video-site-91-scriptcrawler/1.0)"
defaultCandidateMultiplier = 10
defaultCandidateFloorExtra = 50
defaultCandidateBudgetMax = 500
)
type CrawlerConfig struct {
Driver *Driver
Catalog *catalog.Catalog
CrawlerName string
SourceKind string
PythonPath string
ScriptPath string
@@ -76,6 +82,7 @@ func NewCrawler(cfg CrawlerConfig) *Crawler {
type CrawlResult struct {
TargetNew int
CandidateBudget int
TotalEntries int
NewVideos int
Skipped int
@@ -105,6 +112,8 @@ type Job struct {
RunID string `json:"run_id"`
CrawlerID string `json:"crawler_id"`
TargetNew int `json:"target_new"`
UniqueTarget int `json:"unique_target,omitempty"`
CandidateBudget int `json:"candidate_budget,omitempty"`
SeenSourceIDsFile string `json:"seen_source_ids_file"`
OutputDir string `json:"output_dir"`
Config json.RawMessage `json:"config"`
@@ -253,11 +262,12 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
if targetNew <= 0 {
targetNew = DefaultTargetNew
}
candidateBudget := candidateBudgetForTarget(targetNew)
if err := c.cfg.Driver.Init(ctx); err != nil {
return nil, fmt.Errorf("scriptcrawler: driver init: %w", err)
}
result := &CrawlResult{TargetNew: targetNew, StartedAt: time.Now()}
result := &CrawlResult{TargetNew: targetNew, CandidateBudget: candidateBudget, StartedAt: time.Now()}
defer func() { result.FinishedAt = time.Now() }()
emit := func(p CrawlProgress) {
if c.cfg.OnProgress == nil {
@@ -293,11 +303,11 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
result.SeenSnapshot = seenCount
emit(CrawlProgress{})
if err := c.writeJobFile(jobPath, runID, targetNew, seenPath); err != nil {
if err := c.writeJobFile(jobPath, runID, targetNew, candidateBudget, seenPath); err != nil {
return result, fmt.Errorf("scriptcrawler: write job: %w", err)
}
cmd, stdout, err := c.startScript(ctx, jobPath, targetNew)
cmd, stdout, err := c.startScript(ctx, jobPath, targetNew, candidateBudget)
if err != nil {
return result, fmt.Errorf("scriptcrawler: start: %w", err)
}
@@ -408,7 +418,7 @@ func (c *Crawler) writeSeenSourceIDs(ctx context.Context, path string) (int, err
return len(seen), nil
}
func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath string) error {
func (c *Crawler) writeJobFile(path, runID string, targetNew, candidateBudget int, seenPath string) error {
cfg := json.RawMessage([]byte("{}"))
if raw := strings.TrimSpace(c.cfg.ConfigJSON); raw != "" {
if !json.Valid([]byte(raw)) {
@@ -425,7 +435,9 @@ func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath strin
Mode: "crawl",
RunID: runID,
CrawlerID: c.cfg.Driver.ID(),
TargetNew: targetNew,
TargetNew: candidateBudget,
UniqueTarget: targetNew,
CandidateBudget: candidateBudget,
SeenSourceIDsFile: seenPath,
OutputDir: outputDir,
Config: cfg,
@@ -442,7 +454,7 @@ func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath strin
return os.Rename(tmp, path)
}
func (c *Crawler) startScript(ctx context.Context, jobPath string, targetNew int) (*exec.Cmd, io.ReadCloser, error) {
func (c *Crawler) startScript(ctx context.Context, jobPath string, targetNew, candidateBudget int) (*exec.Cmd, io.ReadCloser, error) {
cmd := exec.CommandContext(ctx, c.cfg.PythonPath, c.cfg.ScriptPath, "--job", jobPath)
if strings.TrimSpace(c.cfg.WorkDir) != "" {
cmd.Dir = c.cfg.WorkDir
@@ -466,7 +478,7 @@ func (c *Crawler) startScript(ctx context.Context, jobPath string, targetNew int
_ = stdout.Close()
return nil, nil, err
}
log.Printf("[scriptcrawler] drive=%s exec %s --job=%s target_new=%d", c.cfg.Driver.ID(), c.cfg.ScriptPath, jobPath, targetNew)
log.Printf("[scriptcrawler] drive=%s exec %s --job=%s unique_target=%d candidate_budget=%d", c.cfg.Driver.ID(), c.cfg.ScriptPath, jobPath, targetNew, candidateBudget)
if err := cmd.Start(); err != nil {
_ = stdout.Close()
_ = stderr.Close()
@@ -493,7 +505,8 @@ func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) {
if err != nil {
return false, err
}
videoID := BuildVideoIDForKind(c.sourceKind(), c.cfg.Driver.ID(), sourceID)
sourceKind := c.sourceKind()
videoID := BuildVideoIDForKind(sourceKind, c.cfg.Driver.ID(), sourceID)
if deleted, err := c.cfg.Catalog.IsVideoDeleted(ctx, videoID); err != nil {
return false, err
} else if deleted {
@@ -513,25 +526,6 @@ func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) {
return false, fmt.Errorf("video: %w", err)
}
thumbReady := false
if item.Thumbnail.URL != "" || item.Thumbnail.LocalFile != "" {
thumbFile := sourceID + detectThumbExt(item.Thumbnail.URL, item.Thumbnail.LocalFile)
thumbPath, err := c.cfg.Driver.ThumbPath(thumbFile)
if err == nil {
if _, err := c.materializeMedia(ctx, item.Thumbnail, thumbPath, item.DetailURL, false); err != nil {
log.Printf("[scriptcrawler] drive=%s source_id=%s thumbnail failed: %v", c.cfg.Driver.ID(), sourceID, err)
} else if c.cfg.CommonThumbDir != "" {
if err := os.MkdirAll(c.cfg.CommonThumbDir, 0o755); err != nil {
log.Printf("[scriptcrawler] drive=%s common thumbs mkdir: %v", c.cfg.Driver.ID(), err)
} else if err := copyFileAtomic(thumbPath, mediaasset.ThumbnailPathInDir(c.cfg.CommonThumbDir, videoID)); err != nil {
log.Printf("[scriptcrawler] drive=%s source_id=%s copy thumbnail: %v", c.cfg.Driver.ID(), sourceID, err)
} else {
thumbReady = true
}
}
}
}
now := time.Now()
title := strings.TrimSpace(item.Title)
if title == "" {
@@ -545,6 +539,9 @@ func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) {
if matched, err := c.cfg.Catalog.MatchTags(ctx, title+" "+author+" "+strings.Join(tags, " ")); err == nil {
tags = mergeStringLists(tags, matched)
}
if crawlerTag := c.crawlerTagName(); crawlerTag != "" {
tags = mergeStringLists(tags, []string{crawlerTag})
}
publishedAt := now
if parsed := parsePublishedAt(item.PublishedAt); !parsed.IsZero() {
publishedAt = parsed
@@ -572,6 +569,43 @@ func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) {
CreatedAt: now,
UpdatedAt: now,
}
sampled, err := fingerprint.Compute(ctx, c.cfg.Driver, v, fingerprint.Config{}, c.cfg.HTTPClient)
if err != nil {
_ = os.Remove(videoPath)
return false, fmt.Errorf("fingerprint: %w", err)
}
v.SampledSHA256 = sampled
v.FingerprintStatus = "ready"
if duplicate, err := c.cfg.Catalog.FindEquivalentVideo(ctx, v); err == nil && duplicate != nil {
_ = os.Remove(videoPath)
if markErr := c.cfg.Catalog.MarkCrawlerSourceSeen(ctx, sourceKind, c.cfg.Driver.ID(), sourceID, "duplicate", duplicate.ID, sampled, size); markErr != nil {
log.Printf("[scriptcrawler] drive=%s source_id=%s mark duplicate seen: %v", c.cfg.Driver.ID(), sourceID, markErr)
}
log.Printf("[scriptcrawler] drive=%s source_id=%s duplicate_of=%s title=%q size=%d", c.cfg.Driver.ID(), sourceID, duplicate.ID, title, size)
return false, nil
} else if err != nil && !errors.Is(err, sql.ErrNoRows) {
_ = os.Remove(videoPath)
return false, fmt.Errorf("duplicate lookup: %w", err)
}
thumbReady := false
if item.Thumbnail.URL != "" || item.Thumbnail.LocalFile != "" {
thumbFile := sourceID + detectThumbExt(item.Thumbnail.URL, item.Thumbnail.LocalFile)
thumbPath, err := c.cfg.Driver.ThumbPath(thumbFile)
if err == nil {
if _, err := c.materializeMedia(ctx, item.Thumbnail, thumbPath, item.DetailURL, false); err != nil {
log.Printf("[scriptcrawler] drive=%s source_id=%s thumbnail failed: %v", c.cfg.Driver.ID(), sourceID, err)
} else if c.cfg.CommonThumbDir != "" {
if err := os.MkdirAll(c.cfg.CommonThumbDir, 0o755); err != nil {
log.Printf("[scriptcrawler] drive=%s common thumbs mkdir: %v", c.cfg.Driver.ID(), err)
} else if err := copyFileAtomic(thumbPath, mediaasset.ThumbnailPathInDir(c.cfg.CommonThumbDir, videoID)); err != nil {
log.Printf("[scriptcrawler] drive=%s source_id=%s copy thumbnail: %v", c.cfg.Driver.ID(), sourceID, err)
} else {
thumbReady = true
}
}
}
}
if thumbReady {
v.ThumbnailURL = "/p/thumb/" + v.ID
}
@@ -579,6 +613,9 @@ func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) {
_ = os.Remove(videoPath)
return false, err
}
if err := c.cfg.Catalog.MarkCrawlerSourceSeen(ctx, sourceKind, c.cfg.Driver.ID(), sourceID, "imported", v.ID, sampled, size); err != nil {
log.Printf("[scriptcrawler] drive=%s source_id=%s mark imported seen: %v", c.cfg.Driver.ID(), sourceID, err)
}
log.Printf("[scriptcrawler] drive=%s source_id=%s ok title=%q size=%d", c.cfg.Driver.ID(), sourceID, title, size)
return true, nil
}
@@ -898,6 +935,36 @@ func (c *Crawler) sourceKind() string {
return Kind
}
func (c *Crawler) crawlerTagName() string {
if c == nil {
return ""
}
if v := strings.TrimSpace(c.cfg.CrawlerName); v != "" {
return v
}
if c.cfg.Driver != nil {
return strings.TrimSpace(c.cfg.Driver.ID())
}
return ""
}
func candidateBudgetForTarget(targetNew int) int {
if targetNew <= 0 {
targetNew = DefaultTargetNew
}
budget := targetNew * defaultCandidateMultiplier
if floor := targetNew + defaultCandidateFloorExtra; budget < floor {
budget = floor
}
if budget > defaultCandidateBudgetMax {
budget = defaultCandidateBudgetMax
}
if budget < targetNew {
return targetNew
}
return budget
}
func BuildVideoID(driveID, sourceID string) string {
return BuildVideoIDForKind(Kind, driveID, sourceID)
}
@@ -10,8 +10,15 @@ import (
"path/filepath"
"strings"
"testing"
"time"
"github.com/video-site/backend/internal/catalog"
"github.com/video-site/backend/internal/fingerprint"
)
const (
scriptCrawlerDuplicateBytes = "duplicate-video-bytes"
scriptCrawlerUniqueBytes = "unique-video-bytes"
)
func TestCrawlerRunOnceImportsLocalFileAndSkipsExisting(t *testing.T) {
@@ -44,6 +51,7 @@ func TestCrawlerRunOnceImportsLocalFileAndSkipsExisting(t *testing.T) {
c := NewCrawler(CrawlerConfig{
Driver: drv,
Catalog: cat,
CrawlerName: "Demo Crawler",
PythonPath: wrapper,
ScriptPath: dummyScript,
})
@@ -61,6 +69,9 @@ func TestCrawlerRunOnceImportsLocalFileAndSkipsExisting(t *testing.T) {
if v.Title != "Imported From Helper" || v.FileID != "abc-123.mp4" || v.Size == 0 {
t.Fatalf("video = title:%q file:%q size:%d", v.Title, v.FileID, v.Size)
}
if !hasString(v.Tags, "Demo Crawler") {
t.Fatalf("video tags = %#v, want crawler name tag", v.Tags)
}
if _, err := os.Stat(filepath.Join(drv.VideosDir(), "abc-123.mp4")); err != nil {
t.Fatalf("video file not copied: %v", err)
}
@@ -267,6 +278,113 @@ func TestCrawlerRunOnceImportsSimpleMediaURLWithoutSourceID(t *testing.T) {
}
}
func TestCrawlerRunOnceSkipsFingerprintDuplicateAndContinues(t *testing.T) {
ctx := context.Background()
tmp := t.TempDir()
cat, err := catalog.Open(filepath.Join(tmp, "catalog.db"))
if err != nil {
t.Fatalf("open catalog: %v", err)
}
t.Cleanup(func() {
if err := cat.Close(); err != nil {
t.Fatalf("close catalog: %v", err)
}
})
drv := New(Config{ID: "demo", RootDir: filepath.Join(tmp, "crawler")})
if err := drv.Init(ctx); err != nil {
t.Fatalf("driver init: %v", err)
}
seedFile := "seed-canonical.mp4"
if err := os.WriteFile(filepath.Join(drv.VideosDir(), seedFile), []byte(scriptCrawlerDuplicateBytes), 0o644); err != nil {
t.Fatalf("write seed video: %v", err)
}
seed := &catalog.Video{
ID: "seed-for-hash",
DriveID: drv.ID(),
FileID: seedFile,
Title: "Seed",
Size: int64(len(scriptCrawlerDuplicateBytes)),
PublishedAt: time.Now(),
}
sampled, err := fingerprint.Compute(ctx, drv, seed, fingerprint.Config{}, nil)
if err != nil {
t.Fatalf("compute seed fingerprint: %v", err)
}
_ = os.Remove(filepath.Join(drv.VideosDir(), seedFile))
now := time.Now()
if err := cat.UpsertVideo(ctx, &catalog.Video{
ID: "existing-canonical",
DriveID: "other-drive",
FileID: "existing.mp4",
FileName: "existing.mp4",
Title: "Existing Canonical",
Size: int64(len(scriptCrawlerDuplicateBytes)),
Ext: "mp4",
SampledSHA256: sampled,
FingerprintStatus: "ready",
PublishedAt: now,
CreatedAt: now,
UpdatedAt: now,
}); err != nil {
t.Fatalf("seed canonical video: %v", err)
}
dummyScript := filepath.Join(tmp, "helper-script")
if err := os.WriteFile(dummyScript, []byte("helper"), 0o755); err != nil {
t.Fatalf("write dummy script: %v", err)
}
wrapper := filepath.Join(tmp, "helper-wrapper.sh")
wrapperScript := fmt.Sprintf("#!/bin/sh\nexec %q -test.run=TestScriptCrawlerHelperProcess \"$@\"\n", os.Args[0])
if err := os.WriteFile(wrapper, []byte(wrapperScript), 0o755); err != nil {
t.Fatalf("write helper wrapper: %v", err)
}
t.Setenv("GO_WANT_SCRIPTCRAWLER_HELPER", "1")
t.Setenv("GO_WANT_SCRIPTCRAWLER_DUP_UNIQUE", "1")
c := NewCrawler(CrawlerConfig{
Driver: drv,
Catalog: cat,
PythonPath: wrapper,
ScriptPath: dummyScript,
})
res, err := c.RunOnce(ctx, 1)
if err != nil {
t.Fatalf("run once: %v", err)
}
if res.NewVideos != 1 || res.Skipped != 1 || res.Failed != 0 || res.TotalEntries != 2 {
t.Fatalf("result = total:%d new:%d skipped:%d failed:%d, want 2/1/1/0", res.TotalEntries, res.NewVideos, res.Skipped, res.Failed)
}
if res.CandidateBudget <= res.TargetNew {
t.Fatalf("candidate budget = %d, target = %d; want expanded budget", res.CandidateBudget, res.TargetNew)
}
if _, err := cat.GetVideo(ctx, BuildVideoID("demo", "dup-source")); err == nil {
t.Fatal("duplicate candidate should not be imported")
}
if _, err := os.Stat(filepath.Join(drv.VideosDir(), "dup-source.mp4")); !os.IsNotExist(err) {
t.Fatalf("duplicate local file stat = %v, want removed", err)
}
v, err := cat.GetVideo(ctx, BuildVideoID("demo", "unique-source"))
if err != nil {
t.Fatalf("unique video should be imported: %v", err)
}
if v.SampledSHA256 == "" || v.FingerprintStatus != "ready" {
t.Fatalf("unique fingerprint = %q status=%q, want ready sampled fingerprint", v.SampledSHA256, v.FingerprintStatus)
}
seen, err := cat.ListCrawlerSourceIDs(ctx, Kind, "demo")
if err != nil {
t.Fatalf("list seen source ids: %v", err)
}
seenSet := map[string]bool{}
for _, id := range seen {
seenSet[id] = true
}
if !seenSet["dup-source"] || !seenSet["unique-source"] {
t.Fatalf("seen ids = %#v, want duplicate and imported source ids", seen)
}
}
func TestScriptCrawlerHelperProcess(t *testing.T) {
if os.Getenv("GO_WANT_SCRIPTCRAWLER_HELPER") != "1" {
return
@@ -307,6 +425,41 @@ func TestScriptCrawlerHelperProcess(t *testing.T) {
_ = json.NewEncoder(os.Stdout).Encode(event)
os.Exit(0)
}
if os.Getenv("GO_WANT_SCRIPTCRAWLER_DUP_UNIQUE") == "1" {
duplicateFile := filepath.Join(job.OutputDir, "duplicate.mp4")
if err := os.WriteFile(duplicateFile, []byte(scriptCrawlerDuplicateBytes), 0o644); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
}
uniqueFile := filepath.Join(job.OutputDir, "unique.mp4")
if err := os.WriteFile(uniqueFile, []byte(scriptCrawlerUniqueBytes), 0o644); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
}
for _, event := range []Event{
{
Type: "item",
Item: Item{
SourceID: "dup-source",
Title: "Duplicate Candidate",
Author: "helper",
Media: MediaRef{LocalFile: duplicateFile},
},
},
{
Type: "item",
Item: Item{
SourceID: "unique-source",
Title: "Unique Candidate",
Author: "helper",
Media: MediaRef{LocalFile: uniqueFile},
},
},
} {
_ = json.NewEncoder(os.Stdout).Encode(event)
}
os.Exit(0)
}
localFile := filepath.Join(job.OutputDir, "helper.mp4")
if err := os.WriteFile(localFile, []byte("helper-video"), 0o644); err != nil {
fmt.Fprintln(os.Stderr, err)
@@ -324,3 +477,12 @@ func TestScriptCrawlerHelperProcess(t *testing.T) {
_ = json.NewEncoder(os.Stdout).Encode(event)
os.Exit(0)
}
func hasString(values []string, want string) bool {
for _, value := range values {
if value == want {
return true
}
}
return false
}
@@ -147,6 +147,46 @@ func (d *Driver) EnsureDir(context.Context, string) (string, error) {
return "", drives.ErrNotSupported
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
if err := ctx.Err(); err != nil {
return err
}
videoPath, err := d.VideoPath(fileID)
if err != nil {
return err
}
info, err := os.Stat(videoPath)
if err != nil {
if os.IsNotExist(err) {
removeThumbCandidates(d.ThumbPath, strings.TrimSuffix(fileID, filepath.Ext(fileID)))
return nil
}
return err
}
if info.IsDir() {
return errors.New("scriptcrawler: refusing to remove directory")
}
if err := os.Remove(videoPath); err != nil && !os.IsNotExist(err) {
return err
}
removeThumbCandidates(d.ThumbPath, strings.TrimSuffix(fileID, filepath.Ext(fileID)))
return nil
}
func removeThumbCandidates(pathFor func(string) (string, error), stem string) {
stem = strings.TrimSpace(stem)
if stem == "" {
return
}
for _, ext := range []string{".jpg", ".jpeg", ".png", ".webp"} {
path, err := pathFor(stem + ext)
if err != nil {
continue
}
_ = os.Remove(path)
}
}
func safeJoin(root, fileID string) (string, error) {
id := strings.TrimSpace(fileID)
if id == "" || filepath.Base(id) != id {
@@ -170,3 +210,4 @@ func safeJoin(root, fileID string) (string, error) {
}
var _ drives.Drive = (*Driver)(nil)
var _ drives.Remover = (*Driver)(nil)
@@ -167,6 +167,46 @@ func (d *Driver) EnsureDir(ctx context.Context, pathFromRoot string) (string, er
return "", drives.ErrNotSupported
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
if err := ctx.Err(); err != nil {
return err
}
videoPath, err := d.VideoPath(fileID)
if err != nil {
return err
}
info, err := os.Stat(videoPath)
if err != nil {
if os.IsNotExist(err) {
removeThumbCandidates(d.ThumbPath, strings.TrimSuffix(fileID, filepath.Ext(fileID)))
return nil
}
return err
}
if info.IsDir() {
return errors.New("spider91: refusing to remove directory")
}
if err := os.Remove(videoPath); err != nil && !os.IsNotExist(err) {
return err
}
removeThumbCandidates(d.ThumbPath, strings.TrimSuffix(fileID, filepath.Ext(fileID)))
return nil
}
func removeThumbCandidates(pathFor func(string) (string, error), stem string) {
stem = strings.TrimSpace(stem)
if stem == "" {
return
}
for _, ext := range []string{".jpg", ".jpeg", ".png", ".webp"} {
path, err := pathFor(stem + ext)
if err != nil {
continue
}
_ = os.Remove(path)
}
}
// safeJoin 把 fileID 拼到 root 下,保证最终路径不会逃出 root。
// fileID 必须是单纯的文件名(不含 / 或 .. 等组件)。
func safeJoin(root, fileID string) (string, error) {
@@ -192,3 +232,4 @@ func safeJoin(root, fileID string) (string, error) {
}
var _ drives.Drive = (*Driver)(nil)
var _ drives.Remover = (*Driver)(nil)
+18
View File
@@ -11,6 +11,7 @@ import (
"time"
sdk "github.com/OpenListTeam/wopan-sdk-go"
"github.com/go-resty/resty/v2"
"github.com/video-site/backend/internal/drives"
)
@@ -145,6 +146,22 @@ func (d *Driver) Upload(ctx context.Context, parentID, name string, r io.Reader,
return fid, nil
}
func (d *Driver) Remove(ctx context.Context, fileID string) error {
if d.client == nil {
return fmt.Errorf("wopan remove: driver not initialized")
}
fileID = strings.TrimSpace(fileID)
if fileID == "" {
return fmt.Errorf("wopan remove: empty file id")
}
if err := d.client.DeleteFile(d.spaceType(), nil, []string{fileID}, func(req *resty.Request) {
req.SetContext(ctx)
}); err != nil {
return fmt.Errorf("wopan remove: %w", err)
}
return nil
}
func (d *Driver) EnsureDir(ctx context.Context, pathFromRoot string) (string, error) {
parts := splitPath(pathFromRoot)
currentID := d.rootID
@@ -229,3 +246,4 @@ func guessMime(name string) string {
// 确保实现接口
var _ drives.Drive = (*Driver)(nil)
var _ drives.Remover = (*Driver)(nil)
+90 -1
View File
@@ -82,6 +82,15 @@ type UploadResult struct {
Size int64
}
type UploadProgress struct {
DriveID string
State string
CurrentTitle string
QueueLength int
DoneCount int
TotalCount int
}
const (
spider91UploadDirName = "91 Spider"
scriptCrawlerUploadRootDirName = "Script Crawlers"
@@ -257,6 +266,7 @@ type Config struct {
CaptchaCooldown time.Duration
CommonThumbDir string
OnMigrated func(videoID string)
OnUploadProgress func(UploadProgress)
}
type Migrator struct {
@@ -444,6 +454,20 @@ func (m *Migrator) runOnce(ctx context.Context) {
}
}
func (m *Migrator) reportUploadProgress(progress UploadProgress) {
if m == nil || m.cfg.OnUploadProgress == nil {
return
}
progress.DriveID = strings.TrimSpace(progress.DriveID)
if progress.DriveID == "" {
return
}
if progress.State == "" {
progress.State = "idle"
}
m.cfg.OnUploadProgress(progress)
}
// targetKindForLog 把当前目标盘 kind 转成对人友好的简称,用于日志。
// 解析失败时回退 "target"。
func (m *Migrator) targetKindForLog() string {
@@ -669,8 +693,17 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e
if skip < len(files) {
candidates = files[skip:]
} else {
m.reportUploadProgress(UploadProgress{DriveID: src.ID(), State: "idle"})
return 0, nil
}
totalCandidates := len(candidates)
m.reportUploadProgress(UploadProgress{
DriveID: src.ID(),
State: "uploading",
QueueLength: totalCandidates,
TotalCount: totalCandidates,
})
defer m.reportUploadProgress(UploadProgress{DriveID: src.ID(), State: "idle"})
localVideos, err := m.cfg.Catalog.ListVideosByDriveID(ctx, src.ID(), 100000)
if err != nil {
@@ -684,7 +717,8 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e
}
migrated := 0
for _, f := range candidates {
processed := 0
for index, f := range candidates {
if err := ctx.Err(); err != nil {
return migrated, err
}
@@ -694,11 +728,35 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e
v := m.findVideoForLocalFile(ctx, plan, f.name, byFileID)
if v == nil {
processed++
m.reportUploadProgress(UploadProgress{
DriveID: src.ID(),
State: "uploading",
QueueLength: maxInt(totalCandidates-processed, 0),
DoneCount: processed,
TotalCount: totalCandidates,
})
continue
}
m.reportUploadProgress(UploadProgress{
DriveID: src.ID(),
State: "uploading",
CurrentTitle: v.Title,
QueueLength: maxInt(totalCandidates-index-1, 0),
DoneCount: processed,
TotalCount: totalCandidates,
})
if v.DriveID != src.ID() {
CleanupSpider91Local(src, f.name)
processed++
m.reportUploadProgress(UploadProgress{
DriveID: src.ID(),
State: "uploading",
QueueLength: maxInt(totalCandidates-processed, 0),
DoneCount: processed,
TotalCount: totalCandidates,
})
continue
}
@@ -718,6 +776,14 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e
m.cfg.OnMigrated(v.ID)
}
}
processed++
m.reportUploadProgress(UploadProgress{
DriveID: src.ID(),
State: "uploading",
QueueLength: maxInt(totalCandidates-processed, 0),
DoneCount: processed,
TotalCount: totalCandidates,
})
continue
}
@@ -728,6 +794,14 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e
continue
}
if !ready {
processed++
m.reportUploadProgress(UploadProgress{
DriveID: src.ID(),
State: "uploading",
QueueLength: maxInt(totalCandidates-processed, 0),
DoneCount: processed,
TotalCount: totalCandidates,
})
continue
}
}
@@ -752,10 +826,25 @@ func (m *Migrator) migrateDrive(ctx context.Context, plan migrationPlan) (int, e
m.cfg.OnMigrated(v.ID)
}
}
processed++
m.reportUploadProgress(UploadProgress{
DriveID: src.ID(),
State: "uploading",
QueueLength: maxInt(totalCandidates-processed, 0),
DoneCount: processed,
TotalCount: totalCandidates,
})
}
return migrated, nil
}
func maxInt(a, b int) int {
if a > b {
return a
}
return b
}
func (m *Migrator) findVideoForLocalFile(ctx context.Context, plan migrationPlan, localFile string, byFileID map[string]*catalog.Video) *catalog.Video {
if v := byFileID[localFile]; v != nil {
return v
+14 -2
View File
@@ -28,7 +28,9 @@ python3 /path/to/crawler.py --job /path/to/job.json
"mode": "crawl",
"run_id": "20260609T120000Z",
"crawler_id": "example",
"target_new": 10,
"target_new": 100,
"unique_target": 10,
"candidate_budget": 100,
"seen_source_ids_file": "/data/scriptcrawlers/example/.crawl/seen.txt",
"output_dir": "/data/scriptcrawlers/example/output",
"config": {
@@ -40,6 +42,14 @@ python3 /path/to/crawler.py --job /path/to/job.json
}
```
`unique_target` is the user's requested number of content-unique new videos.
`candidate_budget` is how many candidate items the script should emit at most.
For backward compatibility, `target_new` is set to the same value as
`candidate_budget`, because older scripts only read `target_new`.
The backend may skip candidates that are already present by sampled content
fingerprint. Skipped duplicate candidates do not count toward `unique_target`.
## Importing Scripts
Crawler scripts are configured from the admin crawler page. A script can be
@@ -118,4 +128,6 @@ Optional progress/done events:
`output_dir`.
- Scripts can read `seen_source_ids_file` and skip known IDs when they provide
stable `source_id` values. The backend still dedupes every item.
- The backend stops the process after `target_new` new videos are imported.
- The backend stops the process after `unique_target` content-unique new videos
are imported. Scripts should stop after emitting `candidate_budget` candidates
even if the backend has not reached `unique_target`.
+4
View File
@@ -1,3 +1,4 @@
import type { ReactNode } from "react";
import { AlertTriangle } from "lucide-react";
import { Modal } from "./Modal";
@@ -12,6 +13,7 @@ type ConfirmModalProps = {
centerMessage?: boolean;
modalClassName?: string;
loading?: boolean;
children?: ReactNode;
onCancel: () => void;
onConfirm: () => void;
};
@@ -27,6 +29,7 @@ export function ConfirmModal({
centerMessage = false,
modalClassName = "",
loading = false,
children,
onCancel,
onConfirm,
}: ConfirmModalProps) {
@@ -65,6 +68,7 @@ export function ConfirmModal({
))}
</ul>
)}
{children}
</div>
</div>
</Modal>
+60 -4
View File
@@ -30,7 +30,7 @@ import { generationStateClass, generationStateLabel } from "./drive/constants";
import { Spider91UploadTargetField } from "./drive/Spider91UploadTargetField";
import { SpiderIcon } from "./icons/SpiderIcon";
const BUSY_STATES = new Set(["scanning", "generating", "queued"]);
const BUSY_STATES = new Set(["scanning", "generating", "uploading", "queued"]);
const POLL_INTERVAL_MS = 5000;
const UPLOAD_TARGET_KINDS = new Set(["p115", "pikpak", "p123", "googledrive", "onedrive"]);
@@ -43,7 +43,8 @@ function crawlerBusy(crawler: api.AdminCrawler) {
statusBusy(crawler.scanGenerationStatus) ||
statusBusy(crawler.thumbnailGenerationStatus) ||
statusBusy(crawler.previewGenerationStatus) ||
statusBusy(crawler.fingerprintGenerationStatus)
statusBusy(crawler.fingerprintGenerationStatus) ||
statusBusy(crawler.uploadGenerationStatus)
);
}
@@ -273,12 +274,14 @@ function crawlerStages(crawler: api.AdminCrawler): StageInfo[] {
{ key: "thumbnail", label: "封面", status: crawler.thumbnailGenerationStatus },
{ key: "preview", label: "预览", status: crawler.previewGenerationStatus },
{ key: "fingerprint", label: "指纹", status: crawler.fingerprintGenerationStatus },
{ key: "upload", label: "上传", status: crawler.uploadGenerationStatus },
];
}
function stageStateLabel(stage: StageInfo): string {
const state = stage.status?.state || "idle";
if (stage.key === "scan" && state === "scanning") return "抓取中";
if (stage.key === "upload" && state === "uploading") return "上传中";
return generationStateLabel(state);
}
@@ -364,6 +367,7 @@ function CrawlerRow({
function CrawlerDetail({ crawler }: { crawler: api.AdminCrawler }) {
const scan = crawler.scanGenerationStatus;
const upload = crawlerUploadDisplayStatus(crawler);
return (
<div className="admin-crawler-detail">
<div className="admin-crawler-detail__grid">
@@ -373,12 +377,21 @@ function CrawlerDetail({ crawler }: { crawler: api.AdminCrawler }) {
stateText={scan?.state === "scanning" ? "抓取中" : generationStateLabel(scan?.state || "idle")}
counts={[
{ label: "累计爬取", value: crawler.totalCrawledCount ?? 0 },
{ label: "本地保留", value: crawler.localVideoCount ?? 0 },
{ label: "已上传", value: crawler.migratedVideoCount ?? 0 },
{ label: "本轮检查", value: scan?.scannedCount ?? 0 },
{ label: "本轮新增", value: scan?.addedCount ?? 0 },
]}
/>
<GenStageCard
label="上传"
status={upload.status}
stateText={upload.text}
counts={[
{ label: "已上传", value: crawler.migratedVideoCount ?? 0 },
{ label: crawler.uploadDriveId ? "待上传" : "本地保留", value: crawler.localVideoCount ?? 0 },
{ label: "本轮处理", value: upload.status.doneCount ?? 0 },
{ label: "本轮总数", value: upload.status.totalCount ?? 0 },
]}
/>
<GenStageCard
label="封面"
status={crawler.thumbnailGenerationStatus}
@@ -417,6 +430,49 @@ function CrawlerDetail({ crawler }: { crawler: api.AdminCrawler }) {
);
}
function crawlerUploadDisplayStatus(crawler: api.AdminCrawler): {
status: api.DriveGenerationStatus;
text: string;
} {
const live = crawler.uploadGenerationStatus;
const state = live?.state || "idle";
const localCount = crawler.localVideoCount ?? 0;
const totalCount = crawler.totalCrawledCount ?? 0;
const base: api.DriveGenerationStatus = {
state,
currentTitle: live?.currentTitle,
queueLength: live?.queueLength ?? 0,
cooldownUntil: live?.cooldownUntil,
scannedCount: live?.scannedCount ?? 0,
addedCount: live?.addedCount ?? 0,
doneCount: live?.doneCount ?? 0,
totalCount: live?.totalCount ?? 0,
};
if (!crawler.uploadDriveId) {
return {
status: base,
text: localCount > 0 ? "本地保存" : generationStateLabel(state),
};
}
if (state === "uploading") {
return { status: base, text: "上传中" };
}
if (state === "queued") {
return { status: base, text: "排队中" };
}
if (localCount > 0) {
return {
status: { ...base, state: "queued", queueLength: localCount },
text: "待上传",
};
}
if (totalCount > 0) {
return { status: base, text: "完成" };
}
return { status: base, text: generationStateLabel(state) };
}
function GenStageCard({
label,
status,
+52 -7
View File
@@ -27,8 +27,10 @@ export function VideosPage() {
const [batchRegening, setBatchRegening] = useState(false);
const [batchDeleteOpen, setBatchDeleteOpen] = useState(false);
const [batchDeleting, setBatchDeleting] = useState(false);
const [batchDeleteSource, setBatchDeleteSource] = useState(false);
const [deleteTarget, setDeleteTarget] = useState<api.AdminVideo | null>(null);
const [deleting, setDeleting] = useState(false);
const [deleteSource, setDeleteSource] = useState(false);
const pageSize = useVideosPageSize();
const { show } = useToast();
@@ -100,6 +102,7 @@ export function VideosPage() {
async function handleBatchDelete() {
if (selectedIds.size === 0) return;
setBatchDeleteSource(false);
setBatchDeleteOpen(true);
}
@@ -127,14 +130,15 @@ export function VideosPage() {
const target = deleteTarget;
setDeleting(true);
try {
const result = await api.deleteVideo(target.id);
const result = await api.deleteVideo(target.id, { deleteSource });
setDeleteTarget(null);
setDeleteSource(false);
setSelectedIds((ids) => {
const next = new Set(ids);
next.delete(target.id);
return next;
});
show(result.deletedSource ? "已删除视频,并清理 91Spider 源文件" : "已删除视频", "success");
show(result.deletedSource ? "已删除视频,并清理源文件" : "已删除视频", "success");
if (listItems.length === 1 && page > 1) {
setPage((p) => Math.max(1, p - 1));
} else {
@@ -156,7 +160,7 @@ export function VideosPage() {
let deletedSources = 0;
for (const id of ids) {
try {
const result = await api.deleteVideo(id);
const result = await api.deleteVideo(id, { deleteSource: batchDeleteSource });
success++;
if (result.deletedSource) deletedSources++;
} catch {
@@ -165,13 +169,14 @@ export function VideosPage() {
}
const failed = ids.length - success;
if (failed === 0) {
const extra = deletedSources > 0 ? `,其中 ${deletedSources} 个清理了 91Spider 源文件` : "";
const extra = deletedSources > 0 ? `,其中 ${deletedSources} 个清理了源文件` : "";
show(`批量删除完成,成功 ${success}${extra}`, "success");
} else {
show(`批量删除完成,成功 ${success} / ${ids.length} 个,失败 ${failed}`, success > 0 ? "info" : "error");
}
setSelectedIds(new Set());
setBatchDeleteOpen(false);
setBatchDeleteSource(false);
if (success >= listItems.length && page > 1) {
setPage((p) => Math.max(1, p - 1));
} else {
@@ -363,7 +368,15 @@ export function VideosPage() {
<button type="button" className="admin-btn" onClick={() => handleRegen(v)} title="重生预览视频">
<RefreshCw size={13} />
</button>{" "}
<button type="button" className="admin-btn is-danger" onClick={() => setDeleteTarget(v)} title="删除视频">
<button
type="button"
className="admin-btn is-danger"
onClick={() => {
setDeleteSource(false);
setDeleteTarget(v);
}}
title="删除视频"
>
<Trash2 size={13} />
</button>
</td>
@@ -443,10 +456,26 @@ export function VideosPage() {
modalClassName="admin-modal--delete-confirm"
loading={deleting}
onCancel={() => {
if (!deleting) setDeleteTarget(null);
if (!deleting) {
setDeleteTarget(null);
setDeleteSource(false);
}
}}
onConfirm={confirmDeleteVideo}
>
<label className="admin-delete-source-option">
<input
type="checkbox"
checked={deleteSource}
disabled={deleting}
onChange={(e) => setDeleteSource(e.target.checked)}
/>
<span>
<strong></strong>
<small></small>
</span>
</label>
</ConfirmModal>
<ConfirmModal
open={batchDeleteOpen}
title="批量删除视频"
@@ -457,10 +486,26 @@ export function VideosPage() {
modalClassName="admin-modal--delete-confirm"
loading={batchDeleting}
onCancel={() => {
if (!batchDeleting) setBatchDeleteOpen(false);
if (!batchDeleting) {
setBatchDeleteOpen(false);
setBatchDeleteSource(false);
}
}}
onConfirm={confirmBatchDelete}
>
<label className="admin-delete-source-option">
<input
type="checkbox"
checked={batchDeleteSource}
disabled={batchDeleting}
onChange={(e) => setBatchDeleteSource(e.target.checked)}
/>
<span>
<strong></strong>
<small></small>
</span>
</label>
</ConfirmModal>
</section>
);
}
+8 -2
View File
@@ -121,6 +121,8 @@ export type DriveGenerationStatus = {
cooldownUntil?: string;
scannedCount: number;
addedCount: number;
doneCount: number;
totalCount: number;
};
export function listDrives() {
@@ -206,6 +208,7 @@ export type AdminCrawler = {
thumbnailGenerationStatus?: DriveGenerationStatus;
previewGenerationStatus?: DriveGenerationStatus;
fingerprintGenerationStatus?: DriveGenerationStatus;
uploadGenerationStatus?: DriveGenerationStatus;
thumbnailReadyCount: number;
thumbnailPendingCount: number;
thumbnailFailedCount: number;
@@ -483,10 +486,13 @@ export function updateVideo(id: string, body: UpdateVideoInput) {
});
}
export function deleteVideo(id: string) {
export function deleteVideo(id: string, options: { deleteSource?: boolean } = {}) {
return request<{ ok: boolean; deletedSource: boolean }>(
`/videos/${encodeURIComponent(id)}`,
{ method: "DELETE" }
{
method: "DELETE",
body: JSON.stringify({ deleteSource: !!options.deleteSource }),
}
);
}
+3 -2
View File
@@ -72,6 +72,7 @@ export function nightlyBusyText(status: { running: boolean; queued: boolean }) {
export function generationStateLabel(state: string): string {
if (state === "scanning") return "扫盘中";
if (state === "uploading") return "上传中";
if (state === "generating") return "生成中";
if (state === "cooling") return "冷却中";
if (state === "queued") return "排队中";
@@ -79,8 +80,8 @@ export function generationStateLabel(state: string): string {
}
export function generationStateClass(state: string): string {
if (state === "scanning" || state === "generating" || state === "cooling" || state === "queued") {
if (state === "scanning") return "generating";
if (state === "scanning" || state === "uploading" || state === "generating" || state === "cooling" || state === "queued") {
if (state === "scanning" || state === "uploading") return "generating";
return state;
}
return "idle";
+21 -2
View File
@@ -1,12 +1,14 @@
import { useEffect, useState } from "react";
import { EyeOff, ThumbsDown, ThumbsUp } from "lucide-react";
import { EyeOff, ThumbsDown, ThumbsUp, Trash2 } from "lucide-react";
import type { VideoDetail } from "@/types";
import { formatCount } from "@/lib/format";
type Props = {
video: VideoDetail;
onHideVideo: () => void;
onDeleteVideo: () => void;
hideSaving?: boolean;
deleteSaving?: boolean;
};
/**
@@ -19,7 +21,13 @@ type Props = {
* - state
* -
*/
export function VideoActions({ video, onHideVideo, hideSaving }: Props) {
export function VideoActions({
video,
onHideVideo,
onDeleteVideo,
hideSaving,
deleteSaving,
}: Props) {
const [likes, setLikes] = useState(video.likes ?? 0);
const [dislikes, setDislikes] = useState(video.dislikes ?? 0);
const [bursting, setBursting] = useState(false);
@@ -121,6 +129,17 @@ export function VideoActions({ video, onHideVideo, hideSaving }: Props) {
<EyeOff size={16} />
<span>{hideSaving ? "处理中" : "不再显示"}</span>
</button>
<button
type="button"
className="vd-actions__btn vd-actions__delete"
onClick={onDeleteVideo}
disabled={deleteSaving}
aria-label="删除这个视频"
>
<Trash2 size={16} />
<span>{deleteSaving ? "删除中" : "删除"}</span>
</button>
</div>
);
}
+13
View File
@@ -52,6 +52,19 @@ export function hideVideo(id: string): Promise<{ ok: boolean }> {
);
}
export function deleteVideo(
id: string,
options: { deleteSource?: boolean } = {}
): Promise<{ ok: boolean; deletedSource: boolean }> {
return apiJSON<{ ok: boolean; deletedSource: boolean }>(
`/admin/api/videos/${encodeURIComponent(id)}`,
{
method: "DELETE",
body: JSON.stringify({ deleteSource: !!options.deleteSource }),
}
);
}
export function recordView(id: string): Promise<{ views: number }> {
return apiJSON<{ views: number }>(
`/api/video/${encodeURIComponent(id)}/view`,
+90
View File
@@ -7,6 +7,7 @@ import { VideoMetaHeader } from "@/components/VideoMetaHeader";
import { VideoInfoPanel } from "@/components/VideoInfoPanel";
import { RecommendedRail } from "@/components/RecommendedRail";
import {
deleteVideo,
fetchTags,
fetchVideoDetail,
hideVideo,
@@ -23,6 +24,10 @@ export default function VideoDetailPage() {
const [loading, setLoading] = useState(true);
const [tagSaving, setTagSaving] = useState(false);
const [hideSaving, setHideSaving] = useState(false);
const [deleteOpen, setDeleteOpen] = useState(false);
const [deleteSource, setDeleteSource] = useState(false);
const [deleteSaving, setDeleteSaving] = useState(false);
const [deleteError, setDeleteError] = useState("");
const detailTopRef = useRef<HTMLDivElement | null>(null);
useEffect(() => {
@@ -76,6 +81,36 @@ export default function VideoDetailPage() {
}
}
function handleOpenDelete() {
if (!detail || deleteSaving) return;
setDeleteSource(false);
setDeleteError("");
setDeleteOpen(true);
}
function handleCloseDelete() {
if (deleteSaving) return;
setDeleteOpen(false);
setDeleteError("");
}
async function handleConfirmDelete() {
if (!detail || deleteSaving) return;
setDeleteSaving(true);
setDeleteError("");
try {
await deleteVideo(detail.id, { deleteSource });
navigate("/list", { replace: true });
} catch {
setDeleteError(
deleteSource
? "删除失败。源文件未能删除时,管理库记录会保留。"
: "删除失败,请稍后重试。"
);
setDeleteSaving(false);
}
}
function handleFirstPlay() {
if (!detail) return;
// 失败静默忽略,不打扰用户播放体验
@@ -199,7 +234,9 @@ export default function VideoDetailPage() {
<VideoActions
video={detail}
onHideVideo={handleHideVideo}
onDeleteVideo={handleOpenDelete}
hideSaving={hideSaving}
deleteSaving={deleteSaving}
/>
</section>
@@ -215,6 +252,59 @@ export default function VideoDetailPage() {
</div>
</div>
</div>
{deleteOpen && (
<div className="vd-delete-modal" role="presentation">
<div
className="vd-delete-dialog"
role="dialog"
aria-modal="true"
aria-labelledby="vd-delete-title"
>
<div className="vd-delete-head">
<h2 id="vd-delete-title" className="vd-delete-title">
</h2>
<p className="vd-delete-text">
{detail.title}
</p>
</div>
<label className="vd-delete-option">
<input
type="checkbox"
checked={deleteSource}
disabled={deleteSaving}
onChange={(e) => setDeleteSource(e.target.checked)}
/>
<span>
<strong></strong>
</span>
</label>
{deleteError && <div className="vd-delete-error">{deleteError}</div>}
<div className="vd-delete-actions">
<button
type="button"
className="vd-delete-action vd-delete-cancel"
onClick={handleCloseDelete}
disabled={deleteSaving}
>
</button>
<button
type="button"
className="vd-delete-action vd-delete-confirm"
onClick={handleConfirmDelete}
disabled={deleteSaving}
>
{deleteSaving ? "删除中..." : "删除"}
</button>
</div>
</div>
</div>
)}
</AppShell>
);
}
+44
View File
@@ -1871,6 +1871,50 @@
line-height: 1.7;
}
.admin-delete-source-option {
display: grid;
grid-template-columns: 18px minmax(0, 1fr);
gap: var(--space-2);
align-items: start;
margin-top: var(--space-3);
padding: var(--space-3);
border: 1px solid var(--border-subtle);
border-radius: var(--radius-sm);
background: var(--bg-sunken);
color: var(--text-default);
cursor: pointer;
}
.admin-delete-source-option input {
width: 16px;
height: 16px;
margin: 2px 0 0;
accent-color: var(--danger);
}
.admin-delete-source-option span {
min-width: 0;
display: grid;
gap: 2px;
}
.admin-delete-source-option strong {
color: var(--text-strong);
font-size: var(--font-sm);
font-weight: var(--weight-semibold);
}
.admin-delete-source-option small {
color: var(--text-muted);
font-size: var(--font-xs);
line-height: var(--line-relaxed);
}
.admin-delete-source-option:has(input:disabled) {
cursor: default;
opacity: 0.72;
}
/* =========================================================
* Toast
* ========================================================= */
+207 -1
View File
@@ -751,7 +751,7 @@
}
}
/* "不再显示"按钮:克制、hover 才露出 danger */
/* 次要操作按钮:克制、hover 才露出 danger */
.vd-actions__btn {
display: inline-flex;
align-items: center;
@@ -788,6 +788,182 @@
margin-left: auto;
}
/* ---------- Delete confirm modal ---------- */
.vd-delete-modal {
position: fixed;
inset: 0;
z-index: var(--z-modal);
display: grid;
place-items: center;
padding: var(--space-4);
background: var(--bg-overlay);
backdrop-filter: blur(8px);
animation: vd-delete-fade var(--duration-fast) var(--ease-out);
}
@keyframes vd-delete-fade {
from {
opacity: 0;
}
to {
opacity: 1;
}
}
.vd-delete-dialog {
width: min(460px, 100%);
max-height: 90vh;
overflow: auto;
padding: var(--space-5);
border: 1px solid var(--border-default);
border-radius: var(--radius-sm);
background: var(--bg-surface);
color: var(--text-default);
box-shadow: var(--shadow-xl);
animation: vd-delete-pop var(--duration-normal) var(--ease-out);
}
@keyframes vd-delete-pop {
from {
opacity: 0;
transform: translateY(8px) scale(0.98);
}
to {
opacity: 1;
transform: none;
}
}
.vd-delete-head {
display: grid;
gap: var(--space-2);
margin-bottom: var(--space-4);
}
.vd-delete-title {
margin: 0;
color: var(--text-strong);
font-size: var(--font-xl);
font-weight: var(--weight-bold);
line-height: var(--line-tight);
letter-spacing: 0;
}
.vd-delete-text {
margin: 0;
color: var(--text-default);
font-size: var(--font-md);
line-height: var(--line-relaxed);
overflow-wrap: anywhere;
}
.vd-delete-option {
display: grid;
grid-template-columns: 18px minmax(0, 1fr);
gap: var(--space-2);
align-items: start;
padding: var(--space-3);
border: 1px solid var(--border-subtle);
border-radius: var(--radius-sm);
background: var(--bg-sunken);
cursor: pointer;
}
.vd-delete-option input {
width: 16px;
height: 16px;
margin: 2px 0 0;
accent-color: var(--danger);
}
.vd-delete-option span {
min-width: 0;
display: grid;
gap: 2px;
}
.vd-delete-option strong {
color: var(--text-strong);
font-size: var(--font-sm);
font-weight: var(--weight-semibold);
}
.vd-delete-option small {
color: var(--text-muted);
font-size: var(--font-xs);
line-height: var(--line-relaxed);
}
.vd-delete-option:has(input:disabled) {
cursor: default;
opacity: 0.72;
}
.vd-delete-error {
margin-top: var(--space-3);
padding: var(--space-3);
border: 1px solid rgba(241, 85, 108, 0.3);
border-radius: var(--radius-sm);
background: var(--danger-soft);
color: var(--danger);
font-size: var(--font-sm);
line-height: var(--line-relaxed);
}
.vd-delete-actions {
display: flex;
justify-content: flex-end;
gap: var(--space-2);
margin-top: var(--space-5);
}
.vd-delete-action {
display: inline-flex;
align-items: center;
justify-content: center;
min-width: 86px;
height: 40px;
padding: 0 var(--space-4);
border: 1px solid var(--border-default);
border-radius: var(--radius-sm);
background: var(--bg-elevated);
color: var(--text-default);
font-size: var(--font-sm);
font-weight: var(--weight-semibold);
cursor: pointer;
transition: all var(--transition-fast);
}
.vd-delete-action:hover:not(:disabled) {
border-color: var(--border-strong);
color: var(--text-strong);
}
.vd-delete-action:focus-visible {
outline: 2px solid var(--accent);
outline-offset: 2px;
}
.vd-delete-action:disabled {
opacity: 0.55;
cursor: not-allowed;
}
.vd-delete-confirm {
border-color: var(--danger);
background: var(--danger);
color: #fff;
}
.vd-delete-confirm:hover:not(:disabled) {
border-color: var(--danger);
background: var(--danger);
color: #fff;
filter: brightness(1.04);
}
/* ---------- Info card (description + tags) ---------- */
.vd-info {
display: flex;
@@ -1743,6 +1919,36 @@
display: none;
}
.vd-actions__delete {
width: 44px;
padding: 0;
justify-content: center;
flex: 0 0 auto;
}
.vd-actions__delete span {
display: none;
}
.vd-delete-modal {
align-items: end;
padding: var(--space-3);
}
.vd-delete-dialog {
width: 100%;
padding: var(--space-4);
}
.vd-delete-actions {
display: grid;
grid-template-columns: 1fr 1fr;
}
.vd-delete-action {
width: 100%;
}
.vd-info__desc {
padding: var(--space-3);
font-size: var(--font-base);