fix: improve 91Spider tagging and deduped tag filters

This commit is contained in:
nianzhibai
2026-06-01 18:51:56 +08:00
parent e01b7cc3b7
commit e36a17f99d
6 changed files with 394 additions and 19 deletions
+5 -4
View File
@@ -857,13 +857,14 @@ func (a *App) attachSpider91Crawler(d *catalog.Drive, drv *spider91.Driver) {
a.spider91Crawlers[driveID] = c
a.mu.Unlock()
// 确保 "91porn" 系统标签存在,并把已入库的 spider91 视频按 author 字段
// 匹配补打这个标签(CreateTagAndClassify 内部对所有视频走一遍 classify)。
// 重复调用是幂等的:tags 用 INSERT OR IGNORE,video_tags 也是 INSERT OR IGNORE。
// 确保 "91porn" 系统标签存在,并按 spider91 来源前缀给历史视频补打。
// 不能只靠文本匹配:老版本入库的视频可能没有 author/tags 字段,但 id 前缀
// "spider91-<driveID>-" 会一直保留,即使后续迁移到 PikPak/115 也不变
bgCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
go func() {
defer cancel()
if _, err := a.cat.CreateTagAndClassify(bgCtx, spider91.DefaultTag, nil, "system"); err != nil {
prefix := "spider91-" + driveID + "-"
if _, err := a.cat.EnsureTagForVideoIDPrefix(bgCtx, prefix, spider91.DefaultTag, nil, "system"); err != nil {
log.Printf("[spider91] ensure %q tag: %v", spider91.DefaultTag, err)
}
}()
+1 -6
View File
@@ -853,12 +853,7 @@ func (c *Catalog) ListVideos(ctx context.Context, p ListParams) ([]*Video, int,
args = append(args, p.DriveID)
}
if p.Tag != "" {
where = append(where, `EXISTS (
SELECT 1
FROM video_tags vt
JOIN tags t ON t.id = vt.tag_id
WHERE vt.video_id = videos.id AND t.label = ? COLLATE NOCASE
)`)
where = append(where, videoMatchesTagLabelSQL("videos"))
args = append(args, p.Tag)
}
if p.Category != "" && p.Category != "all" {
+170 -3
View File
@@ -403,6 +403,57 @@ func (c *Catalog) CreateTagAndClassify(ctx context.Context, label string, aliase
return c.classifyTag(ctx, tag)
}
// EnsureTagForVideoIDPrefix makes sure the tag described by label/aliases/source
// exists (via ensureTag) and attaches it to every video whose id starts with
// prefix.
//
// Videos are skipped when their tags_manual column is non-zero or when they
// already carry the tag. Each remaining video gets a video_tags row with
// source "auto", followed by a syncVideoTagsJSON call for that video.
//
// The prefix is matched literally: LIKE metacharacters ('%', '_') and the
// escape character itself are escaped before the query runs, so a prefix such
// as "spider91-my_drive-" cannot match ids belonging to "spider91-myXdrive-".
//
// It returns the number of videos that were tagged. On any error it returns 0
// together with the error; tags written before the failure are not rolled back.
func (c *Catalog) EnsureTagForVideoIDPrefix(ctx context.Context, prefix, label string, aliases []string, source string) (int, error) {
	prefix = strings.TrimSpace(prefix)
	if prefix == "" {
		return 0, errors.New("video id prefix is required")
	}
	tag, err := c.ensureTag(ctx, label, aliases, source)
	if err != nil {
		return 0, err
	}

	// Build a LIKE pattern in which only the trailing '%' is a wildcard.
	// The backslash must be escaped first so it stays a literal character.
	pattern := strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`).Replace(prefix) + "%"

	rows, err := c.db.QueryContext(ctx, `
SELECT v.id
FROM videos v
WHERE v.id LIKE ? ESCAPE '\'
AND COALESCE(v.tags_manual, 0) = 0
AND NOT EXISTS (
SELECT 1
FROM video_tags vt
WHERE vt.video_id = v.id
AND vt.tag_id = ?
)
ORDER BY v.id ASC`, pattern, tag.ID)
	if err != nil {
		return 0, err
	}

	// Collect all ids and release the result set before issuing any writes.
	var videoIDs []string
	for rows.Next() {
		var videoID string
		if err := rows.Scan(&videoID); err != nil {
			rows.Close()
			return 0, err
		}
		videoIDs = append(videoIDs, videoID)
	}
	if err := rows.Err(); err != nil {
		rows.Close()
		return 0, err
	}
	if err := rows.Close(); err != nil {
		return 0, err
	}

	for _, videoID := range videoIDs {
		if err := c.insertVideoTag(ctx, videoID, tag.ID, "auto"); err != nil {
			return 0, err
		}
		if err := c.syncVideoTagsJSON(ctx, videoID, false); err != nil {
			return 0, err
		}
	}
	return len(videoIDs), nil
}
func (c *Catalog) DeleteTag(ctx context.Context, tagID int64) (int, error) {
tx, err := c.db.BeginTx(ctx, nil)
if err != nil {
@@ -464,10 +515,66 @@ func (c *Catalog) DeleteTag(ctx context.Context, tagID int64) (int, error) {
func (c *Catalog) ListTags(ctx context.Context) ([]Tag, error) {
rows, err := c.db.QueryContext(ctx, `
SELECT t.id, t.label, t.aliases, t.source, COUNT(v.id) AS cnt
WITH tagged_tags AS (
SELECT vt.tag_id,
tagged.id,
COALESCE(tagged.content_hash, '') AS content_hash,
COALESCE(tagged.sampled_sha256, '') AS sampled_sha256,
tagged.size_bytes,
COALESCE(tagged.file_name, '') AS file_name
FROM video_tags vt
JOIN videos tagged ON tagged.id = vt.video_id
WHERE COALESCE(tagged.hidden, 0) = 0
),
tag_candidates AS (
SELECT tag_id, id AS video_id
FROM tagged_tags
UNION ALL
SELECT tag_id,
(SELECT canonical.id
FROM videos canonical
WHERE tagged_tags.content_hash != ''
AND canonical.content_hash = tagged_tags.content_hash
AND COALESCE(canonical.content_hash, '') != ''
ORDER BY canonical.created_at ASC, canonical.id ASC
LIMIT 1) AS video_id
FROM tagged_tags
WHERE content_hash != ''
UNION ALL
SELECT tag_id,
(SELECT canonical.id
FROM videos canonical
WHERE tagged_tags.sampled_sha256 != ''
AND tagged_tags.size_bytes > 0
AND canonical.sampled_sha256 = tagged_tags.sampled_sha256
AND canonical.size_bytes = tagged_tags.size_bytes
AND COALESCE(canonical.sampled_sha256, '') != ''
AND canonical.size_bytes > 0
ORDER BY canonical.created_at ASC, canonical.id ASC
LIMIT 1) AS video_id
FROM tagged_tags
WHERE sampled_sha256 != '' AND size_bytes > 0
UNION ALL
SELECT tag_id,
(SELECT canonical.id
FROM videos canonical
WHERE tagged_tags.file_name != ''
AND tagged_tags.size_bytes > 0
AND canonical.file_name = tagged_tags.file_name
AND canonical.size_bytes = tagged_tags.size_bytes
AND COALESCE(canonical.file_name, '') != ''
AND canonical.size_bytes > 0
ORDER BY canonical.created_at ASC, canonical.id ASC
LIMIT 1) AS video_id
FROM tagged_tags
WHERE file_name != '' AND size_bytes > 0
)
SELECT t.id, t.label, t.aliases, t.source, COUNT(DISTINCT videos.id) AS cnt
FROM tags t
LEFT JOIN video_tags vt ON vt.tag_id = t.id
LEFT JOIN videos v ON v.id = vt.video_id AND COALESCE(v.hidden, 0) = 0
LEFT JOIN tag_candidates tc ON tc.tag_id = t.id AND tc.video_id IS NOT NULL
LEFT JOIN videos ON videos.id = tc.video_id
AND COALESCE(videos.hidden, 0) = 0
AND `+uniqueVideoWhereSQL+`
GROUP BY t.id, t.label, t.aliases, t.source
ORDER BY cnt DESC, t.label ASC`)
if err != nil {
@@ -485,6 +592,66 @@ ORDER BY cnt DESC, t.label ASC`)
return out, nil
}
// videoMatchesTagLabelSQL returns a SQL predicate of the form
// "<videoAlias>.id IN (...)" with exactly one "?" placeholder for the tag
// label (compared with COLLATE NOCASE).
//
// The subquery yields the ids of non-hidden videos carrying the tag, plus, for
// each of those videos, the earliest-created video (ties broken by id) that
// shares its content_hash, its sampled_sha256 + size_bytes, or its
// file_name + size_bytes.
func videoMatchesTagLabelSQL(videoAlias string) string {
	const tagFilterTemplate = `%s.id IN (
WITH tagged_videos AS (
SELECT tagged.id,
COALESCE(tagged.content_hash, '') AS content_hash,
COALESCE(tagged.sampled_sha256, '') AS sampled_sha256,
tagged.size_bytes,
COALESCE(tagged.file_name, '') AS file_name
FROM video_tags vt
JOIN tags tag_filter ON tag_filter.id = vt.tag_id
JOIN videos tagged ON tagged.id = vt.video_id
WHERE tag_filter.label = ? COLLATE NOCASE
AND COALESCE(tagged.hidden, 0) = 0
),
tag_candidates AS (
SELECT id AS video_id
FROM tagged_videos
UNION ALL
SELECT (SELECT canonical.id
FROM videos canonical
WHERE tagged_videos.content_hash != ''
AND canonical.content_hash = tagged_videos.content_hash
AND COALESCE(canonical.content_hash, '') != ''
ORDER BY canonical.created_at ASC, canonical.id ASC
LIMIT 1) AS video_id
FROM tagged_videos
WHERE content_hash != ''
UNION ALL
SELECT (SELECT canonical.id
FROM videos canonical
WHERE tagged_videos.sampled_sha256 != ''
AND tagged_videos.size_bytes > 0
AND canonical.sampled_sha256 = tagged_videos.sampled_sha256
AND canonical.size_bytes = tagged_videos.size_bytes
AND COALESCE(canonical.sampled_sha256, '') != ''
AND canonical.size_bytes > 0
ORDER BY canonical.created_at ASC, canonical.id ASC
LIMIT 1) AS video_id
FROM tagged_videos
WHERE sampled_sha256 != '' AND size_bytes > 0
UNION ALL
SELECT (SELECT canonical.id
FROM videos canonical
WHERE tagged_videos.file_name != ''
AND tagged_videos.size_bytes > 0
AND canonical.file_name = tagged_videos.file_name
AND canonical.size_bytes = tagged_videos.size_bytes
AND COALESCE(canonical.file_name, '') != ''
AND canonical.size_bytes > 0
ORDER BY canonical.created_at ASC, canonical.id ASC
LIMIT 1) AS video_id
FROM tagged_videos
WHERE file_name != '' AND size_bytes > 0
)
SELECT video_id
FROM tag_candidates
WHERE video_id IS NOT NULL
)`
	return fmt.Sprintf(tagFilterTemplate, videoAlias)
}
func (c *Catalog) SetManualVideoTags(ctx context.Context, videoID string, labels []string) error {
if _, err := c.GetVideo(ctx, videoID); err != nil {
return err
+161
View File
@@ -326,6 +326,75 @@ func TestCreateTagAndClassifyRestoresDeletedTag(t *testing.T) {
}
}
// TestEnsureTagForVideoIDPrefixBackfillsSourceTag seeds three videos: one
// matching the prefix, one matching the prefix but marked as manually tagged,
// and one with a different prefix. It then checks that
// EnsureTagForVideoIDPrefix reports one added tag, that only the first video
// ends up with "91porn", and that the other two keep an empty tag list.
func TestEnsureTagForVideoIDPrefixBackfillsSourceTag(t *testing.T) {
	ctx := context.Background()
	cat, err := Open(t.TempDir() + "/catalog.db")
	if err != nil {
		t.Fatalf("open catalog: %v", err)
	}
	t.Cleanup(func() {
		if closeErr := cat.Close(); closeErr != nil {
			t.Fatalf("close catalog: %v", closeErr)
		}
	})

	const (
		backfilledID = "spider91-91-spider-1200001"
		manualID     = "spider91-91-spider-1200002"
		foreignID    = "spider91-other-1200003"
	)

	type seedVideo struct {
		id     string
		manual bool
	}
	seededAt := time.Now()
	seeds := []seedVideo{
		{id: backfilledID},
		{id: manualID, manual: true},
		{id: foreignID},
	}
	for _, s := range seeds {
		video := &Video{
			ID:          s.id,
			DriveID:     "91-spider",
			FileID:      s.id + ".mp4",
			Title:       "legacy title without source text",
			PublishedAt: seededAt,
			CreatedAt:   seededAt,
			UpdatedAt:   seededAt,
		}
		if err := cat.UpsertVideo(ctx, video); err != nil {
			t.Fatalf("seed %s: %v", s.id, err)
		}
		if !s.manual {
			continue
		}
		// Setting manual tags (even an empty list) marks the video as manual.
		if err := cat.SetManualVideoTags(ctx, s.id, nil); err != nil {
			t.Fatalf("mark %s manual: %v", s.id, err)
		}
	}

	added, err := cat.EnsureTagForVideoIDPrefix(ctx, "spider91-91-spider-", "91porn", nil, "system")
	if err != nil {
		t.Fatalf("ensure prefix tag: %v", err)
	}
	if added != 1 {
		t.Fatalf("added = %d, want 1", added)
	}

	// The plain prefix match must have received the tag.
	got, err := cat.GetVideo(ctx, backfilledID)
	if err != nil {
		t.Fatalf("get tagged video: %v", err)
	}
	if !sameStrings(got.Tags, []string{"91porn"}) {
		t.Fatalf("tagged video tags = %#v, want 91porn", got.Tags)
	}

	// The manually tagged video must be left alone.
	manual, err := cat.GetVideo(ctx, manualID)
	if err != nil {
		t.Fatalf("get manual video: %v", err)
	}
	if len(manual.Tags) != 0 {
		t.Fatalf("manual video tags = %#v, want unchanged", manual.Tags)
	}

	// A video under a different prefix must be left alone.
	other, err := cat.GetVideo(ctx, foreignID)
	if err != nil {
		t.Fatalf("get other prefix video: %v", err)
	}
	if len(other.Tags) != 0 {
		t.Fatalf("other prefix video tags = %#v, want unchanged", other.Tags)
	}
}
func TestDeleteTagRejectsSystemTags(t *testing.T) {
ctx := context.Background()
cat, err := Open(t.TempDir() + "/catalog.db")
@@ -937,6 +1006,98 @@ func TestListVideosHidesDuplicateContentHashes(t *testing.T) {
}
}
// TestTagFilterMatchesCanonicalDuplicateVideo seeds an untagged video plus
// three videos tagged "91porn". Two of the tagged videos get the same
// fingerprint and size as the untagged one; the third gets a unique
// fingerprint. The test expects ListVideos filtered by "91porn" to return
// exactly "pikpak-canonical" and "spider91-visible", and the tag's Count to
// be 2.
func TestTagFilterMatchesCanonicalDuplicateVideo(t *testing.T) {
	ctx := context.Background()
	cat, err := Open(t.TempDir() + "/catalog.db")
	if err != nil {
		t.Fatalf("open catalog: %v", err)
	}
	t.Cleanup(func() {
		if closeErr := cat.Close(); closeErr != nil {
			t.Fatalf("close catalog: %v", closeErr)
		}
	})

	// Timestamps are staggered so the untagged video is the oldest.
	now := time.Now()
	plus1 := now.Add(time.Second)
	plus2 := now.Add(2 * time.Second)
	plus3 := now.Add(3 * time.Second)

	seeds := []*Video{
		{
			ID:          "pikpak-canonical",
			DriveID:     "pikpak",
			FileID:      "canonical.mp4",
			Title:       "Canonical",
			Size:        1024,
			PublishedAt: now,
			CreatedAt:   now,
			UpdatedAt:   now,
		},
		{
			ID:          "spider91-dup-1",
			DriveID:     "91-spider",
			FileID:      "dup-1.mp4",
			Title:       "Spider duplicate 1",
			Tags:        []string{"91porn"},
			Size:        1024,
			PublishedAt: plus1,
			CreatedAt:   plus1,
			UpdatedAt:   plus1,
		},
		{
			ID:          "spider91-dup-2",
			DriveID:     "91-spider",
			FileID:      "dup-2.mp4",
			Title:       "Spider duplicate 2",
			Tags:        []string{"91porn"},
			Size:        1024,
			PublishedAt: plus2,
			CreatedAt:   plus2,
			UpdatedAt:   plus2,
		},
		{
			ID:          "spider91-visible",
			DriveID:     "91-spider",
			FileID:      "visible.mp4",
			Title:       "Spider visible",
			Tags:        []string{"91porn"},
			Size:        2048,
			PublishedAt: plus3,
			CreatedAt:   plus3,
			UpdatedAt:   plus3,
		},
	}
	for _, seed := range seeds {
		if err := cat.UpsertVideo(ctx, seed); err != nil {
			t.Fatalf("seed %s: %v", seed.ID, err)
		}
	}

	// Give the canonical video and both duplicates the same fingerprint.
	sharedFingerprintIDs := []string{"pikpak-canonical", "spider91-dup-1", "spider91-dup-2"}
	for _, videoID := range sharedFingerprintIDs {
		if err := cat.UpdateVideoFingerprint(ctx, videoID, "same-sampled-sha256", "ready", ""); err != nil {
			t.Fatalf("fingerprint %s: %v", videoID, err)
		}
	}
	if err := cat.UpdateVideoFingerprint(ctx, "spider91-visible", "unique-sampled-sha256", "ready", ""); err != nil {
		t.Fatalf("fingerprint visible: %v", err)
	}

	items, total, err := cat.ListVideos(ctx, ListParams{Tag: "91porn", Page: 1, PageSize: 10})
	if err != nil {
		t.Fatalf("list videos by tag: %v", err)
	}
	if total != 2 || len(items) != 2 {
		t.Fatalf("tagged videos total=%d len=%d, want 2", total, len(items))
	}

	gotIDs := map[string]bool{}
	for i := range items {
		gotIDs[items[i].ID] = true
	}
	for _, want := range []string{"pikpak-canonical", "spider91-visible"} {
		if !gotIDs[want] {
			t.Fatalf("tagged video ids = %#v, want %s", gotIDs, want)
		}
	}

	// The tag listing must agree with the filtered video listing.
	if got := mustTagByLabel(t, ctx, cat, "91porn").Count; got != 2 {
		t.Fatalf("91porn count = %d, want 2 visible canonical videos", got)
	}
}
func TestListVideosCanFilterReadyThumbnails(t *testing.T) {
ctx := context.Background()
cat, err := Open(t.TempDir() + "/catalog.db")
+33 -5
View File
@@ -533,6 +533,17 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
}
}
title := strings.TrimSpace(item.Title)
if title == "" {
title = sourceID
}
tags := []string{DefaultTag}
if matched, err := c.cfg.Catalog.MatchTags(ctx, title+" "+DefaultAuthor); err == nil {
tags = mergeCatalogTags(tags, matched)
} else {
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s match tags: %v", c.cfg.Driver.ID(), viewkey, sourceID, err)
}
// 入库
now := time.Now()
v := &catalog.Video{
@@ -540,9 +551,9 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
DriveID: c.cfg.Driver.ID(),
FileID: videoFile,
FileName: videoFile,
Title: strings.TrimSpace(item.Title),
Title: title,
Author: DefaultAuthor,
Tags: []string{DefaultTag},
Tags: tags,
Ext: strings.TrimPrefix(videoExt, "."),
Quality: "HD",
Size: videoSize,
@@ -551,9 +562,6 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
CreatedAt: now,
UpdatedAt: now,
}
if v.Title == "" {
v.Title = sourceID
}
if thumbReady {
// 设了 ThumbnailURL 后 thumb worker 会跳过这条视频,
// 不再尝试用 ffmpeg 抽帧(封面已经是网站原图)。
@@ -994,6 +1002,26 @@ func copyFileAtomic(src, dst string) error {
return os.Rename(tmp, dst)
}
// mergeCatalogTags flattens the given tag lists into a single slice,
// preserving first-seen order. Each tag is trimmed of surrounding whitespace;
// tags that are empty after trimming are dropped, and tags that differ only
// by letter case are treated as duplicates (the first spelling wins).
// The result is never nil.
func mergeCatalogTags(lists ...[]string) []string {
	merged := []string{}
	known := make(map[string]bool)
	for i := range lists {
		for j := range lists[i] {
			name := strings.TrimSpace(lists[i][j])
			if name == "" {
				continue
			}
			folded := strings.ToLower(name)
			if !known[folded] {
				known[folded] = true
				merged = append(merged, name)
			}
		}
	}
	return merged
}
// BuildVideoID 给定 driveID + 91 源视频 ID,按统一规则生成 catalog 中 videos.id。
// 与 scanner 用法一致:<kind>-<driveID>-<fileID>。
func BuildVideoID(driveID, sourceID string) string {
@@ -55,7 +55,7 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
// 同时仍写 --output 文件作归档。
videoEntries := []map[string]string{
{
"title": "Video One",
"title": "Video One 口交",
"thumb_url": srv.URL + "/thumb/not-120001.jpg",
"video_url": srv.URL + "/videos/120001.mp4",
"viewkey": "vk-001",
@@ -96,6 +96,9 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
}); err != nil {
t.Fatalf("upsert drive: %v", err)
}
if _, err := cat.CreateTagAndClassify(context.Background(), "Video One", nil, "user"); err != nil {
t.Fatalf("create user tag: %v", err)
}
var newVideos []*catalog.Video
c := NewCrawler(CrawlerConfig{
@@ -190,6 +193,17 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
if !hasDefaultTag {
t.Fatalf("video %s tags = %v, want contain %q", videoID, v.Tags, DefaultTag)
}
if sourceID == "120001" {
if !containsString(v.Tags, "口交") {
t.Fatalf("video %s tags = %v, want contain built-in tag 口交", videoID, v.Tags)
}
if !containsString(v.Tags, "Video One") {
t.Fatalf("video %s tags = %v, want contain user tag Video One", videoID, v.Tags)
}
}
if sourceID == "120002" && (containsString(v.Tags, "口交") || containsString(v.Tags, "Video One")) {
t.Fatalf("video %s tags = %v, should not inherit tags from other spider91 videos", videoID, v.Tags)
}
}
// 7. 第二次 RunOnce:源视频 ID 已存在 → 全部 skipped,无新文件下载
@@ -762,3 +776,12 @@ func buildFakeSpiderScript(entries []map[string]string) string {
sb.WriteString("fi\n")
return sb.String()
}
// containsString reports whether want appears in values, using exact
// (case-sensitive) string comparison.
func containsString(values []string, want string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == want {
			return true
		}
	}
	return false
}