fix: harden spider91 source matching

2026-06-15 08:45:41 +08:00 · 2026-05-28 16:10:20 +08:00
parent 7540371838
commit d2d4db8062
8 changed files with 904 additions and 165 deletions
@@ -24,7 +24,7 @@
 CLI 参数:
    --page N                  只爬第 N 页，配合 --output 用于手动调试
    --target-new N            从 page 1 起翻页直到凑够 N 个新视频（不在 seen 列表里的）
-    --seen-viewkeys-file FILE 每行一个已知 viewkey，命中即跳过；与 --target-new 配合使用
+    --seen-viewkeys-file FILE 每行一个已知 viewkey 或 mp4 源 ID，命中即跳过；与 --target-new 配合使用
    --output FILE             输出 JSON 路径，覆盖默认的 OUTPUT_FILE
    --no-resume               禁用断点续爬（单页/target-new 模式下自动禁用）
    --quiet                   压缩日志，每条视频只输出一行
@@ -44,6 +44,7 @@ CLI 参数:
          "thumb_url": "https://...thumb/xxxx.jpg",
          "video_url": "https://...mp43/xxxx.mp4?st=...",
          "viewkey": "abc123...",
+          "source_id": "xxxx",
          "detail_url": "https://...view_video.php?viewkey=..."
        },
        ...
@@ -69,7 +70,7 @@ import json
 import os
 import sys
 import html
-from urllib.parse import urljoin, unquote
+from urllib.parse import urljoin, unquote, urlparse
 from datetime import datetime

 try:
@@ -153,6 +154,9 @@ class Porn91Spider:
        """
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
+        # 91porn 没有固定 mode cookie 时，详情页首次请求可能返回与列表卡片
+        # 不一致的视频源；固定桌面模式让列表页和详情页解析保持一致。
+        self.session.cookies.set("mode", "d")

        # 解析后的实际配置；优先使用构造参数，回退到模块级配置
        self.output_file = output_file if output_file is not None else OUTPUT_FILE
@@ -164,7 +168,7 @@ class Porn91Spider:
        self.max_empty_pages = (
            MAX_EMPTY_PAGES if max_empty_pages is None else int(max_empty_pages)
        )
-        # target_new 是 backend 触发时的核心模式：累计处理这么多新 viewkey 后退出。
+        # target_new 是 backend 触发时的核心模式：累计处理这么多新源视频后退出。
        self.target_new = target_new if target_new and target_new > 0 else None
        self.quiet = bool(quiet)
        # stream_output：每解析出一个 video 直链立即输出一行 JSON 到 stdout
@@ -186,7 +190,7 @@ class Porn91Spider:
            self.session.mount("http://", adapter)
        except ImportError:
            pass  # urllib3 版本可能较低
-        
+
        self.results = []
        self.pages_crawled = 0
        self.processed_videos = 0
@@ -194,9 +198,8 @@ class Porn91Spider:
        self.failed_videos = 0
        self.skip_viewkeys = set()

-        # backend 通过 --seen-viewkeys-file 传进来一批已入库的 viewkey；
-        # 这里直接注入到 skip_viewkeys，让 _process_video_list 在看到这些 viewkey 时
-        # 跳过详情页请求，配合 --target-new 实现"凑够 N 个新"的语义。
+        # backend 通过 --seen-viewkeys-file 传进来一批已入库的历史 ID。
+        # 兼容旧名：文件里可能是 viewkey，也可能是新逻辑使用的 mp4 源 ID。
        if seen_viewkeys:
            for vk in seen_viewkeys:
                if not vk:
@@ -204,7 +207,7 @@ class Porn91Spider:
                vk = vk.strip()
                if vk:
                    self.skip_viewkeys.add(vk)
-        
+
        # 断点续爬：加载已有结果，跳过已处理的 viewkey
        if self.resume and os.path.exists(self.output_file):
            try:
@@ -257,12 +260,12 @@ class Porn91Spider:
        headers_extra = {}
        if referer:
            headers_extra["Referer"] = referer
-        
+
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                self.log(f"正在请求: {description or url} (尝试 {attempt}/{MAX_RETRIES})")
                response = self.session.get(url, timeout=30, headers=headers_extra)
-                
+
                # 检查是否被Cloudflare拦截 (需在 raise_for_status 之前)
                if response.status_code == 403:
                    self.log("警告: 收到 403 Forbidden，可能被拦截")
@@ -270,19 +273,19 @@ class Porn91Spider:
                        self.random_sleep(RETRY_DELAY, RETRY_DELAY + 3)
                        continue
                    return ""
-                
+
                response.raise_for_status()
-                
+
                # 优先使用 content.decode('utf-8')，避免 requests 编码检测问题
                try:
                    html_content = response.content.decode('utf-8', errors='replace')
                except Exception:
                    html_content = response.text
-                
+
                # Cloudflare 挑战检测：如果页面主要内容只有挑战页面，而非正常内容
                # 注意：网站本身会加载 challenge-platform 脚本，所以不能仅凭此判断
                is_cf_challenge = (
-                    "Just a moment" in html_content and 
+                    "Just a moment" in html_content and
                    len(html_content) < 8000
                )
                if is_cf_challenge:
@@ -291,7 +294,7 @@ class Porn91Spider:
                        self.random_sleep(RETRY_DELAY, RETRY_DELAY + 5)
                        continue
                    return ""
-                
+
                return html_content
            except requests.exceptions.HTTPError as e:
                self.log(f"HTTP错误: {e}")
@@ -315,50 +318,61 @@ class Porn91Spider:
        """
        videos = []
        soup = BeautifulSoup(html, 'lxml')
-        
-        # 查找所有视频链接 (每个视频会出现两次: wwtrx 和 awwtrx 的 c 参数不同)
-        # 通过 seen_viewkeys 去重
-        video_links = soup.find_all('a', href=re.compile(r'view_video\.php\?viewkey='))
-        
-        seen_viewkeys = set()
-        
-        for link in video_links:
+
+        # 只解析正常视频卡片。页面中还混有 col-lg-8 的异常大卡片，里面的标题、
+        # thumb、detail URL 会串到其它视频，不能作为入库来源。
+        video_cards = soup.select('div.col-xs-12.col-sm-4.col-md-3.col-lg-3')
+
+        seen_cards = set()
+
+        for card in video_cards:
+            link = card.find('a', href=re.compile(r'view_video\.php\?viewkey='))
+            if not link:
+                continue
            href = link.get('href', '')
            if not href:
                continue
-            
+
            # 提取 viewkey
            match = re.search(r'viewkey=([^&]+)', href)
            if not match:
                continue
            viewkey = match.group(1)
-            
-            if viewkey in seen_viewkeys:
-                continue
-            seen_viewkeys.add(viewkey)
-            
+
            detail_url = urljoin(BASE_URL, href)
-            
+
            # 提取标题
            title = self._extract_title(link)
-            
-            # 提取封面图URL
+
+            # 提取列表卡片来源 ID 和封面图 URL
            thumb_url = ""
+            source_id = ""
+            overlay = link.find(id=re.compile(r'^playvthumb_\d+$'))
+            if overlay:
+                source_id = overlay.get('id', '').rsplit('_', 1)[-1]
            img = link.find('img', class_=re.compile(r'img-responsive'))
            if img:
                thumb_url = img.get('src', '') or img.get('data-original', '')
                if thumb_url:
                    thumb_url = urljoin(BASE_URL, thumb_url)
-            
+            if not source_id and thumb_url:
+                source_id = self._extract_thumb_source_id(thumb_url)
+
+            card_key = source_id or detail_url
+            if card_key in seen_cards:
+                continue
+            seen_cards.add(card_key)
+
            videos.append({
                "title": title,
                "detail_url": detail_url,
                "thumb_url": thumb_url,
-                "viewkey": viewkey
+                "viewkey": viewkey,
+                "source_id": source_id
            })
-        
+
        return videos
-    
+
    def _extract_title(self, link) -> str:
        """
        从视频链接标签中提取并清理标题
@@ -369,12 +383,12 @@ class Porn91Spider:
            title = title_el.get_text(strip=True)
            if title:
                return html.unescape(title)
-        
+
        # 备用: 从 link 的 title 属性提取
        title = link.get('title', '').strip()
        if title:
            return html.unescape(title)
-        
+
        # 最后手段: 从链接文本提取并清理前缀
        text = link.get_text(separator=' ', strip=True)
        # 去掉前缀: "HD" / "91" / 时间戳 "HH:MM:SS"
@@ -385,13 +399,17 @@ class Porn91Spider:
    def parse_detail_page(self, html: str) -> dict:
        """
        解析详情页，提取视频直链
-        返回: {"video_url": "..."} 或空字典
+        返回: {"video_url": "...", "source_id": "...", "title": "..."} 或空字典
        """
        result = {}
-        
+
        if not html:
            return result
-        
+
+        title = self._extract_detail_title(html)
+        if title:
+            result["title"] = title
+
        # 方法1: 解码 strencode2 (主要方式, 页面通过 document.write 动态写入 video 标签)
        # 格式: document.write(strencode2("%3c%73%6f..."));
        strencode_match = re.search(r'strencode2\(["\']([^"\']+)["\']\)', html)
@@ -400,7 +418,7 @@ class Porn91Spider:
            try:
                # strencode2 在JS中等价于 unescape / decodeURIComponent
                decoded = unquote(encoded)
-                
+
                # 从解码后的 HTML 片段中提取 src
                src_match = re.search(r"src=['\"]([^'\"]+)['\"]", decoded)
                if src_match:
@@ -408,10 +426,11 @@ class Porn91Spider:
                    # 规范化双斜杠 (如 https://host//path -> https://host/path)
                    video_url = re.sub(r'(https?://[^/]+)//+', r'\1/', video_url)
                    result["video_url"] = video_url
+                    result["source_id"] = self._extract_source_id(video_url)
                    return result
            except Exception as e:
                self.log(f"  解码 strencode2 失败: {e}")
-        
+
        # 方法2: 通用正则匹配页面中的 mp4 链接 (备用, 过滤广告)
        mp4_match = re.search(
            r'https?://[^\s"\'<>]+\.mp4[^\s"\'<>]*',
@@ -421,10 +440,53 @@ class Porn91Spider:
            url = mp4_match.group(0)
            if 'kwai' not in url and 'ad-' not in url.lower():
                result["video_url"] = url
+                result["source_id"] = self._extract_source_id(url)
                return result
-        
+
        return result

+    def _extract_detail_title(self, html_text: str) -> str:
+        soup = BeautifulSoup(html_text, 'lxml')
+        title_el = soup.find('title')
+        if not title_el:
+            return ""
+        title = title_el.get_text(" ", strip=True)
+        title = re.sub(r'\s*-\s*91porn.*$', '', title, flags=re.IGNORECASE).strip()
+        return html.unescape(title)[:160]
+
+    def _extract_source_id(self, video_url: str) -> str:
+        path = urlparse(video_url or "").path
+        name = os.path.basename(path)
+        stem, ext = os.path.splitext(name)
+        if ext.lower() not in {".mp4", ".m4v", ".mov", ".webm", ".mkv", ".avi"}:
+            return ""
+        source_id = re.sub(r'[^0-9]+', '', stem)
+        if not source_id or source_id != stem:
+            return ""
+        return source_id
+
+    def _extract_thumb_source_id(self, thumb_url: str) -> str:
+        path = urlparse(thumb_url or "").path
+        match = re.search(r'/thumb/(\d+)\.[A-Za-z0-9]+$', path)
+        return match.group(1) if match else ""
+
+    def _thumb_url_for_source(self, thumb_url: str, source_id: str) -> str:
+        if not thumb_url or not source_id:
+            return thumb_url
+        parsed = urlparse(thumb_url)
+        match = re.search(r'/thumb/([^/?#]+)\.[A-Za-z0-9]+$', parsed.path)
+        if not match:
+            return thumb_url
+        current = match.group(1)
+        if current == source_id:
+            return thumb_url
+        path = re.sub(
+            r'/thumb/[^/?#]+\.[A-Za-z0-9]+$',
+            f'/thumb/{source_id}.jpg',
+            parsed.path,
+        )
+        return parsed._replace(path=path, query="", fragment="").geturl()
+
    def crawl(self):
        """
        主爬取流程。停止条件（任一满足即停）：
@@ -468,19 +530,19 @@ class Porn91Spider:
            if crawled_in_session > 0:
                self.log("")
                self.random_sleep(MIN_PAGE_DELAY, MAX_PAGE_DELAY)
-            
+
            self.log(f"[页 {page_num}] 请求: {page_url}")
            page_html = self.fetch_page(page_url, f"列表页 第{page_num}页")
-            
+
            if not page_html:
                self.log(f"[页 {page_num}] 获取失败，跳过")
                consecutive_empty += 1
                page_num += 1
                crawled_in_session += 1
                continue
-            
+
            page_videos = self.parse_list_page(page_html)
-            
+
            # 判断页面是否真的没有视频（而非全部已处理）
            if not page_videos:
                self.log(f"[页 {page_num}] 页面无视频，可能已到末尾")
@@ -488,24 +550,24 @@ class Porn91Spider:
                page_num += 1
                crawled_in_session += 1
                continue
-            
+
            consecutive_empty = 0
-            
+
            # 过滤已处理的 viewkey，只保留新视频
            new_videos = [v for v in page_videos if v['viewkey'] not in self.skip_viewkeys]
            skipped_on_page = len(page_videos) - len(new_videos)
-            
+
            if skipped_on_page > 0:
                self.log(f"[页 {page_num}] 发现 {len(page_videos)} 个链接, 其中 {skipped_on_page} 个已处理, {len(new_videos)} 个新视频")
            else:
                self.log(f"[页 {page_num}] 发现 {len(new_videos)} 个视频")
-            
+
            if new_videos:
                self._process_video_list(new_videos, referer=page_url)
            self.pages_crawled += 1
            page_num += 1
            crawled_in_session += 1
-        
+
        self._save_results()
        self._print_summary()

@@ -522,16 +584,16 @@ class Porn91Spider:
                self.log(f"  [SKIP] 已处理过: {video['viewkey']}")
                self.skipped_videos += 1
                continue
-            
+
            self.log(f"  处理视频 {idx}/{len(videos)}: {video['title'][:40]}...")
-            
+
            # 延时控制 (同一批次内第一个视频不延时)
            if idx > 1:
                self.random_sleep(MIN_DETAIL_DELAY, MAX_DETAIL_DELAY)
-            
+
            # 获取详情页
            detail_html = self.fetch_page(video['detail_url'], f"详情页 viewkey={video['viewkey']}", referer=referer)
-            
+
            if not detail_html:
                self.log(f"  [FAIL] 详情页获取失败: {video['viewkey']}")
                video["video_url"] = ""
@@ -539,14 +601,39 @@ class Porn91Spider:
                self.skip_viewkeys.add(video['viewkey'])
                self.failed_videos += 1
                continue
-            
+
            # 解析视频直链
            detail_info = self.parse_detail_page(detail_html)
-            
+
            if detail_info.get("video_url"):
                video["video_url"] = detail_info["video_url"]
+                if detail_info.get("title"):
+                    video["title"] = detail_info["title"]
+                list_source_id = video.get("source_id", "")
+                detail_source_id = detail_info.get("source_id", "")
+                if list_source_id and detail_source_id and list_source_id != detail_source_id:
+                    self.log(
+                        f"  [FAIL] 详情页视频源不匹配: list_source_id={list_source_id} "
+                        f"detail_source_id={detail_source_id} viewkey={video['viewkey']}"
+                    )
+                    self.failed_videos += 1
+                    self.skip_viewkeys.add(video['viewkey'])
+                    continue
+                if not list_source_id and detail_source_id:
+                    video["source_id"] = detail_source_id
+                if video.get("source_id"):
+                    video["thumb_url"] = self._thumb_url_for_source(
+                        video.get("thumb_url", ""),
+                        video["source_id"],
+                    )
+                    if video["source_id"] in self.skip_viewkeys:
+                        self.log(f"  [SKIP] 已处理过 source_id: {video['source_id']}")
+                        self.skipped_videos += 1
+                        continue
                self.results.append(video)
                self.skip_viewkeys.add(video['viewkey'])
+                if video.get("source_id"):
+                    self.skip_viewkeys.add(video["source_id"])
                self.processed_videos += 1
                self.log(f"  [OK] 成功提取视频直链")
                # 流式：立刻把这条 entry 交给 Go 端开始下载，不等本批余下视频
@@ -572,7 +659,7 @@ class Porn91Spider:
            "failed": self.failed_videos,
            "videos": self.results
        }
-        
+
        try:
            # 保证父目录存在；写入临时文件后原子 rename，避免读到半截 JSON
            out_path = self.output_file
@@ -662,9 +749,9 @@ def main():
    parser.add_argument("--quiet", action="store_true",
                        help="压缩日志，每条视频只输出关键事件")
    parser.add_argument("--target-new", type=int, default=None,
-                        help="目标新增模式：从 page 1 起翻页直到累计处理这么多新 viewkey 后停止（backend 凌晨任务用）")
+                        help="目标新增模式：从 page 1 起翻页直到累计处理这么多新源视频后停止（backend 凌晨任务用）")
    parser.add_argument("--seen-viewkeys-file", type=str, default=None,
-                        help="文件路径，每行一个已处理过的 viewkey；脚本会跳过这些视频不再请求详情页")
+                        help="文件路径，每行一个已处理过的 viewkey 或 mp4 源 ID；脚本会跳过这些视频")
    parser.add_argument("--stream-output", action="store_true",
                        help="流式模式：每解析一条视频直链就立即把它作为一行 JSON 写到 stdout 并 flush；"
                             "日志改走 stderr。配合 backend 边读边下载使用。")
@@ -678,7 +765,7 @@ def main():
 按 Ctrl+C 可随时中断并保存进度
 """)

-    # 加载已知 viewkey（来自 backend 的 catalog 已入库列表）
+    # 加载已知 ID（来自 backend 的 catalog 已入库列表；兼容旧参数名）
    seen_viewkeys = []
    if args.seen_viewkeys_file:
        try:
@@ -2,6 +2,8 @@ package main

 import (
 	"context"
+	"database/sql"
+	"errors"
 	"fmt"
 	"log"
 	"net/http"
@@ -279,11 +281,16 @@ type App struct {
 // 决定。任何"是否入队 preview worker"的判断都应通过这个方法读，避免把状态
 // 散落到 App 内存里和 DB 不一致。
 //
-// 读 catalog 失败时退化成 false（不生成）：比 "默认开" 更安全 —— 读不到状态时
-// 倾向不消耗 ffmpeg；调用方会记日志，运维能立刻看到问题。
+// local-upload 是内置盘，不一定有 catalog.drives 行；缺省按开启处理。
+//
+// 其它 drive 读 catalog 失败时退化成 false（不生成）：比 "默认开" 更安全 —— 读不到
+// 状态时倾向不消耗 ffmpeg；调用方会记日志，运维能立刻看到问题。
 func (a *App) teaserEnabledForDrive(ctx context.Context, driveID string) bool {
 	d, err := a.cat.GetDrive(ctx, driveID)
 	if err != nil {
+		if driveID == localupload.DriveID && errors.Is(err, sql.ErrNoRows) {
+			return true
+		}
 		log.Printf("[preview] read teaser_enabled drive=%s: %v (treating as disabled)", driveID, err)
 		return false
 	}
@@ -46,9 +46,9 @@ func TestRegisterPreviewWorkerBackfillsPendingWhenDriveTeaserEnabled(t *testing.
 	}

 	app := &App{
-		cat:            cat,
-		workers:        make(map[string]*preview.Worker),
-		thumbWorkers:   make(map[string]*preview.ThumbWorker),
+		cat:          cat,
+		workers:      make(map[string]*preview.Worker),
+		thumbWorkers: make(map[string]*preview.ThumbWorker),
 	}
 	worker := preview.NewWorker(&serverFakeTeaserGenerator{}, cat, &serverFakeDrive{})
 	go worker.Run(ctx)
@@ -106,9 +106,9 @@ func TestRegisterPreviewWorkersGenerateThumbnailsBeforePreviews(t *testing.T) {
 	}

 	app := &App{
-		cat:            cat,
-		workers:        make(map[string]*preview.Worker),
-		thumbWorkers:   make(map[string]*preview.ThumbWorker),
+		cat:          cat,
+		workers:      make(map[string]*preview.Worker),
+		thumbWorkers: make(map[string]*preview.ThumbWorker),
 	}
 	gen := &serverFakeTeaserGenerator{}
 	drv := &serverFakeDrive{}
@@ -194,9 +194,9 @@ func TestFailedThumbnailsDoNotBlockPreviewGeneration(t *testing.T) {
 	}

 	app := &App{
-		cat:            cat,
-		workers:        make(map[string]*preview.Worker),
-		thumbWorkers:   make(map[string]*preview.ThumbWorker),
+		cat:          cat,
+		workers:      make(map[string]*preview.Worker),
+		thumbWorkers: make(map[string]*preview.ThumbWorker),
 	}
 	gen := &serverFakeTeaserGenerator{}
 	drv := &serverFakeDrive{}
@@ -261,9 +261,9 @@ func TestRegenFailedPreviewsQueuesOnlyFailedVideosForDrive(t *testing.T) {
 	}

 	app := &App{
-		cat:            cat,
-		workers:        make(map[string]*preview.Worker),
-		thumbWorkers:   make(map[string]*preview.ThumbWorker),
+		cat:          cat,
+		workers:      make(map[string]*preview.Worker),
+		thumbWorkers: make(map[string]*preview.ThumbWorker),
 	}
 	worker := preview.NewWorker(&serverFakeTeaserGenerator{}, cat, &serverFakeDrive{})
 	go worker.Run(ctx)
@@ -311,7 +311,7 @@ func TestRegenFailedPreviewsQueuesOnlyFailedVideosForDrive(t *testing.T) {
 	}
 }

-func TestEnqueueUploadedVideoQueuesLocalPreviewWorker(t *testing.T) {
+func TestEnqueueUploadedVideoQueuesLocalGenerationByDefault(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()

@@ -325,7 +325,6 @@ func TestEnqueueUploadedVideoQueuesLocalPreviewWorker(t *testing.T) {
 		}
 	})

-	seedDriveWithTeaser(t, cat, "local-upload", true)
 	video := &catalog.Video{
 		ID:            "local-upload-video",
 		DriveID:       "local-upload",
@@ -341,14 +340,19 @@ func TestEnqueueUploadedVideoQueuesLocalPreviewWorker(t *testing.T) {
 	}

 	app := &App{
-		cat:            cat,
-		workers:        make(map[string]*preview.Worker),
-		thumbWorkers:   make(map[string]*preview.ThumbWorker),
+		cat:          cat,
+		workers:      make(map[string]*preview.Worker),
+		thumbWorkers: make(map[string]*preview.ThumbWorker),
 	}
-	worker := preview.NewWorker(&serverFakeTeaserGenerator{}, cat, &serverLocalUploadFakeDrive{})
+	gen := &serverFakeTeaserGenerator{}
+	drv := &serverLocalUploadFakeDrive{}
+	worker := preview.NewWorker(gen, cat, drv)
+	thumbWorker := preview.NewThumbWorker(gen, cat, drv)
 	go worker.Run(ctx)
+	go thumbWorker.Run(ctx)
 	app.mu.Lock()
 	app.workers["local-upload"] = worker
+	app.thumbWorkers["local-upload"] = thumbWorker
 	app.mu.Unlock()

 	app.enqueueUploadedVideo(ctx, video)
@@ -359,10 +363,13 @@ func TestEnqueueUploadedVideoQueuesLocalPreviewWorker(t *testing.T) {
 		if err != nil {
 			t.Fatalf("get video: %v", err)
 		}
-		if got.PreviewStatus == "ready" {
+		if got.PreviewStatus == "ready" && got.ThumbnailURL != "" {
 			if got.PreviewLocal != "/tmp/local-upload-video.mp4" {
 				t.Fatalf("preview local = %q, want generated local teaser path", got.PreviewLocal)
 			}
+			if got.ThumbnailURL != "/p/thumb/local-upload-video" {
+				t.Fatalf("thumbnail url = %q, want generated thumbnail URL", got.ThumbnailURL)
+			}
 			return
 		}
 		time.Sleep(10 * time.Millisecond)
@@ -372,7 +379,7 @@ func TestEnqueueUploadedVideoQueuesLocalPreviewWorker(t *testing.T) {
 	if err != nil {
 		t.Fatalf("get video after timeout: %v", err)
 	}
-	t.Fatalf("preview status = %q, want ready", got.PreviewStatus)
+	t.Fatalf("preview status = %q, thumbnail url = %q; want generated local teaser and thumbnail", got.PreviewStatus, got.ThumbnailURL)
 }

 func TestShouldScanDriveSkipsLocalUpload(t *testing.T) {
@@ -543,7 +550,6 @@ type serverLocalUploadFakeDrive struct {

 func (d *serverLocalUploadFakeDrive) ID() string { return "local-upload" }

-
 // seedDriveWithTeaser 在 catalog 里 upsert 一个测试用的 drive 行，把 TeaserEnabled
 // 设为 enabled。teaser 入队判断现在按 per-drive 而不是全局 setting，所以涉及到
 // teaser worker 的测试都要先把 drive 行写进 catalog。
@@ -634,8 +634,8 @@ func (s *Server) handleUploadedVideo(w http.ResponseWriter, r *http.Request) {
 }

 // handleSpider91Video 服务 spider91 drive 下载到本地的视频文件。
-// 路径形如 /p/spider91/<videoID>，videoID = "spider91-<driveID>-<viewkey>"。
-// 通过 catalog 拿到 file_id（"<viewkey>.mp4"），再让 driver 解析到绝对路径并 ServeFile。
+// 路径形如 /p/spider91/<videoID>，videoID = "spider91-<driveID>-<sourceID>"。
+// 通过 catalog 拿到 file_id（"<sourceID>.mp4"），再让 driver 解析到绝对路径并 ServeFile。
 func (s *Server) handleSpider91Video(w http.ResponseWriter, r *http.Request) {
 	videoID := chi.URLParam(r, "videoID")
 	v, err := s.Catalog.GetVideo(r.Context(), videoID)
@@ -172,7 +172,7 @@ func (c *Catalog) HideVideo(ctx context.Context, id string) error {

 // MigrateVideoToDrive 把 catalog 里 id=videoID 这条视频迁移到另一个 drive。
 // 用于 spider91 → PikPak 的迁移：上传成功后改写 drive_id / file_id /
-// content_hash，保留视频自身的 id（spider91-<driveID>-<viewkey>），这样
+// content_hash，保留视频自身的 id（spider91-<driveID>-<sourceID>），这样
 // 关联表 (video_tags / 收藏 / 点赞) 都不需要动。
 //
 // scanner 后续看到 PikPak 目录下相同 hash / file_name 的文件时，会通过
@@ -555,7 +555,7 @@ func (c *Catalog) ListVideosByDrive(ctx context.Context, driveID string) ([]*Vid
 }

 // ListVideoFileIDsByDrive 只返回某 drive 下所有视频的 file_id 集合，
-// 比 ListVideosByDrive 轻量，spider91 crawler 用它把已知 viewkey 列表喂给 python 脚本。
+// 比 ListVideosByDrive 轻量。
 func (c *Catalog) ListVideoFileIDsByDrive(ctx context.Context, driveID string) ([]string, error) {
 	rows, err := c.db.QueryContext(ctx,
 		`SELECT file_id FROM videos WHERE drive_id = ? AND file_id != ''`,
@@ -575,16 +575,17 @@ func (c *Catalog) ListVideoFileIDsByDrive(ctx context.Context, driveID string) (
 	return out, rows.Err()
 }

-// ListSpider91Viewkeys 列出某个 spider91 drive 历史上爬过的所有 viewkey。
+// ListSpider91Viewkeys 列出某个 spider91 drive 历史上爬过的所有 ID 后缀。
+// 函数名保留历史叫法；新 spider91 数据的后缀是 91 mp4 源 ID，不再是 viewkey。
 //
 // 不能再用 ListVideoFileIDsByDrive：那个只看 drive_id，但 spider91 视频
 // 一旦被 spider91migrate 迁移到 PikPak，drive_id 就变成 PikPak 了。
 //
 // 这里按 video.id 前缀 "spider91-<driveID>-" 查，即使迁移后视频也仍能被
-// 找到——id 本身永远是 "spider91-<driveID>-<viewkey>"。
+// 找到——id 本身会保留 "spider91-<driveID>-<sourceID>" 这个来源前缀。
 //
-// 用途：crawler 把这个集合写到 seen 文件，让 Python 跳过已爬过的 viewkey
-// 详情页，配合 --target-new 真正凑出 N 个未爬过的视频。
+// 用途：crawler 把这个集合写到 seen 文件，让 Python/Go 跳过已爬过的视频，
+// 配合 --target-new 真正凑出 N 个未爬过的视频。
 func (c *Catalog) ListSpider91Viewkeys(ctx context.Context, driveID string) ([]string, error) {
 	prefix := "spider91-" + driveID + "-"
 	rows, err := c.db.QueryContext(ctx,
@@ -14,6 +14,7 @@ import (
 	"os/exec"
 	"path"
 	"path/filepath"
+	"regexp"
 	"strings"
 	"sync"
 	"time"
@@ -113,7 +114,7 @@ type CrawlResult struct {
 	Skipped int
 	// Failed 是下载或入库失败的条数。
 	Failed int
-	// SeenSnapshot 调用 Python 时实际写出的已知 viewkey 数量。
+	// SeenSnapshot 调用 Python 时实际写出的已知视频 ID 数量。
 	SeenSnapshot int
 	StartedAt    time.Time
 	FinishedAt   time.Time
@@ -127,11 +128,12 @@ type spiderVideoEntry struct {
 	ThumbURL  string `json:"thumb_url"`
 	VideoURL  string `json:"video_url"`
 	Viewkey   string `json:"viewkey"`
+	SourceID  string `json:"source_id"`
 	DetailURL string `json:"detail_url"`
 }

 // RunOnce 执行一次"跑爬虫 → 下载 → 入库"流程：
-//  1. 从 catalog 拉取本 drive 已存在的 viewkey 列表，写到临时文件
+//  1. 从 catalog 拉取本 drive 已存在的 91 源视频 ID 列表，写到临时文件
 //  2. 启动 Python 爬虫（--target-new + --seen-viewkeys-file + --stream-output），
 //     Python 每解析出一个 video 直链就把 entry 当作一行 JSON 写到 stdout。
 //  3. Go 端 bufio.Scanner 按行读：每行立即下载视频和封面、入库。
@@ -168,7 +170,7 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
 	result := &CrawlResult{TargetNew: targetNew, StartedAt: time.Now()}
 	defer func() { result.FinishedAt = time.Now() }()

-	// 1. 准备 .crawl/ 目录 + 已知 viewkey 列表
+	// 1. 准备 .crawl/ 目录 + 已知源视频 ID 列表
 	//
 	// 关键：路径必须用绝对路径，因为 Python 子进程的 cwd 我们设成了脚本所在目录
 	// （为了让 Python 用 site-packages 里的 requests 等），传相对路径会被 Python
@@ -222,8 +224,8 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
 			continue
 		}
 		result.TotalEntries++
-		viewkey := strings.TrimSpace(item.Viewkey)
-		if viewkey == "" || strings.TrimSpace(item.VideoURL) == "" {
+		sourceID := sourceIDForItem(item)
+		if sourceID == "" || strings.TrimSpace(item.VideoURL) == "" {
 			result.Failed++
 			continue
 		}
@@ -231,13 +233,13 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
 			// Python 侧已用 target_new 控制；这里再兜底防止脚本异常多输出
 			break
 		}
-		videoID := buildVideoID(c.cfg.Driver.ID(), viewkey)
+		videoID := buildVideoID(c.cfg.Driver.ID(), sourceID)
 		if existing, _ := c.cfg.Catalog.GetVideo(ctx, videoID); existing != nil {
 			result.Skipped++
 			continue
 		}
 		if perr := c.processOne(ctx, videoID, item); perr != nil {
-			log.Printf("[spider91] drive=%s viewkey=%s failed: %v", c.cfg.Driver.ID(), viewkey, perr)
+			log.Printf("[spider91] drive=%s viewkey=%s source_id=%s failed: %v", c.cfg.Driver.ID(), item.Viewkey, sourceID, perr)
 			result.Failed++
 			continue
 		}
@@ -256,24 +258,24 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
 	return result, nil
 }

-// writeSeenViewkeys 把当前 drive 下已入库的 viewkey 写到 path，供 Python 脚本读取。
+// writeSeenViewkeys 把当前 drive 下已入库的 91 源视频 ID 写到 path，供 Python 脚本读取。
 //
 // 注意：不能用 ListVideoFileIDsByDrive（按 drive_id 查），因为 spider91
 // 视频被 spider91migrate 迁移到 PikPak 后 drive_id 已经不再是这个 drive。
 // 改用 ListSpider91Viewkeys：它按 video.id 前缀（"spider91-<driveID>-"）查，
-// 不受迁移影响——id 永远是 spider91-<driveID>-<viewkey>。
+// 不受迁移影响。函数名保留历史叫法，实际返回的是 ID 后缀；新数据使用 mp4 源 ID。
 func (c *Crawler) writeSeenViewkeys(ctx context.Context, path string) (int, error) {
-	viewkeys, err := c.cfg.Catalog.ListSpider91Viewkeys(ctx, c.cfg.Driver.ID())
+	seenIDs, err := c.cfg.Catalog.ListSpider91Viewkeys(ctx, c.cfg.Driver.ID())
 	if err != nil {
 		return 0, err
 	}
-	seen := make(map[string]struct{}, len(viewkeys))
-	for _, vk := range viewkeys {
-		vk = strings.TrimSpace(vk)
-		if vk == "" {
+	seen := make(map[string]struct{}, len(seenIDs))
+	for _, id := range seenIDs {
+		id = strings.TrimSpace(id)
+		if id == "" {
 			continue
 		}
-		seen[vk] = struct{}{}
+		seen[id] = struct{}{}
 	}

 	tmp := path + ".part"
@@ -281,8 +283,8 @@ func (c *Crawler) writeSeenViewkeys(ctx context.Context, path string) (int, erro
 	if err != nil {
 		return 0, err
 	}
-	for viewkey := range seen {
-		if _, err := f.WriteString(viewkey + "\n"); err != nil {
+	for id := range seen {
+		if _, err := f.WriteString(id + "\n"); err != nil {
 			_ = f.Close()
 			_ = os.Remove(tmp)
 			return 0, err
@@ -356,15 +358,30 @@ func forwardSpiderLog(driveID string, r io.Reader) {
 	}
 }

-// processOne 处理单个 viewkey：下载视频 + 封面 + 复制封面 + 入库。
+// processOne 处理单个 91 源视频：下载视频 + 封面 + 复制封面 + 入库。
 // 任一步失败会清理已写入的临时文件，不留半成品。
 func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVideoEntry) error {
 	viewkey := item.Viewkey
+	sourceID := sourceIDForItem(item)
+	if sourceID == "" {
+		return errors.New("empty numeric source id")
+	}
+
+	videoURL := strings.TrimSpace(item.VideoURL)
+	videoSourceID := sourceIDFromVideoURL(videoURL)
+	if videoSourceID == "" {
+		return fmt.Errorf("video url has no numeric source id: %s", videoURL)
+	}
+	if videoSourceID != sourceID {
+		return fmt.Errorf("video source id mismatch: got %s want %s", videoSourceID, sourceID)
+	}
+	thumbURL := normalizeThumbURLForSource(item.ThumbURL, sourceID)
+
 	// 视频文件后缀按直链 URL 真实后缀来定，避免直链返回的不是 mp4 时存错容器。
-	videoExt := detectVideoExt(item.VideoURL)
-	videoFile := viewkey + videoExt
+	videoExt := detectVideoExt(videoURL)
+	videoFile := sourceID + videoExt
 	// 封面后缀同理，但 91porn 的封面绝大多数是 jpg；URL 提示其它格式时尊重之。
-	thumbFile := viewkey + detectThumbExt(item.ThumbURL)
+	thumbFile := sourceID + detectThumbExt(thumbURL)

 	videoPath, err := c.cfg.Driver.VideoPath(videoFile)
 	if err != nil {
@@ -376,10 +393,7 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
 	}

 	// 视频先下载（必须）；失败直接退出。
-	dlCtx, cancel := context.WithTimeout(ctx, c.cfg.DownloadTimeout)
-	defer cancel()
-
-	videoSize, err := c.downloadAtomic(dlCtx, item.VideoURL, videoPath, item.DetailURL)
+	videoSize, err := c.downloadVideoAtomicWithRefresh(ctx, item, videoPath, videoURL, sourceID)
 	if err != nil {
 		return fmt.Errorf("download video: %w", err)
 	}
@@ -388,9 +402,12 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
 	// thumbnail_status 显式标 'failed'（spider91 drive 的 thumb worker 按设计
 	// 不处理 spider91 视频，没人能"兜底"）。
 	thumbReady := false
-	if strings.TrimSpace(item.ThumbURL) != "" {
-		if _, err := c.downloadAtomic(dlCtx, item.ThumbURL, thumbPath, item.DetailURL); err != nil {
-			log.Printf("[spider91] drive=%s viewkey=%s thumb download failed: %v", c.cfg.Driver.ID(), viewkey, err)
+	if strings.TrimSpace(thumbURL) != "" {
+		thumbCtx, cancel := c.downloadAttemptContext(ctx)
+		_, err := c.downloadAtomic(thumbCtx, thumbURL, thumbPath, item.DetailURL)
+		cancel()
+		if err != nil {
+			log.Printf("[spider91] drive=%s viewkey=%s source_id=%s thumb download failed: %v", c.cfg.Driver.ID(), viewkey, sourceID, err)
 		} else {
 			thumbReady = true
 		}
@@ -404,7 +421,7 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
 		} else {
 			dst := filepath.Join(c.cfg.CommonThumbDir, videoID+".jpg")
 			if err := copyFileAtomic(thumbPath, dst); err != nil {
-				log.Printf("[spider91] drive=%s viewkey=%s copy thumb to common dir: %v", c.cfg.Driver.ID(), viewkey, err)
+				log.Printf("[spider91] drive=%s viewkey=%s source_id=%s copy thumb to common dir: %v", c.cfg.Driver.ID(), viewkey, sourceID, err)
 				thumbReady = false
 			}
 		}
@@ -429,7 +446,7 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
 		UpdatedAt:     now,
 	}
 	if v.Title == "" {
-		v.Title = viewkey
+		v.Title = sourceID
 	}
 	if thumbReady {
 		// 设了 ThumbnailURL 后 thumb worker 会跳过这条视频，
@@ -455,10 +472,56 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
 	if c.cfg.OnNewVideo != nil {
 		c.cfg.OnNewVideo(v)
 	}
-	log.Printf("[spider91] drive=%s viewkey=%s ok title=%q size=%d", c.cfg.Driver.ID(), viewkey, v.Title, v.Size)
+	log.Printf("[spider91] drive=%s viewkey=%s source_id=%s ok title=%q size=%d", c.cfg.Driver.ID(), viewkey, sourceID, v.Title, v.Size)
 	return nil
 }

+func (c *Crawler) downloadVideoAtomicWithRefresh(ctx context.Context, item spiderVideoEntry, dst, firstURL, expectedSourceID string) (int64, error) {
+	videoURL := strings.TrimSpace(firstURL)
+	if videoURL == "" {
+		videoURL = strings.TrimSpace(item.VideoURL)
+	}
+	var lastErr error
+	for attempt := 1; attempt <= 3; attempt++ {
+		attemptCtx, cancel := c.downloadAttemptContext(ctx)
+		size, err := c.downloadAtomic(attemptCtx, videoURL, dst, item.DetailURL)
+		cancel()
+		if err == nil {
+			return size, nil
+		}
+		lastErr = err
+		if ctx.Err() != nil || !shouldRefreshSpider91VideoURL(err) {
+			return 0, err
+		}
+		fresh, refreshErr := c.resolveFreshVideoURL(ctx, item)
+		if refreshErr != nil {
+			return 0, fmt.Errorf("%w; refresh video url: %v", err, refreshErr)
+		}
+		if fresh == "" || fresh == videoURL {
+			return 0, err
+		}
+		freshSourceID := sourceIDFromVideoURL(fresh)
+		if freshSourceID == "" {
+			return 0, fmt.Errorf("%w; refreshed video url has no numeric source id: %s", err, fresh)
+		}
+		if expectedSourceID != "" && freshSourceID != expectedSourceID {
+			return 0, fmt.Errorf("%w; refreshed video source id mismatch: got %s want %s", err, freshSourceID, expectedSourceID)
+		}
+		_ = os.Remove(dst + ".part")
+		log.Printf("[spider91] drive=%s viewkey=%s source_id=%s download attempt=%d failed (%v); refreshed video url and retrying",
+			c.cfg.Driver.ID(), item.Viewkey, expectedSourceID, attempt, err)
+		videoURL = fresh
+	}
+	return 0, lastErr
+}
+
+func (c *Crawler) downloadAttemptContext(ctx context.Context) (context.Context, context.CancelFunc) {
+	if c.cfg.DownloadTimeout <= 0 {
+		return ctx, func() {}
+	}
+	return context.WithTimeout(ctx, c.cfg.DownloadTimeout)
+}
+
 // downloadAtomic 下载 url 到 dst，先写到 dst.part 再 rename，避免半截文件。
 // 返回最终文件大小。
 func (c *Crawler) downloadAtomic(ctx context.Context, src, dst, referer string) (int64, error) {
@@ -487,7 +550,7 @@ func (c *Crawler) downloadAtomic(ctx context.Context, src, dst, referer string)
 	}
 	defer resp.Body.Close()
 	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
-		return 0, fmt.Errorf("http %d", resp.StatusCode)
+		return 0, &downloadHTTPError{StatusCode: resp.StatusCode}
 	}

 	tmp := dst + ".part"
@@ -516,6 +579,289 @@ func (c *Crawler) downloadAtomic(ctx context.Context, src, dst, referer string)
 	return written, nil
 }

+type downloadHTTPError struct {
+	StatusCode int
+}
+
+func (e *downloadHTTPError) Error() string {
+	if e == nil {
+		return "http error"
+	}
+	return fmt.Sprintf("http %d", e.StatusCode)
+}
+
+func shouldRefreshSpider91VideoURL(err error) bool {
+	if err == nil {
+		return false
+	}
+	if errors.Is(err, context.Canceled) {
+		return false
+	}
+	if errors.Is(err, context.DeadlineExceeded) {
+		return true
+	}
+	var httpErr *downloadHTTPError
+	if errors.As(err, &httpErr) {
+		switch httpErr.StatusCode {
+		case http.StatusForbidden, http.StatusNotFound, http.StatusGone, http.StatusRequestedRangeNotSatisfiable,
+			http.StatusTooManyRequests, http.StatusInternalServerError, http.StatusBadGateway, http.StatusServiceUnavailable, http.StatusGatewayTimeout:
+			return true
+		default:
+			return false
+		}
+	}
+	text := strings.ToLower(err.Error())
+	return strings.Contains(text, "unexpected eof") ||
+		strings.Contains(text, "connection reset") ||
+		strings.Contains(text, "connection refused") ||
+		strings.Contains(text, "broken pipe") ||
+		strings.Contains(text, "server closed") ||
+		strings.Contains(text, "timeout")
+}
+
+func (c *Crawler) resolveFreshVideoURL(ctx context.Context, item spiderVideoEntry) (string, error) {
+	detailURL := strings.TrimSpace(item.DetailURL)
+	if detailURL == "" {
+		return "", errors.New("empty detail url")
+	}
+	cookieHeader := "mode=d"
+	if warmURL := spider91ListURLForDetail(detailURL); warmURL != "" {
+		if cookies, err := c.fetchSpider91WarmCookies(ctx, warmURL, detailURL); err == nil {
+			cookieHeader = spider91CookieHeader(cookies)
+		} else {
+			log.Printf("[spider91] drive=%s viewkey=%s warm session failed: %v", c.cfg.Driver.ID(), item.Viewkey, err)
+		}
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, detailURL, nil)
+	if err != nil {
+		return "", err
+	}
+	req.Header.Set("User-Agent", downloadUA)
+	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9")
+	req.Header.Set("Cookie", cookieHeader)
+	resp, err := c.cfg.HTTPClient.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return "", &downloadHTTPError{StatusCode: resp.StatusCode}
+	}
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 4*1024*1024))
+	if err != nil {
+		return "", err
+	}
+	videoURL := parseSpider91VideoURL(string(body))
+	if videoURL == "" {
+		return "", errors.New("video url not found in detail page")
+	}
+	return videoURL, nil
+}
+
+func (c *Crawler) fetchSpider91WarmCookies(ctx context.Context, warmURL, referer string) ([]*http.Cookie, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, warmURL, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", downloadUA)
+	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9")
+	req.Header.Set("Cookie", "mode=d")
+	if referer != "" {
+		req.Header.Set("Referer", referer)
+	}
+	resp, err := c.cfg.HTTPClient.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 64*1024))
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return nil, &downloadHTTPError{StatusCode: resp.StatusCode}
+	}
+	return resp.Cookies(), nil
+}
+
+func spider91ListURLForDetail(detailURL string) string {
+	u, err := url.Parse(strings.TrimSpace(detailURL))
+	if err != nil || u == nil || u.Scheme == "" || u.Host == "" {
+		return ""
+	}
+	if !strings.Contains(strings.ToLower(u.Host), "91porn.com") {
+		return ""
+	}
+	q := u.Query()
+	page := strings.TrimSpace(q.Get("page"))
+	category := strings.TrimSpace(q.Get("category"))
+	viewtype := strings.TrimSpace(q.Get("viewtype"))
+	if page == "" || category == "" || viewtype == "" {
+		return ""
+	}
+	listURL := *u
+	listURL.Path = "/v.php"
+	listQuery := url.Values{}
+	listQuery.Set("category", category)
+	listQuery.Set("viewtype", viewtype)
+	listQuery.Set("page", page)
+	listURL.RawQuery = listQuery.Encode()
+	listURL.Fragment = ""
+	return listURL.String()
+}
+
+func spider91CookieHeader(cookies []*http.Cookie) string {
+	values := []string{"mode=d"}
+	seen := map[string]bool{"mode": true}
+	for _, cookie := range cookies {
+		if cookie == nil || strings.TrimSpace(cookie.Name) == "" || seen[cookie.Name] {
+			continue
+		}
+		seen[cookie.Name] = true
+		values = append(values, cookie.Name+"="+cookie.Value)
+	}
+	return strings.Join(values, "; ")
+}
+
+var (
+	strencode2RE = regexp.MustCompile(`strencode2\(["']([^"']+)["']\)`)
+	srcAttrRE    = regexp.MustCompile(`src=['"]([^'"]+)['"]`)
+	mp4URLRE     = regexp.MustCompile(`https?://[^\s"'<>]+\.mp4[^\s"'<>]*`)
+)
+
+func parseSpider91VideoURL(html string) string {
+	if html == "" {
+		return ""
+	}
+	if match := strencode2RE.FindStringSubmatch(html); len(match) == 2 {
+		if decoded, err := url.PathUnescape(match[1]); err == nil {
+			if src := srcAttrRE.FindStringSubmatch(decoded); len(src) == 2 {
+				return normalizeHTTPURLSlashes(src[1])
+			}
+		}
+	}
+	if match := mp4URLRE.FindString(html); match != "" {
+		lower := strings.ToLower(match)
+		if !strings.Contains(lower, "kwai") && !strings.Contains(lower, "ad-") {
+			return match
+		}
+	}
+	return ""
+}
+
+func normalizeHTTPURLSlashes(rawURL string) string {
+	u, err := url.Parse(strings.TrimSpace(rawURL))
+	if err != nil || u == nil || u.Scheme == "" || u.Host == "" {
+		return rawURL
+	}
+	for strings.Contains(u.Path, "//") {
+		u.Path = strings.ReplaceAll(u.Path, "//", "/")
+	}
+	return u.String()
+}
+
+func sourceIDForItem(item spiderVideoEntry) string {
+	if id := sanitizeSourceID(item.SourceID); isNumericSourceID(id) {
+		return id
+	}
+	if id := sourceIDFromVideoURL(item.VideoURL); id != "" {
+		return id
+	}
+	if id := sourceIDFromThumbURL(item.ThumbURL); id != "" {
+		return id
+	}
+	return ""
+}
+
+func sourceIDFromVideoURL(rawURL string) string {
+	u, err := url.Parse(strings.TrimSpace(rawURL))
+	if err != nil || u == nil {
+		return ""
+	}
+	base := path.Base(u.Path)
+	ext := strings.ToLower(path.Ext(base))
+	switch ext {
+	case ".mp4", ".m4v", ".mov", ".webm", ".mkv", ".avi", ".flv":
+	default:
+		return ""
+	}
+	id := sanitizeSourceID(strings.TrimSuffix(base, ext))
+	if !isNumericSourceID(id) {
+		return ""
+	}
+	return id
+}
+
+func sourceIDFromThumbURL(rawURL string) string {
+	u, err := url.Parse(strings.TrimSpace(rawURL))
+	if err != nil || u == nil {
+		return ""
+	}
+	base := path.Base(u.Path)
+	ext := strings.ToLower(path.Ext(base))
+	switch ext {
+	case ".jpg", ".jpeg", ".png", ".webp", ".gif":
+	default:
+		return ""
+	}
+	id := sanitizeSourceID(strings.TrimSuffix(base, ext))
+	if !isNumericSourceID(id) {
+		return ""
+	}
+	return id
+}
+
+func sanitizeSourceID(raw string) string {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return ""
+	}
+	var b strings.Builder
+	for _, r := range raw {
+		if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '_' || r == '-' {
+			b.WriteRune(r)
+		}
+	}
+	return b.String()
+}
+
+func isNumericSourceID(id string) bool {
+	if id == "" {
+		return false
+	}
+	for _, r := range id {
+		if r < '0' || r > '9' {
+			return false
+		}
+	}
+	return true
+}
+
+func normalizeThumbURLForSource(rawURL, sourceID string) string {
+	sourceID = sanitizeSourceID(sourceID)
+	if strings.TrimSpace(rawURL) == "" || sourceID == "" {
+		return rawURL
+	}
+	u, err := url.Parse(strings.TrimSpace(rawURL))
+	if err != nil || u == nil || u.Scheme == "" || u.Host == "" {
+		return rawURL
+	}
+	base := path.Base(u.Path)
+	ext := strings.ToLower(path.Ext(base))
+	switch ext {
+	case ".jpg", ".jpeg", ".png", ".webp", ".gif":
+	default:
+		return rawURL
+	}
+	dir := path.Dir(u.Path)
+	if dir == "." || dir == "/" || !strings.HasSuffix(dir, "/thumb") {
+		return rawURL
+	}
+	u.Path = path.Join(dir, sourceID+".jpg")
+	u.RawQuery = ""
+	u.Fragment = ""
+	return u.String()
+}
+
 // copyFileAtomic 把 src 复制到 dst，先写 .part 再 rename。
 func copyFileAtomic(src, dst string) error {
 	in, err := os.Open(src)
@@ -543,14 +889,14 @@ func copyFileAtomic(src, dst string) error {
 	return os.Rename(tmp, dst)
 }

-// BuildVideoID 给定 driveID + viewkey，按统一规则生成 catalog 中 videos.id。
+// BuildVideoID 给定 driveID + 91 源视频 ID，按统一规则生成 catalog 中 videos.id。
 // 与 scanner 用法一致：<kind>-<driveID>-<fileID>。
-func BuildVideoID(driveID, viewkey string) string {
-	return buildVideoID(driveID, viewkey)
+func BuildVideoID(driveID, sourceID string) string {
+	return buildVideoID(driveID, sourceID)
 }

-func buildVideoID(driveID, viewkey string) string {
-	return Kind + "-" + driveID + "-" + viewkey
+func buildVideoID(driveID, sourceID string) string {
+	return Kind + "-" + driveID + "-" + sourceID
 }

 // detectVideoExt 从直链 URL 推断视频文件后缀。
@@ -5,10 +5,12 @@ import (
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
+	"net/url"
 	"os"
 	"path/filepath"
 	"runtime"
 	"strings"
+	"sync/atomic"
 	"testing"
 	"time"

@@ -17,7 +19,7 @@ import (

 // TestCrawlerRunOnceFullFlow 用一个伪 python 脚本 + httptest 服务器
 // 把 Crawler.RunOnce 的完整流程跑一遍：脚本生成 JSON、下载视频和封面、入库、
-// 重复运行跳过已存在的 viewkey。
+// 重复运行跳过已存在的 91 源视频 ID。
 func TestCrawlerRunOnceFullFlow(t *testing.T) {
 	if runtime.GOOS == "windows" {
 		t.Skip("shell-based fake script only on unix")
@@ -28,16 +30,16 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
 	// 1. 假 HTTP 服务器：根据路径返回视频数据或封面数据
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		switch {
-		case strings.Contains(r.URL.Path, "video1.mp4"):
+		case strings.Contains(r.URL.Path, "120001.mp4"):
 			w.Header().Set("Content-Type", "video/mp4")
 			_, _ = w.Write([]byte("FAKEVIDEO1"))
-		case strings.Contains(r.URL.Path, "video2.mp4"):
+		case strings.Contains(r.URL.Path, "120002.mp4"):
 			w.Header().Set("Content-Type", "video/mp4")
 			_, _ = w.Write([]byte("FAKEVIDEO2BYTES"))
-		case strings.Contains(r.URL.Path, "thumb1.jpg"):
+		case strings.Contains(r.URL.Path, "/thumb/120001.jpg"):
 			w.Header().Set("Content-Type", "image/jpeg")
 			_, _ = w.Write([]byte("\xff\xd8\xff\xe0fakejpg1"))
-		case strings.Contains(r.URL.Path, "thumb2.jpg"):
+		case strings.Contains(r.URL.Path, "/thumb/120002.jpg"):
 			w.Header().Set("Content-Type", "image/jpeg")
 			_, _ = w.Write([]byte("\xff\xd8\xff\xe0fakejpg2"))
 		default:
@@ -52,15 +54,15 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
 	videoEntries := []map[string]string{
 		{
 			"title":      "Video One",
-			"thumb_url":  srv.URL + "/thumbs/thumb1.jpg",
-			"video_url":  srv.URL + "/videos/video1.mp4",
+			"thumb_url":  srv.URL + "/thumb/not-120001.jpg",
+			"video_url":  srv.URL + "/videos/120001.mp4",
 			"viewkey":    "vk-001",
 			"detail_url": srv.URL + "/v.php?viewkey=vk-001",
 		},
 		{
 			"title":      "Video Two",
-			"thumb_url":  srv.URL + "/thumbs/thumb2.jpg",
-			"video_url":  srv.URL + "/videos/video2.mp4",
+			"thumb_url":  srv.URL + "/thumb/not-120002.jpg",
+			"video_url":  srv.URL + "/videos/120002.mp4",
 			"viewkey":    "vk-002",
 			"detail_url": srv.URL + "/v.php?viewkey=vk-002",
 		},
@@ -128,29 +130,28 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {

 	// 5. 检查文件落盘
 	for _, item := range []struct {
-		viewkey  string
+		sourceID string
 		size     int64
-		thumbLen int
 	}{
-		{"vk-001", 10, 11},
-		{"vk-002", 15, 11},
+		{"120001", 10},
+		{"120002", 15},
 	} {
-		videoPath := filepath.Join(rootDir, "videos", item.viewkey+".mp4")
+		videoPath := filepath.Join(rootDir, "videos", item.sourceID+".mp4")
 		info, err := os.Stat(videoPath)
 		if err != nil {
-			t.Fatalf("video %s missing: %v", item.viewkey, err)
+			t.Fatalf("video %s missing: %v", item.sourceID, err)
 		}
 		if info.Size() != item.size {
-			t.Fatalf("video %s size = %d, want %d", item.viewkey, info.Size(), item.size)
+			t.Fatalf("video %s size = %d, want %d", item.sourceID, info.Size(), item.size)
 		}

-		thumbPath := filepath.Join(rootDir, "thumbs", item.viewkey+".jpg")
+		thumbPath := filepath.Join(rootDir, "thumbs", item.sourceID+".jpg")
 		if _, err := os.Stat(thumbPath); err != nil {
-			t.Fatalf("thumb %s missing: %v", item.viewkey, err)
+			t.Fatalf("thumb %s missing: %v", item.sourceID, err)
 		}

 		// 复制到 common thumbs 目录的副本，名字按 videoID 来
-		videoID := BuildVideoID(driveID, item.viewkey)
+		videoID := BuildVideoID(driveID, item.sourceID)
 		commonThumb := filepath.Join(commonThumbs, videoID+".jpg")
 		if _, err := os.Stat(commonThumb); err != nil {
 			t.Fatalf("common thumb %s missing: %v", commonThumb, err)
@@ -158,8 +159,8 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
 	}

 	// 6. 检查 catalog 入库
-	for _, viewkey := range []string{"vk-001", "vk-002"} {
-		videoID := BuildVideoID(driveID, viewkey)
+	for _, sourceID := range []string{"120001", "120002"} {
+		videoID := BuildVideoID(driveID, sourceID)
 		v, err := cat.GetVideo(context.Background(), videoID)
 		if err != nil {
 			t.Fatalf("GetVideo %s: %v", videoID, err)
@@ -167,8 +168,8 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
 		if v.DriveID != driveID {
 			t.Fatalf("video %s drive_id = %q want %q", videoID, v.DriveID, driveID)
 		}
-		if v.FileID != viewkey+".mp4" {
-			t.Fatalf("video %s file_id = %q want %q", videoID, v.FileID, viewkey+".mp4")
+		if v.FileID != sourceID+".mp4" {
+			t.Fatalf("video %s file_id = %q want %q", videoID, v.FileID, sourceID+".mp4")
 		}
 		if v.ThumbnailURL == "" {
 			t.Fatalf("video %s ThumbnailURL empty (cover should be ready)", videoID)
@@ -189,7 +190,7 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
 		}
 	}

-	// 7. 第二次 RunOnce：viewkey 已存在 → 全部 skipped，无新文件下载
+	// 7. 第二次 RunOnce：源视频 ID 已存在 → 全部 skipped，无新文件下载
 	newVideos = nil
 	res2, err := c.RunOnce(context.Background(), 15)
 	if err != nil {
@@ -201,7 +202,7 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
 	if res2.Skipped != 2 {
 		t.Fatalf("second run Skipped = %d, want 2", res2.Skipped)
 	}
-	// 第二次运行时 catalog 里已经有 2 条，seen snapshot 应该写出 2 个 viewkey
+	// 第二次运行时 catalog 里已经有 2 条，seen snapshot 应该写出 2 个源视频 ID
 	if res2.SeenSnapshot != 2 {
 		t.Fatalf("second run SeenSnapshot = %d, want 2", res2.SeenSnapshot)
 	}
@@ -249,10 +250,10 @@ func TestCrawlerThumbDownloadFailureMarksStatusFailed(t *testing.T) {
 	// 假 HTTP 服务器：thumb 路径返回 500，video 正常返回字节。
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		switch {
-		case strings.Contains(r.URL.Path, "video.mp4"):
+		case strings.Contains(r.URL.Path, "120101.mp4"):
 			w.Header().Set("Content-Type", "video/mp4")
 			_, _ = w.Write([]byte("FAKEVIDEO"))
-		case strings.Contains(r.URL.Path, "thumb.jpg"):
+		case strings.Contains(r.URL.Path, "120101.jpg"):
 			http.Error(w, "broken", http.StatusInternalServerError)
 		default:
 			http.NotFound(w, r)
@@ -263,8 +264,8 @@ func TestCrawlerThumbDownloadFailureMarksStatusFailed(t *testing.T) {
 	videoEntries := []map[string]string{
 		{
 			"title":      "Thumb Failure Video",
-			"thumb_url":  srv.URL + "/thumbs/thumb.jpg",
-			"video_url":  srv.URL + "/videos/video.mp4",
+			"thumb_url":  srv.URL + "/thumb/120101.jpg",
+			"video_url":  srv.URL + "/videos/120101.mp4",
 			"viewkey":    "vk-thumb-fail",
 			"detail_url": srv.URL + "/v.php?viewkey=vk-thumb-fail",
 		},
@@ -306,7 +307,7 @@ func TestCrawlerThumbDownloadFailureMarksStatusFailed(t *testing.T) {
 		t.Fatalf("expected 1 new video, got %d (failed=%d)", res.NewVideos, res.Failed)
 	}

-	got, err := cat.GetVideo(context.Background(), "spider91-"+driveID+"-vk-thumb-fail")
+	got, err := cat.GetVideo(context.Background(), "spider91-"+driveID+"-120101")
 	if err != nil {
 		t.Fatalf("get video: %v", err)
 	}
@@ -327,6 +328,297 @@ func TestCrawlerThumbDownloadFailureMarksStatusFailed(t *testing.T) {
 	}
 }

+func TestCrawlerUsesCrawlerVideoURLForFirstDownload(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("shell-based fake script only on unix")
+	}
+	tmp := t.TempDir()
+
+	var detailRequests int32
+	var originalRequests int32
+	var wrongRequests int32
+	var srv *httptest.Server
+	srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/v.php":
+			atomic.AddInt32(&detailRequests, 1)
+			_, _ = w.Write([]byte(spider91DetailHTML(srv.URL + "/videos/856305.mp4?token=wrong")))
+		case r.URL.Path == "/videos/120201.mp4" && r.URL.Query().Get("token") == "original":
+			atomic.AddInt32(&originalRequests, 1)
+			w.Header().Set("Content-Type", "video/mp4")
+			_, _ = w.Write([]byte("ORIGINALVIDEO"))
+		case r.URL.Path == "/videos/856305.mp4":
+			atomic.AddInt32(&wrongRequests, 1)
+			w.Header().Set("Content-Type", "video/mp4")
+			_, _ = w.Write([]byte("WRONGVIDEO"))
+		case r.URL.Path == "/thumb/120201.jpg":
+			w.Header().Set("Content-Type", "image/jpeg")
+			_, _ = w.Write([]byte("\xff\xd8\xff\xe0thumb"))
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+
+	entry := map[string]string{
+		"title":      "Use Original URL First",
+		"thumb_url":  srv.URL + "/thumb/wrong-thumb.jpg",
+		"video_url":  srv.URL + "/videos/120201.mp4?token=original",
+		"viewkey":    "vk-use-original",
+		"detail_url": srv.URL + "/v.php?viewkey=vk-use-original",
+	}
+	cat, drv, scriptPath := seedCrawlerTestDeps(t, tmp, "use-original-drive", []map[string]string{entry})
+	c := NewCrawler(CrawlerConfig{
+		Driver:          drv,
+		Catalog:         cat,
+		PythonPath:      "sh",
+		ScriptPath:      scriptPath,
+		CommonThumbDir:  filepath.Join(tmp, "previews", "thumbs"),
+		SpiderTimeout:   10 * time.Second,
+		DownloadTimeout: 10 * time.Second,
+	})
+
+	res, err := c.RunOnce(context.Background(), 1)
+	if err != nil {
+		t.Fatalf("RunOnce: %v", err)
+	}
+	if res.NewVideos != 1 || res.Failed != 0 {
+		t.Fatalf("result new=%d failed=%d, want 1/0", res.NewVideos, res.Failed)
+	}
+	if got := atomic.LoadInt32(&detailRequests); got != 0 {
+		t.Fatalf("detail requests = %d, want 0 (first download should use crawler URL)", got)
+	}
+	if got := atomic.LoadInt32(&originalRequests); got != 1 {
+		t.Fatalf("original URL requests = %d, want 1", got)
+	}
+	if got := atomic.LoadInt32(&wrongRequests); got != 0 {
+		t.Fatalf("wrong source URL requests = %d, want 0", got)
+	}
+	info, err := os.Stat(filepath.Join(drv.RootDir(), "videos", "120201.mp4"))
+	if err != nil {
+		t.Fatalf("original video missing: %v", err)
+	}
+	if info.Size() != int64(len("ORIGINALVIDEO")) {
+		t.Fatalf("original video size = %d, want %d", info.Size(), len("ORIGINALVIDEO"))
+	}
+}
+
+func TestCrawlerRefreshesVideoURLAfterExpiredDownload(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("shell-based fake script only on unix")
+	}
+	tmp := t.TempDir()
+
+	var detailRequests int32
+	var staleRequests int32
+	var freshRequests int32
+	var srv *httptest.Server
+	srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/v.php":
+			n := atomic.AddInt32(&detailRequests, 1)
+			videoURL := srv.URL + "/videos/120202.mp4?token=stale"
+			if n > 1 {
+				videoURL = srv.URL + "/videos/120202.mp4?token=fresh"
+			}
+			_, _ = w.Write([]byte(spider91DetailHTML(videoURL)))
+		case r.URL.Path == "/videos/120202.mp4" && r.URL.Query().Get("token") == "stale":
+			atomic.AddInt32(&staleRequests, 1)
+			http.Error(w, "expired", http.StatusForbidden)
+		case r.URL.Path == "/videos/120202.mp4" && r.URL.Query().Get("token") == "fresh":
+			atomic.AddInt32(&freshRequests, 1)
+			w.Header().Set("Content-Type", "video/mp4")
+			_, _ = w.Write([]byte("REFRESHEDVIDEO"))
+		case r.URL.Path == "/thumb/120202.jpg":
+			w.Header().Set("Content-Type", "image/jpeg")
+			_, _ = w.Write([]byte("\xff\xd8\xff\xe0thumb"))
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+
+	entry := map[string]string{
+		"title":      "Refresh After Expired Download",
+		"thumb_url":  srv.URL + "/thumb/wrong-thumb.jpg",
+		"video_url":  srv.URL + "/videos/120202.mp4?token=old",
+		"viewkey":    "vk-refresh-after",
+		"detail_url": srv.URL + "/v.php?viewkey=vk-refresh-after",
+	}
+	cat, drv, scriptPath := seedCrawlerTestDeps(t, tmp, "refresh-after-drive", []map[string]string{entry})
+	c := NewCrawler(CrawlerConfig{
+		Driver:          drv,
+		Catalog:         cat,
+		PythonPath:      "sh",
+		ScriptPath:      scriptPath,
+		CommonThumbDir:  filepath.Join(tmp, "previews", "thumbs"),
+		SpiderTimeout:   10 * time.Second,
+		DownloadTimeout: 10 * time.Second,
+	})
+
+	res, err := c.RunOnce(context.Background(), 1)
+	if err != nil {
+		t.Fatalf("RunOnce: %v", err)
+	}
+	if res.NewVideos != 1 || res.Failed != 0 {
+		t.Fatalf("result new=%d failed=%d, want 1/0", res.NewVideos, res.Failed)
+	}
+	if got := atomic.LoadInt32(&detailRequests); got < 2 {
+		t.Fatalf("detail requests = %d, want at least 2 (initial refresh + retry refresh)", got)
+	}
+	if got := atomic.LoadInt32(&staleRequests); got != 1 {
+		t.Fatalf("stale URL requests = %d, want 1", got)
+	}
+	if got := atomic.LoadInt32(&freshRequests); got != 1 {
+		t.Fatalf("fresh URL requests = %d, want 1", got)
+	}
+	info, err := os.Stat(filepath.Join(drv.RootDir(), "videos", "120202.mp4"))
+	if err != nil {
+		t.Fatalf("refreshed video missing: %v", err)
+	}
+	if info.Size() != int64(len("REFRESHEDVIDEO")) {
+		t.Fatalf("refreshed video size = %d, want %d", info.Size(), len("REFRESHEDVIDEO"))
+	}
+}
+
+func TestCrawlerRejectsRefreshedSourceIDMismatch(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("shell-based fake script only on unix")
+	}
+	tmp := t.TempDir()
+
+	var srv *httptest.Server
+	srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/v.php":
+			_, _ = w.Write([]byte(spider91DetailHTML(srv.URL + "/videos/856305.mp4?token=fresh")))
+		case r.URL.Path == "/videos/1203058.mp4":
+			http.Error(w, "expired", http.StatusForbidden)
+		case r.URL.Path == "/videos/856305.mp4":
+			w.Header().Set("Content-Type", "video/mp4")
+			_, _ = w.Write([]byte("WRONGVIDEO"))
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+
+	entry := map[string]string{
+		"title":      "Source ID Mismatch",
+		"thumb_url":  srv.URL + "/thumb/1203058.jpg",
+		"video_url":  srv.URL + "/videos/1203058.mp4?token=old",
+		"viewkey":    "86fd91cce1f2e1a154cc",
+		"source_id":  "1203058",
+		"detail_url": srv.URL + "/v.php?viewkey=86fd91cce1f2e1a154cc",
+	}
+	cat, drv, scriptPath := seedCrawlerTestDeps(t, tmp, "mismatch-drive", []map[string]string{entry})
+	c := NewCrawler(CrawlerConfig{
+		Driver:          drv,
+		Catalog:         cat,
+		PythonPath:      "sh",
+		ScriptPath:      scriptPath,
+		CommonThumbDir:  filepath.Join(tmp, "previews", "thumbs"),
+		SpiderTimeout:   10 * time.Second,
+		DownloadTimeout: 10 * time.Second,
+	})
+
+	res, err := c.RunOnce(context.Background(), 1)
+	if err != nil {
+		t.Fatalf("RunOnce: %v", err)
+	}
+	if res.NewVideos != 0 || res.Failed != 1 {
+		t.Fatalf("result new=%d failed=%d, want 0/1", res.NewVideos, res.Failed)
+	}
+	if _, err := os.Stat(filepath.Join(drv.RootDir(), "videos", "1203058.mp4")); !os.IsNotExist(err) {
+		t.Fatalf("mismatched source file should not be written, stat err=%v", err)
+	}
+	if v, _ := cat.GetVideo(context.Background(), BuildVideoID(drv.ID(), "1203058")); v != nil {
+		t.Fatalf("mismatched video should not be inserted: %+v", v)
+	}
+}
+
+func TestSourceIDForItemRequiresNumericSourceID(t *testing.T) {
+	if got := sourceIDForItem(spiderVideoEntry{
+		Viewkey:  "86fd91cce1f2e1a154cc",
+		VideoURL: "https://cdn.example/videos/1203058.mp4?token=x",
+	}); got != "1203058" {
+		t.Fatalf("sourceIDForItem(video url) = %q, want 1203058", got)
+	}
+	if got := sourceIDForItem(spiderVideoEntry{
+		Viewkey:  "86fd91cce1f2e1a154cc",
+		ThumbURL: "https://img.example/thumb/1203058.jpg",
+	}); got != "1203058" {
+		t.Fatalf("sourceIDForItem(thumb url) = %q, want 1203058", got)
+	}
+	if got := sourceIDForItem(spiderVideoEntry{
+		Viewkey:  "86fd91cce1f2e1a154cc",
+		SourceID: "not-numeric",
+		VideoURL: "https://cdn.example/videos/video.mp4",
+	}); got != "" {
+		t.Fatalf("sourceIDForItem(non numeric) = %q, want empty", got)
+	}
+}
+
+func TestNormalizeThumbURLForSource(t *testing.T) {
+	got := normalizeThumbURLForSource("https://img.example/thumb/856305.jpg?x=1#frag", "1203058")
+	want := "https://img.example/thumb/1203058.jpg"
+	if got != want {
+		t.Fatalf("normalizeThumbURLForSource = %q, want %q", got, want)
+	}
+}
+
+func TestSpider91ListURLForDetail(t *testing.T) {
+	got := spider91ListURLForDetail("https://www.91porn.com/view_video.php?viewkey=abc&page=5&c=furum&viewtype=basic&category=top")
+	want := "https://www.91porn.com/v.php?category=top&page=5&viewtype=basic"
+	if got != want {
+		t.Fatalf("spider91ListURLForDetail = %q, want %q", got, want)
+	}
+	if got := spider91ListURLForDetail("http://127.0.0.1/v.php?viewkey=abc&page=5&viewtype=basic&category=top"); got != "" {
+		t.Fatalf("spider91ListURLForDetail(localhost) = %q, want empty", got)
+	}
+}
+
+func TestSpider91CookieHeader(t *testing.T) {
+	got := spider91CookieHeader([]*http.Cookie{
+		{Name: "CLIPSHARE", Value: "abc"},
+		{Name: "ga", Value: "def"},
+		{Name: "mode", Value: "m"},
+	})
+	want := "mode=d; CLIPSHARE=abc; ga=def"
+	if got != want {
+		t.Fatalf("spider91CookieHeader = %q, want %q", got, want)
+	}
+}
+
+func spider91DetailHTML(videoURL string) string {
+	fragment := `<video><source src="` + videoURL + `" type="video/mp4"></video>`
+	return `document.write(strencode2("` + url.PathEscape(fragment) + `"));`
+}
+
+func seedCrawlerTestDeps(t *testing.T, tmp, driveID string, entries []map[string]string) (*catalog.Catalog, *Driver, string) {
+	t.Helper()
+	scriptPath := filepath.Join(tmp, driveID+"-fake.sh")
+	if err := os.WriteFile(scriptPath, []byte(buildFakeSpiderScript(entries)), 0o755); err != nil {
+		t.Fatalf("write script: %v", err)
+	}
+	cat, err := catalog.Open(filepath.Join(tmp, driveID+".db"))
+	if err != nil {
+		t.Fatalf("catalog: %v", err)
+	}
+	t.Cleanup(func() {
+		if err := cat.Close(); err != nil {
+			t.Fatalf("close catalog: %v", err)
+		}
+	})
+	drv := New(Config{ID: driveID, RootDir: filepath.Join(tmp, "spider91", driveID)})
+	if err := cat.UpsertDrive(context.Background(), &catalog.Drive{
+		ID: driveID, Kind: Kind, Name: driveID,
+	}); err != nil {
+		t.Fatalf("upsert drive: %v", err)
+	}
+	return cat, drv, scriptPath
+}
+
 // buildFakeSpiderScript 生成一个伪 python 脚本（其实是 sh）。
 //
 // 行为：
@@ -4,7 +4,7 @@
 // 与其它 drive 不同的是：
 //   - 数据来源不是云盘 API，而是 Python 子进程跑 spider_91porn.py 后下载到本地
 //   - StreamURL 直接返回本地文件路径，由 api.handleSpider91Video 用 http.ServeFile 服务
-//   - List/Stat 用于 GC 兜底（按 viewkey 列出 videos/ 目录）
+//   - List/Stat 用于 GC 兜底（按本地文件名列出 videos/ 目录）
 package spider91

 import (
@@ -77,12 +77,12 @@ func (d *Driver) ThumbsDir() string { return filepath.Join(d.rootDir, "thumbs")
 // RootDir 返回 driver 的存储根。
 func (d *Driver) RootDir() string { return d.rootDir }

-// VideoPath 返回某个 viewkey 对应的视频文件绝对路径，并校验路径不会逃出 videos/ 目录。
+// VideoPath 返回某个视频文件的绝对路径，并校验路径不会逃出 videos/ 目录。
 func (d *Driver) VideoPath(fileID string) (string, error) {
 	return safeJoin(d.VideosDir(), fileID)
 }

-// ThumbPath 返回某个 viewkey 对应的封面文件绝对路径。
+// ThumbPath 返回某个封面文件的绝对路径。
 func (d *Driver) ThumbPath(fileID string) (string, error) {
 	return safeJoin(d.ThumbsDir(), fileID)
 }
@@ -117,7 +117,7 @@ func (d *Driver) List(ctx context.Context, dirID string) ([]drives.Entry, error)
 	return out, nil
 }

-// Stat 查询单个 viewkey 视频文件的元数据。
+// Stat 查询单个视频文件的元数据。
 func (d *Driver) Stat(ctx context.Context, fileID string) (*drives.Entry, error) {
 	path, err := d.VideoPath(fileID)
 	if err != nil {