fix: harden spider91 source matching

This commit is contained in:
nianzhibai
2026-05-28 16:10:20 +08:00
parent 7540371838
commit d2d4db8062
8 changed files with 904 additions and 165 deletions
+150 -63
View File
@@ -24,7 +24,7 @@
CLI 参数:
--page N 只爬第 N 页,配合 --output 用于手动调试
--target-new N 从 page 1 起翻页直到凑够 N 个新视频(不在 seen 列表里的)
--seen-viewkeys-file FILE 每行一个已知 viewkey,命中即跳过;与 --target-new 配合使用
--seen-viewkeys-file FILE 每行一个已知 viewkey 或 mp4 源 ID,命中即跳过;与 --target-new 配合使用
--output FILE 输出 JSON 路径,覆盖默认的 OUTPUT_FILE
--no-resume 禁用断点续爬(单页/target-new 模式下自动禁用)
--quiet 压缩日志,每条视频只输出一行
@@ -44,6 +44,7 @@ CLI 参数:
"thumb_url": "https://...thumb/xxxx.jpg",
"video_url": "https://...mp43/xxxx.mp4?st=...",
"viewkey": "abc123...",
"source_id": "xxxx",
"detail_url": "https://...view_video.php?viewkey=..."
},
...
@@ -69,7 +70,7 @@ import json
import os
import sys
import html
from urllib.parse import urljoin, unquote
from urllib.parse import urljoin, unquote, urlparse
from datetime import datetime
try:
@@ -153,6 +154,9 @@ class Porn91Spider:
"""
self.session = requests.Session()
self.session.headers.update(HEADERS)
# 91porn 没有固定 mode cookie 时,详情页首次请求可能返回与列表卡片
# 不一致的视频源;固定桌面模式让列表页和详情页解析保持一致。
self.session.cookies.set("mode", "d")
# 解析后的实际配置;优先使用构造参数,回退到模块级配置
self.output_file = output_file if output_file is not None else OUTPUT_FILE
@@ -164,7 +168,7 @@ class Porn91Spider:
self.max_empty_pages = (
MAX_EMPTY_PAGES if max_empty_pages is None else int(max_empty_pages)
)
# target_new 是 backend 触发时的核心模式:累计处理这么多新 viewkey 后退出。
# target_new 是 backend 触发时的核心模式:累计处理这么多新源视频后退出。
self.target_new = target_new if target_new and target_new > 0 else None
self.quiet = bool(quiet)
# stream_output:每解析出一个 video 直链立即输出一行 JSON 到 stdout
@@ -186,7 +190,7 @@ class Porn91Spider:
self.session.mount("http://", adapter)
except ImportError:
pass # urllib3 版本可能较低
self.results = []
self.pages_crawled = 0
self.processed_videos = 0
@@ -194,9 +198,8 @@ class Porn91Spider:
self.failed_videos = 0
self.skip_viewkeys = set()
# backend 通过 --seen-viewkeys-file 传进来一批已入库的 viewkey
# 这里直接注入到 skip_viewkeys,让 _process_video_list 在看到这些 viewkey 时
# 跳过详情页请求,配合 --target-new 实现"凑够 N 个新"的语义。
# backend 通过 --seen-viewkeys-file 传进来一批已入库的历史 ID。
# 兼容旧名:文件里可能是 viewkey,也可能是新逻辑使用的 mp4 源 ID。
if seen_viewkeys:
for vk in seen_viewkeys:
if not vk:
@@ -204,7 +207,7 @@ class Porn91Spider:
vk = vk.strip()
if vk:
self.skip_viewkeys.add(vk)
# 断点续爬:加载已有结果,跳过已处理的 viewkey
if self.resume and os.path.exists(self.output_file):
try:
@@ -257,12 +260,12 @@ class Porn91Spider:
headers_extra = {}
if referer:
headers_extra["Referer"] = referer
for attempt in range(1, MAX_RETRIES + 1):
try:
self.log(f"正在请求: {description or url} (尝试 {attempt}/{MAX_RETRIES})")
response = self.session.get(url, timeout=30, headers=headers_extra)
# 检查是否被Cloudflare拦截 (需在 raise_for_status 之前)
if response.status_code == 403:
self.log("警告: 收到 403 Forbidden,可能被拦截")
@@ -270,19 +273,19 @@ class Porn91Spider:
self.random_sleep(RETRY_DELAY, RETRY_DELAY + 3)
continue
return ""
response.raise_for_status()
# 优先使用 content.decode('utf-8'),避免 requests 编码检测问题
try:
html_content = response.content.decode('utf-8', errors='replace')
except Exception:
html_content = response.text
# Cloudflare 挑战检测:如果页面主要内容只有挑战页面,而非正常内容
# 注意:网站本身会加载 challenge-platform 脚本,所以不能仅凭此判断
is_cf_challenge = (
"Just a moment" in html_content and
"Just a moment" in html_content and
len(html_content) < 8000
)
if is_cf_challenge:
@@ -291,7 +294,7 @@ class Porn91Spider:
self.random_sleep(RETRY_DELAY, RETRY_DELAY + 5)
continue
return ""
return html_content
except requests.exceptions.HTTPError as e:
self.log(f"HTTP错误: {e}")
@@ -315,50 +318,61 @@ class Porn91Spider:
"""
videos = []
soup = BeautifulSoup(html, 'lxml')
# 查找所有视频链接 (每个视频会出现两次: wwtrx 和 awwtrx 的 c 参数不同)
# 通过 seen_viewkeys 去重
video_links = soup.find_all('a', href=re.compile(r'view_video\.php\?viewkey='))
seen_viewkeys = set()
for link in video_links:
# 只解析正常视频卡片。页面中还混有 col-lg-8 的异常大卡片,里面的标题、
# thumb、detail URL 会串到其它视频,不能作为入库来源。
video_cards = soup.select('div.col-xs-12.col-sm-4.col-md-3.col-lg-3')
seen_cards = set()
for card in video_cards:
link = card.find('a', href=re.compile(r'view_video\.php\?viewkey='))
if not link:
continue
href = link.get('href', '')
if not href:
continue
# 提取 viewkey
match = re.search(r'viewkey=([^&]+)', href)
if not match:
continue
viewkey = match.group(1)
if viewkey in seen_viewkeys:
continue
seen_viewkeys.add(viewkey)
detail_url = urljoin(BASE_URL, href)
# 提取标题
title = self._extract_title(link)
# 提取封面图URL
# 提取列表卡片来源 ID 和封面图 URL
thumb_url = ""
source_id = ""
overlay = link.find(id=re.compile(r'^playvthumb_\d+$'))
if overlay:
source_id = overlay.get('id', '').rsplit('_', 1)[-1]
img = link.find('img', class_=re.compile(r'img-responsive'))
if img:
thumb_url = img.get('src', '') or img.get('data-original', '')
if thumb_url:
thumb_url = urljoin(BASE_URL, thumb_url)
if not source_id and thumb_url:
source_id = self._extract_thumb_source_id(thumb_url)
card_key = source_id or detail_url
if card_key in seen_cards:
continue
seen_cards.add(card_key)
videos.append({
"title": title,
"detail_url": detail_url,
"thumb_url": thumb_url,
"viewkey": viewkey
"viewkey": viewkey,
"source_id": source_id
})
return videos
def _extract_title(self, link) -> str:
"""
从视频链接标签中提取并清理标题
@@ -369,12 +383,12 @@ class Porn91Spider:
title = title_el.get_text(strip=True)
if title:
return html.unescape(title)
# 备用: 从 link 的 title 属性提取
title = link.get('title', '').strip()
if title:
return html.unescape(title)
# 最后手段: 从链接文本提取并清理前缀
text = link.get_text(separator=' ', strip=True)
# 去掉前缀: "HD" / "91" / 时间戳 "HH:MM:SS"
@@ -385,13 +399,17 @@ class Porn91Spider:
def parse_detail_page(self, html: str) -> dict:
"""
解析详情页,提取视频直链
返回: {"video_url": "..."} 或空字典
返回: {"video_url": "...", "source_id": "...", "title": "..."} 或空字典
"""
result = {}
if not html:
return result
title = self._extract_detail_title(html)
if title:
result["title"] = title
# 方法1: 解码 strencode2 (主要方式, 页面通过 document.write 动态写入 video 标签)
# 格式: document.write(strencode2("%3c%73%6f..."));
strencode_match = re.search(r'strencode2\(["\']([^"\']+)["\']\)', html)
@@ -400,7 +418,7 @@ class Porn91Spider:
try:
# strencode2 在JS中等价于 unescape / decodeURIComponent
decoded = unquote(encoded)
# 从解码后的 HTML 片段中提取 src
src_match = re.search(r"src=['\"]([^'\"]+)['\"]", decoded)
if src_match:
@@ -408,10 +426,11 @@ class Porn91Spider:
# 规范化双斜杠 (如 https://host//path -> https://host/path)
video_url = re.sub(r'(https?://[^/]+)//+', r'\1/', video_url)
result["video_url"] = video_url
result["source_id"] = self._extract_source_id(video_url)
return result
except Exception as e:
self.log(f" 解码 strencode2 失败: {e}")
# 方法2: 通用正则匹配页面中的 mp4 链接 (备用, 过滤广告)
mp4_match = re.search(
r'https?://[^\s"\'<>]+\.mp4[^\s"\'<>]*',
@@ -421,10 +440,53 @@ class Porn91Spider:
url = mp4_match.group(0)
if 'kwai' not in url and 'ad-' not in url.lower():
result["video_url"] = url
result["source_id"] = self._extract_source_id(url)
return result
return result
def _extract_detail_title(self, html_text: str) -> str:
soup = BeautifulSoup(html_text, 'lxml')
title_el = soup.find('title')
if not title_el:
return ""
title = title_el.get_text(" ", strip=True)
title = re.sub(r'\s*-\s*91porn.*$', '', title, flags=re.IGNORECASE).strip()
return html.unescape(title)[:160]
def _extract_source_id(self, video_url: str) -> str:
path = urlparse(video_url or "").path
name = os.path.basename(path)
stem, ext = os.path.splitext(name)
if ext.lower() not in {".mp4", ".m4v", ".mov", ".webm", ".mkv", ".avi"}:
return ""
source_id = re.sub(r'[^0-9]+', '', stem)
if not source_id or source_id != stem:
return ""
return source_id
def _extract_thumb_source_id(self, thumb_url: str) -> str:
path = urlparse(thumb_url or "").path
match = re.search(r'/thumb/(\d+)\.[A-Za-z0-9]+$', path)
return match.group(1) if match else ""
def _thumb_url_for_source(self, thumb_url: str, source_id: str) -> str:
if not thumb_url or not source_id:
return thumb_url
parsed = urlparse(thumb_url)
match = re.search(r'/thumb/([^/?#]+)\.[A-Za-z0-9]+$', parsed.path)
if not match:
return thumb_url
current = match.group(1)
if current == source_id:
return thumb_url
path = re.sub(
r'/thumb/[^/?#]+\.[A-Za-z0-9]+$',
f'/thumb/{source_id}.jpg',
parsed.path,
)
return parsed._replace(path=path, query="", fragment="").geturl()
def crawl(self):
"""
主爬取流程。停止条件(任一满足即停):
@@ -468,19 +530,19 @@ class Porn91Spider:
if crawled_in_session > 0:
self.log("")
self.random_sleep(MIN_PAGE_DELAY, MAX_PAGE_DELAY)
self.log(f"[页 {page_num}] 请求: {page_url}")
page_html = self.fetch_page(page_url, f"列表页 第{page_num}")
if not page_html:
self.log(f"[页 {page_num}] 获取失败,跳过")
consecutive_empty += 1
page_num += 1
crawled_in_session += 1
continue
page_videos = self.parse_list_page(page_html)
# 判断页面是否真的没有视频(而非全部已处理)
if not page_videos:
self.log(f"[页 {page_num}] 页面无视频,可能已到末尾")
@@ -488,24 +550,24 @@ class Porn91Spider:
page_num += 1
crawled_in_session += 1
continue
consecutive_empty = 0
# 过滤已处理的 viewkey,只保留新视频
new_videos = [v for v in page_videos if v['viewkey'] not in self.skip_viewkeys]
skipped_on_page = len(page_videos) - len(new_videos)
if skipped_on_page > 0:
self.log(f"[页 {page_num}] 发现 {len(page_videos)} 个链接, 其中 {skipped_on_page} 个已处理, {len(new_videos)} 个新视频")
else:
self.log(f"[页 {page_num}] 发现 {len(new_videos)} 个视频")
if new_videos:
self._process_video_list(new_videos, referer=page_url)
self.pages_crawled += 1
page_num += 1
crawled_in_session += 1
self._save_results()
self._print_summary()
@@ -522,16 +584,16 @@ class Porn91Spider:
self.log(f" [SKIP] 已处理过: {video['viewkey']}")
self.skipped_videos += 1
continue
self.log(f" 处理视频 {idx}/{len(videos)}: {video['title'][:40]}...")
# 延时控制 (同一批次内第一个视频不延时)
if idx > 1:
self.random_sleep(MIN_DETAIL_DELAY, MAX_DETAIL_DELAY)
# 获取详情页
detail_html = self.fetch_page(video['detail_url'], f"详情页 viewkey={video['viewkey']}", referer=referer)
if not detail_html:
self.log(f" [FAIL] 详情页获取失败: {video['viewkey']}")
video["video_url"] = ""
@@ -539,14 +601,39 @@ class Porn91Spider:
self.skip_viewkeys.add(video['viewkey'])
self.failed_videos += 1
continue
# 解析视频直链
detail_info = self.parse_detail_page(detail_html)
if detail_info.get("video_url"):
video["video_url"] = detail_info["video_url"]
if detail_info.get("title"):
video["title"] = detail_info["title"]
list_source_id = video.get("source_id", "")
detail_source_id = detail_info.get("source_id", "")
if list_source_id and detail_source_id and list_source_id != detail_source_id:
self.log(
f" [FAIL] 详情页视频源不匹配: list_source_id={list_source_id} "
f"detail_source_id={detail_source_id} viewkey={video['viewkey']}"
)
self.failed_videos += 1
self.skip_viewkeys.add(video['viewkey'])
continue
if not list_source_id and detail_source_id:
video["source_id"] = detail_source_id
if video.get("source_id"):
video["thumb_url"] = self._thumb_url_for_source(
video.get("thumb_url", ""),
video["source_id"],
)
if video["source_id"] in self.skip_viewkeys:
self.log(f" [SKIP] 已处理过 source_id: {video['source_id']}")
self.skipped_videos += 1
continue
self.results.append(video)
self.skip_viewkeys.add(video['viewkey'])
if video.get("source_id"):
self.skip_viewkeys.add(video["source_id"])
self.processed_videos += 1
self.log(f" [OK] 成功提取视频直链")
# 流式:立刻把这条 entry 交给 Go 端开始下载,不等本批余下视频
@@ -572,7 +659,7 @@ class Porn91Spider:
"failed": self.failed_videos,
"videos": self.results
}
try:
# 保证父目录存在;写入临时文件后原子 rename,避免读到半截 JSON
out_path = self.output_file
@@ -662,9 +749,9 @@ def main():
parser.add_argument("--quiet", action="store_true",
help="压缩日志,每条视频只输出关键事件")
parser.add_argument("--target-new", type=int, default=None,
help="目标新增模式:从 page 1 起翻页直到累计处理这么多新 viewkey 后停止(backend 凌晨任务用)")
help="目标新增模式:从 page 1 起翻页直到累计处理这么多新源视频后停止(backend 凌晨任务用)")
parser.add_argument("--seen-viewkeys-file", type=str, default=None,
help="文件路径,每行一个已处理过的 viewkey;脚本会跳过这些视频不再请求详情页")
help="文件路径,每行一个已处理过的 viewkey 或 mp4 源 ID;脚本会跳过这些视频")
parser.add_argument("--stream-output", action="store_true",
help="流式模式:每解析一条视频直链就立即把它作为一行 JSON 写到 stdout 并 flush"
"日志改走 stderr。配合 backend 边读边下载使用。")
@@ -678,7 +765,7 @@ def main():
按 Ctrl+C 可随时中断并保存进度
""")
# 加载已知 viewkey(来自 backend 的 catalog 已入库列表)
# 加载已知 ID(来自 backend 的 catalog 已入库列表;兼容旧参数名
seen_viewkeys = []
if args.seen_viewkeys_file:
try:
+9 -2
View File
@@ -2,6 +2,8 @@ package main
import (
"context"
"database/sql"
"errors"
"fmt"
"log"
"net/http"
@@ -279,11 +281,16 @@ type App struct {
// 决定。任何"是否入队 preview worker"的判断都应通过这个方法读,避免把状态
// 散落到 App 内存里和 DB 不一致。
//
// 读 catalog 失败时退化成 false(不生成):比 "默认开" 更安全 —— 读不到状态时
// 倾向不消耗 ffmpeg;调用方会记日志,运维能立刻看到问题。
// local-upload 是内置盘,不一定有 catalog.drives 行;缺省按开启处理。
//
// 其它 drive 读 catalog 失败时退化成 false(不生成):比 "默认开" 更安全 —— 读不到
// 状态时倾向不消耗 ffmpeg;调用方会记日志,运维能立刻看到问题。
func (a *App) teaserEnabledForDrive(ctx context.Context, driveID string) bool {
d, err := a.cat.GetDrive(ctx, driveID)
if err != nil {
if driveID == localupload.DriveID && errors.Is(err, sql.ErrNoRows) {
return true
}
log.Printf("[preview] read teaser_enabled drive=%s: %v (treating as disabled)", driveID, err)
return false
}
+27 -21
View File
@@ -46,9 +46,9 @@ func TestRegisterPreviewWorkerBackfillsPendingWhenDriveTeaserEnabled(t *testing.
}
app := &App{
cat: cat,
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
cat: cat,
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
}
worker := preview.NewWorker(&serverFakeTeaserGenerator{}, cat, &serverFakeDrive{})
go worker.Run(ctx)
@@ -106,9 +106,9 @@ func TestRegisterPreviewWorkersGenerateThumbnailsBeforePreviews(t *testing.T) {
}
app := &App{
cat: cat,
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
cat: cat,
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
}
gen := &serverFakeTeaserGenerator{}
drv := &serverFakeDrive{}
@@ -194,9 +194,9 @@ func TestFailedThumbnailsDoNotBlockPreviewGeneration(t *testing.T) {
}
app := &App{
cat: cat,
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
cat: cat,
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
}
gen := &serverFakeTeaserGenerator{}
drv := &serverFakeDrive{}
@@ -261,9 +261,9 @@ func TestRegenFailedPreviewsQueuesOnlyFailedVideosForDrive(t *testing.T) {
}
app := &App{
cat: cat,
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
cat: cat,
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
}
worker := preview.NewWorker(&serverFakeTeaserGenerator{}, cat, &serverFakeDrive{})
go worker.Run(ctx)
@@ -311,7 +311,7 @@ func TestRegenFailedPreviewsQueuesOnlyFailedVideosForDrive(t *testing.T) {
}
}
func TestEnqueueUploadedVideoQueuesLocalPreviewWorker(t *testing.T) {
func TestEnqueueUploadedVideoQueuesLocalGenerationByDefault(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
@@ -325,7 +325,6 @@ func TestEnqueueUploadedVideoQueuesLocalPreviewWorker(t *testing.T) {
}
})
seedDriveWithTeaser(t, cat, "local-upload", true)
video := &catalog.Video{
ID: "local-upload-video",
DriveID: "local-upload",
@@ -341,14 +340,19 @@ func TestEnqueueUploadedVideoQueuesLocalPreviewWorker(t *testing.T) {
}
app := &App{
cat: cat,
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
cat: cat,
workers: make(map[string]*preview.Worker),
thumbWorkers: make(map[string]*preview.ThumbWorker),
}
worker := preview.NewWorker(&serverFakeTeaserGenerator{}, cat, &serverLocalUploadFakeDrive{})
gen := &serverFakeTeaserGenerator{}
drv := &serverLocalUploadFakeDrive{}
worker := preview.NewWorker(gen, cat, drv)
thumbWorker := preview.NewThumbWorker(gen, cat, drv)
go worker.Run(ctx)
go thumbWorker.Run(ctx)
app.mu.Lock()
app.workers["local-upload"] = worker
app.thumbWorkers["local-upload"] = thumbWorker
app.mu.Unlock()
app.enqueueUploadedVideo(ctx, video)
@@ -359,10 +363,13 @@ func TestEnqueueUploadedVideoQueuesLocalPreviewWorker(t *testing.T) {
if err != nil {
t.Fatalf("get video: %v", err)
}
if got.PreviewStatus == "ready" {
if got.PreviewStatus == "ready" && got.ThumbnailURL != "" {
if got.PreviewLocal != "/tmp/local-upload-video.mp4" {
t.Fatalf("preview local = %q, want generated local teaser path", got.PreviewLocal)
}
if got.ThumbnailURL != "/p/thumb/local-upload-video" {
t.Fatalf("thumbnail url = %q, want generated thumbnail URL", got.ThumbnailURL)
}
return
}
time.Sleep(10 * time.Millisecond)
@@ -372,7 +379,7 @@ func TestEnqueueUploadedVideoQueuesLocalPreviewWorker(t *testing.T) {
if err != nil {
t.Fatalf("get video after timeout: %v", err)
}
t.Fatalf("preview status = %q, want ready", got.PreviewStatus)
t.Fatalf("preview status = %q, thumbnail url = %q; want generated local teaser and thumbnail", got.PreviewStatus, got.ThumbnailURL)
}
func TestShouldScanDriveSkipsLocalUpload(t *testing.T) {
@@ -543,7 +550,6 @@ type serverLocalUploadFakeDrive struct {
func (d *serverLocalUploadFakeDrive) ID() string { return "local-upload" }
// seedDriveWithTeaser 在 catalog 里 upsert 一个测试用的 drive 行,把 TeaserEnabled
// 设为 enabled。teaser 入队判断现在按 per-drive 而不是全局 setting,所以涉及到
// teaser worker 的测试都要先把 drive 行写进 catalog。
+2 -2
View File
@@ -634,8 +634,8 @@ func (s *Server) handleUploadedVideo(w http.ResponseWriter, r *http.Request) {
}
// handleSpider91Video 服务 spider91 drive 下载到本地的视频文件。
// 路径形如 /p/spider91/<videoID>videoID = "spider91-<driveID>-<viewkey>"。
// 通过 catalog 拿到 file_id"<viewkey>.mp4"),再让 driver 解析到绝对路径并 ServeFile。
// 路径形如 /p/spider91/<videoID>videoID = "spider91-<driveID>-<sourceID>"。
// 通过 catalog 拿到 file_id"<sourceID>.mp4"),再让 driver 解析到绝对路径并 ServeFile。
func (s *Server) handleSpider91Video(w http.ResponseWriter, r *http.Request) {
videoID := chi.URLParam(r, "videoID")
v, err := s.Catalog.GetVideo(r.Context(), videoID)
+7 -6
View File
@@ -172,7 +172,7 @@ func (c *Catalog) HideVideo(ctx context.Context, id string) error {
// MigrateVideoToDrive 把 catalog 里 id=videoID 这条视频迁移到另一个 drive。
// 用于 spider91 → PikPak 的迁移:上传成功后改写 drive_id / file_id /
// content_hash,保留视频自身的 idspider91-<driveID>-<viewkey>),这样
// content_hash,保留视频自身的 idspider91-<driveID>-<sourceID>),这样
// 关联表 (video_tags / 收藏 / 点赞) 都不需要动。
//
// scanner 后续看到 PikPak 目录下相同 hash / file_name 的文件时,会通过
@@ -555,7 +555,7 @@ func (c *Catalog) ListVideosByDrive(ctx context.Context, driveID string) ([]*Vid
}
// ListVideoFileIDsByDrive 只返回某 drive 下所有视频的 file_id 集合,
// 比 ListVideosByDrive 轻量spider91 crawler 用它把已知 viewkey 列表喂给 python 脚本
// 比 ListVideosByDrive 轻量。
func (c *Catalog) ListVideoFileIDsByDrive(ctx context.Context, driveID string) ([]string, error) {
rows, err := c.db.QueryContext(ctx,
`SELECT file_id FROM videos WHERE drive_id = ? AND file_id != ''`,
@@ -575,16 +575,17 @@ func (c *Catalog) ListVideoFileIDsByDrive(ctx context.Context, driveID string) (
return out, rows.Err()
}
// ListSpider91Viewkeys 列出某个 spider91 drive 历史上爬过的所有 viewkey
// ListSpider91Viewkeys 列出某个 spider91 drive 历史上爬过的所有 ID 后缀
// 函数名保留历史叫法;新 spider91 数据的后缀是 91 mp4 源 ID,不再是 viewkey。
//
// 不能再用 ListVideoFileIDsByDrive:那个只看 drive_id,但 spider91 视频
// 一旦被 spider91migrate 迁移到 PikPakdrive_id 就变成 PikPak 了。
//
// 这里按 video.id 前缀 "spider91-<driveID>-" 查,即使迁移后视频也仍能被
// 找到——id 本身永远是 "spider91-<driveID>-<viewkey>"
// 找到——id 本身会保留 "spider91-<driveID>-<sourceID>" 这个来源前缀
//
// 用途:crawler 把这个集合写到 seen 文件,让 Python 跳过已爬过的 viewkey
// 详情页,配合 --target-new 真正凑出 N 个未爬过的视频。
// 用途:crawler 把这个集合写到 seen 文件,让 Python/Go 跳过已爬过的视频,
// 配合 --target-new 真正凑出 N 个未爬过的视频。
func (c *Catalog) ListSpider91Viewkeys(ctx context.Context, driveID string) ([]string, error) {
prefix := "spider91-" + driveID + "-"
rows, err := c.db.QueryContext(ctx,
+383 -37
View File
@@ -14,6 +14,7 @@ import (
"os/exec"
"path"
"path/filepath"
"regexp"
"strings"
"sync"
"time"
@@ -113,7 +114,7 @@ type CrawlResult struct {
Skipped int
// Failed 是下载或入库失败的条数。
Failed int
// SeenSnapshot 调用 Python 时实际写出的已知 viewkey 数量。
// SeenSnapshot 调用 Python 时实际写出的已知视频 ID 数量。
SeenSnapshot int
StartedAt time.Time
FinishedAt time.Time
@@ -127,11 +128,12 @@ type spiderVideoEntry struct {
ThumbURL string `json:"thumb_url"`
VideoURL string `json:"video_url"`
Viewkey string `json:"viewkey"`
SourceID string `json:"source_id"`
DetailURL string `json:"detail_url"`
}
// RunOnce 执行一次"跑爬虫 → 下载 → 入库"流程:
// 1. 从 catalog 拉取本 drive 已存在的 viewkey 列表,写到临时文件
// 1. 从 catalog 拉取本 drive 已存在的 91 源视频 ID 列表,写到临时文件
// 2. 启动 Python 爬虫(--target-new + --seen-viewkeys-file + --stream-output),
// Python 每解析出一个 video 直链就把 entry 当作一行 JSON 写到 stdout。
// 3. Go 端 bufio.Scanner 按行读:每行立即下载视频和封面、入库。
@@ -168,7 +170,7 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
result := &CrawlResult{TargetNew: targetNew, StartedAt: time.Now()}
defer func() { result.FinishedAt = time.Now() }()
// 1. 准备 .crawl/ 目录 + 已知 viewkey 列表
// 1. 准备 .crawl/ 目录 + 已知源视频 ID 列表
//
// 关键:路径必须用绝对路径,因为 Python 子进程的 cwd 我们设成了脚本所在目录
// (为了让 Python 用 site-packages 里的 requests 等),传相对路径会被 Python
@@ -222,8 +224,8 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
continue
}
result.TotalEntries++
viewkey := strings.TrimSpace(item.Viewkey)
if viewkey == "" || strings.TrimSpace(item.VideoURL) == "" {
sourceID := sourceIDForItem(item)
if sourceID == "" || strings.TrimSpace(item.VideoURL) == "" {
result.Failed++
continue
}
@@ -231,13 +233,13 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
// Python 侧已用 target_new 控制;这里再兜底防止脚本异常多输出
break
}
videoID := buildVideoID(c.cfg.Driver.ID(), viewkey)
videoID := buildVideoID(c.cfg.Driver.ID(), sourceID)
if existing, _ := c.cfg.Catalog.GetVideo(ctx, videoID); existing != nil {
result.Skipped++
continue
}
if perr := c.processOne(ctx, videoID, item); perr != nil {
log.Printf("[spider91] drive=%s viewkey=%s failed: %v", c.cfg.Driver.ID(), viewkey, perr)
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s failed: %v", c.cfg.Driver.ID(), item.Viewkey, sourceID, perr)
result.Failed++
continue
}
@@ -256,24 +258,24 @@ func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, err
return result, nil
}
// writeSeenViewkeys 把当前 drive 下已入库的 viewkey 写到 path,供 Python 脚本读取。
// writeSeenViewkeys 把当前 drive 下已入库的 91 源视频 ID 写到 path,供 Python 脚本读取。
//
// 注意:不能用 ListVideoFileIDsByDrive(按 drive_id 查),因为 spider91
// 视频被 spider91migrate 迁移到 PikPak 后 drive_id 已经不再是这个 drive。
// 改用 ListSpider91Viewkeys:它按 video.id 前缀("spider91-<driveID>-")查,
// 不受迁移影响——id 永远是 spider91-<driveID>-<viewkey>
// 不受迁移影响。函数名保留历史叫法,实际返回的是 ID 后缀;新数据使用 mp4 源 ID
func (c *Crawler) writeSeenViewkeys(ctx context.Context, path string) (int, error) {
viewkeys, err := c.cfg.Catalog.ListSpider91Viewkeys(ctx, c.cfg.Driver.ID())
seenIDs, err := c.cfg.Catalog.ListSpider91Viewkeys(ctx, c.cfg.Driver.ID())
if err != nil {
return 0, err
}
seen := make(map[string]struct{}, len(viewkeys))
for _, vk := range viewkeys {
vk = strings.TrimSpace(vk)
if vk == "" {
seen := make(map[string]struct{}, len(seenIDs))
for _, id := range seenIDs {
id = strings.TrimSpace(id)
if id == "" {
continue
}
seen[vk] = struct{}{}
seen[id] = struct{}{}
}
tmp := path + ".part"
@@ -281,8 +283,8 @@ func (c *Crawler) writeSeenViewkeys(ctx context.Context, path string) (int, erro
if err != nil {
return 0, err
}
for viewkey := range seen {
if _, err := f.WriteString(viewkey + "\n"); err != nil {
for id := range seen {
if _, err := f.WriteString(id + "\n"); err != nil {
_ = f.Close()
_ = os.Remove(tmp)
return 0, err
@@ -356,15 +358,30 @@ func forwardSpiderLog(driveID string, r io.Reader) {
}
}
// processOne 处理单个 viewkey:下载视频 + 封面 + 复制封面 + 入库。
// processOne 处理单个 91 源视频:下载视频 + 封面 + 复制封面 + 入库。
// 任一步失败会清理已写入的临时文件,不留半成品。
func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVideoEntry) error {
viewkey := item.Viewkey
sourceID := sourceIDForItem(item)
if sourceID == "" {
return errors.New("empty numeric source id")
}
videoURL := strings.TrimSpace(item.VideoURL)
videoSourceID := sourceIDFromVideoURL(videoURL)
if videoSourceID == "" {
return fmt.Errorf("video url has no numeric source id: %s", videoURL)
}
if videoSourceID != sourceID {
return fmt.Errorf("video source id mismatch: got %s want %s", videoSourceID, sourceID)
}
thumbURL := normalizeThumbURLForSource(item.ThumbURL, sourceID)
// 视频文件后缀按直链 URL 真实后缀来定,避免直链返回的不是 mp4 时存错容器。
videoExt := detectVideoExt(item.VideoURL)
videoFile := viewkey + videoExt
videoExt := detectVideoExt(videoURL)
videoFile := sourceID + videoExt
// 封面后缀同理,但 91porn 的封面绝大多数是 jpg;URL 提示其它格式时尊重之。
thumbFile := viewkey + detectThumbExt(item.ThumbURL)
thumbFile := sourceID + detectThumbExt(thumbURL)
videoPath, err := c.cfg.Driver.VideoPath(videoFile)
if err != nil {
@@ -376,10 +393,7 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
}
// 视频先下载(必须);失败直接退出。
dlCtx, cancel := context.WithTimeout(ctx, c.cfg.DownloadTimeout)
defer cancel()
videoSize, err := c.downloadAtomic(dlCtx, item.VideoURL, videoPath, item.DetailURL)
videoSize, err := c.downloadVideoAtomicWithRefresh(ctx, item, videoPath, videoURL, sourceID)
if err != nil {
return fmt.Errorf("download video: %w", err)
}
@@ -388,9 +402,12 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
// thumbnail_status 显式标 'failed'spider91 drive 的 thumb worker 按设计
// 不处理 spider91 视频,没人能"兜底")。
thumbReady := false
if strings.TrimSpace(item.ThumbURL) != "" {
if _, err := c.downloadAtomic(dlCtx, item.ThumbURL, thumbPath, item.DetailURL); err != nil {
log.Printf("[spider91] drive=%s viewkey=%s thumb download failed: %v", c.cfg.Driver.ID(), viewkey, err)
if strings.TrimSpace(thumbURL) != "" {
thumbCtx, cancel := c.downloadAttemptContext(ctx)
_, err := c.downloadAtomic(thumbCtx, thumbURL, thumbPath, item.DetailURL)
cancel()
if err != nil {
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s thumb download failed: %v", c.cfg.Driver.ID(), viewkey, sourceID, err)
} else {
thumbReady = true
}
@@ -404,7 +421,7 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
} else {
dst := filepath.Join(c.cfg.CommonThumbDir, videoID+".jpg")
if err := copyFileAtomic(thumbPath, dst); err != nil {
log.Printf("[spider91] drive=%s viewkey=%s copy thumb to common dir: %v", c.cfg.Driver.ID(), viewkey, err)
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s copy thumb to common dir: %v", c.cfg.Driver.ID(), viewkey, sourceID, err)
thumbReady = false
}
}
@@ -429,7 +446,7 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
UpdatedAt: now,
}
if v.Title == "" {
v.Title = viewkey
v.Title = sourceID
}
if thumbReady {
// 设了 ThumbnailURL 后 thumb worker 会跳过这条视频,
@@ -455,10 +472,56 @@ func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVid
if c.cfg.OnNewVideo != nil {
c.cfg.OnNewVideo(v)
}
log.Printf("[spider91] drive=%s viewkey=%s ok title=%q size=%d", c.cfg.Driver.ID(), viewkey, v.Title, v.Size)
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s ok title=%q size=%d", c.cfg.Driver.ID(), viewkey, sourceID, v.Title, v.Size)
return nil
}
func (c *Crawler) downloadVideoAtomicWithRefresh(ctx context.Context, item spiderVideoEntry, dst, firstURL, expectedSourceID string) (int64, error) {
videoURL := strings.TrimSpace(firstURL)
if videoURL == "" {
videoURL = strings.TrimSpace(item.VideoURL)
}
var lastErr error
for attempt := 1; attempt <= 3; attempt++ {
attemptCtx, cancel := c.downloadAttemptContext(ctx)
size, err := c.downloadAtomic(attemptCtx, videoURL, dst, item.DetailURL)
cancel()
if err == nil {
return size, nil
}
lastErr = err
if ctx.Err() != nil || !shouldRefreshSpider91VideoURL(err) {
return 0, err
}
fresh, refreshErr := c.resolveFreshVideoURL(ctx, item)
if refreshErr != nil {
return 0, fmt.Errorf("%w; refresh video url: %v", err, refreshErr)
}
if fresh == "" || fresh == videoURL {
return 0, err
}
freshSourceID := sourceIDFromVideoURL(fresh)
if freshSourceID == "" {
return 0, fmt.Errorf("%w; refreshed video url has no numeric source id: %s", err, fresh)
}
if expectedSourceID != "" && freshSourceID != expectedSourceID {
return 0, fmt.Errorf("%w; refreshed video source id mismatch: got %s want %s", err, freshSourceID, expectedSourceID)
}
_ = os.Remove(dst + ".part")
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s download attempt=%d failed (%v); refreshed video url and retrying",
c.cfg.Driver.ID(), item.Viewkey, expectedSourceID, attempt, err)
videoURL = fresh
}
return 0, lastErr
}
func (c *Crawler) downloadAttemptContext(ctx context.Context) (context.Context, context.CancelFunc) {
if c.cfg.DownloadTimeout <= 0 {
return ctx, func() {}
}
return context.WithTimeout(ctx, c.cfg.DownloadTimeout)
}
// downloadAtomic 下载 url 到 dst,先写到 dst.part 再 rename,避免半截文件。
// 返回最终文件大小。
func (c *Crawler) downloadAtomic(ctx context.Context, src, dst, referer string) (int64, error) {
@@ -487,7 +550,7 @@ func (c *Crawler) downloadAtomic(ctx context.Context, src, dst, referer string)
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return 0, fmt.Errorf("http %d", resp.StatusCode)
return 0, &downloadHTTPError{StatusCode: resp.StatusCode}
}
tmp := dst + ".part"
@@ -516,6 +579,289 @@ func (c *Crawler) downloadAtomic(ctx context.Context, src, dst, referer string)
return written, nil
}
type downloadHTTPError struct {
StatusCode int
}
func (e *downloadHTTPError) Error() string {
if e == nil {
return "http error"
}
return fmt.Sprintf("http %d", e.StatusCode)
}
func shouldRefreshSpider91VideoURL(err error) bool {
if err == nil {
return false
}
if errors.Is(err, context.Canceled) {
return false
}
if errors.Is(err, context.DeadlineExceeded) {
return true
}
var httpErr *downloadHTTPError
if errors.As(err, &httpErr) {
switch httpErr.StatusCode {
case http.StatusForbidden, http.StatusNotFound, http.StatusGone, http.StatusRequestedRangeNotSatisfiable,
http.StatusTooManyRequests, http.StatusInternalServerError, http.StatusBadGateway, http.StatusServiceUnavailable, http.StatusGatewayTimeout:
return true
default:
return false
}
}
text := strings.ToLower(err.Error())
return strings.Contains(text, "unexpected eof") ||
strings.Contains(text, "connection reset") ||
strings.Contains(text, "connection refused") ||
strings.Contains(text, "broken pipe") ||
strings.Contains(text, "server closed") ||
strings.Contains(text, "timeout")
}
func (c *Crawler) resolveFreshVideoURL(ctx context.Context, item spiderVideoEntry) (string, error) {
detailURL := strings.TrimSpace(item.DetailURL)
if detailURL == "" {
return "", errors.New("empty detail url")
}
cookieHeader := "mode=d"
if warmURL := spider91ListURLForDetail(detailURL); warmURL != "" {
if cookies, err := c.fetchSpider91WarmCookies(ctx, warmURL, detailURL); err == nil {
cookieHeader = spider91CookieHeader(cookies)
} else {
log.Printf("[spider91] drive=%s viewkey=%s warm session failed: %v", c.cfg.Driver.ID(), item.Viewkey, err)
}
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, detailURL, nil)
if err != nil {
return "", err
}
req.Header.Set("User-Agent", downloadUA)
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9")
req.Header.Set("Cookie", cookieHeader)
resp, err := c.cfg.HTTPClient.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return "", &downloadHTTPError{StatusCode: resp.StatusCode}
}
body, err := io.ReadAll(io.LimitReader(resp.Body, 4*1024*1024))
if err != nil {
return "", err
}
videoURL := parseSpider91VideoURL(string(body))
if videoURL == "" {
return "", errors.New("video url not found in detail page")
}
return videoURL, nil
}
func (c *Crawler) fetchSpider91WarmCookies(ctx context.Context, warmURL, referer string) ([]*http.Cookie, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, warmURL, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", downloadUA)
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9")
req.Header.Set("Cookie", "mode=d")
if referer != "" {
req.Header.Set("Referer", referer)
}
resp, err := c.cfg.HTTPClient.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 64*1024))
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return nil, &downloadHTTPError{StatusCode: resp.StatusCode}
}
return resp.Cookies(), nil
}
func spider91ListURLForDetail(detailURL string) string {
u, err := url.Parse(strings.TrimSpace(detailURL))
if err != nil || u == nil || u.Scheme == "" || u.Host == "" {
return ""
}
if !strings.Contains(strings.ToLower(u.Host), "91porn.com") {
return ""
}
q := u.Query()
page := strings.TrimSpace(q.Get("page"))
category := strings.TrimSpace(q.Get("category"))
viewtype := strings.TrimSpace(q.Get("viewtype"))
if page == "" || category == "" || viewtype == "" {
return ""
}
listURL := *u
listURL.Path = "/v.php"
listQuery := url.Values{}
listQuery.Set("category", category)
listQuery.Set("viewtype", viewtype)
listQuery.Set("page", page)
listURL.RawQuery = listQuery.Encode()
listURL.Fragment = ""
return listURL.String()
}
func spider91CookieHeader(cookies []*http.Cookie) string {
values := []string{"mode=d"}
seen := map[string]bool{"mode": true}
for _, cookie := range cookies {
if cookie == nil || strings.TrimSpace(cookie.Name) == "" || seen[cookie.Name] {
continue
}
seen[cookie.Name] = true
values = append(values, cookie.Name+"="+cookie.Value)
}
return strings.Join(values, "; ")
}
var (
strencode2RE = regexp.MustCompile(`strencode2\(["']([^"']+)["']\)`)
srcAttrRE = regexp.MustCompile(`src=['"]([^'"]+)['"]`)
mp4URLRE = regexp.MustCompile(`https?://[^\s"'<>]+\.mp4[^\s"'<>]*`)
)
func parseSpider91VideoURL(html string) string {
if html == "" {
return ""
}
if match := strencode2RE.FindStringSubmatch(html); len(match) == 2 {
if decoded, err := url.PathUnescape(match[1]); err == nil {
if src := srcAttrRE.FindStringSubmatch(decoded); len(src) == 2 {
return normalizeHTTPURLSlashes(src[1])
}
}
}
if match := mp4URLRE.FindString(html); match != "" {
lower := strings.ToLower(match)
if !strings.Contains(lower, "kwai") && !strings.Contains(lower, "ad-") {
return match
}
}
return ""
}
func normalizeHTTPURLSlashes(rawURL string) string {
u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u == nil || u.Scheme == "" || u.Host == "" {
return rawURL
}
for strings.Contains(u.Path, "//") {
u.Path = strings.ReplaceAll(u.Path, "//", "/")
}
return u.String()
}
func sourceIDForItem(item spiderVideoEntry) string {
if id := sanitizeSourceID(item.SourceID); isNumericSourceID(id) {
return id
}
if id := sourceIDFromVideoURL(item.VideoURL); id != "" {
return id
}
if id := sourceIDFromThumbURL(item.ThumbURL); id != "" {
return id
}
return ""
}
func sourceIDFromVideoURL(rawURL string) string {
u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u == nil {
return ""
}
base := path.Base(u.Path)
ext := strings.ToLower(path.Ext(base))
switch ext {
case ".mp4", ".m4v", ".mov", ".webm", ".mkv", ".avi", ".flv":
default:
return ""
}
id := sanitizeSourceID(strings.TrimSuffix(base, ext))
if !isNumericSourceID(id) {
return ""
}
return id
}
func sourceIDFromThumbURL(rawURL string) string {
u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u == nil {
return ""
}
base := path.Base(u.Path)
ext := strings.ToLower(path.Ext(base))
switch ext {
case ".jpg", ".jpeg", ".png", ".webp", ".gif":
default:
return ""
}
id := sanitizeSourceID(strings.TrimSuffix(base, ext))
if !isNumericSourceID(id) {
return ""
}
return id
}
func sanitizeSourceID(raw string) string {
raw = strings.TrimSpace(raw)
if raw == "" {
return ""
}
var b strings.Builder
for _, r := range raw {
if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '_' || r == '-' {
b.WriteRune(r)
}
}
return b.String()
}
func isNumericSourceID(id string) bool {
if id == "" {
return false
}
for _, r := range id {
if r < '0' || r > '9' {
return false
}
}
return true
}
func normalizeThumbURLForSource(rawURL, sourceID string) string {
sourceID = sanitizeSourceID(sourceID)
if strings.TrimSpace(rawURL) == "" || sourceID == "" {
return rawURL
}
u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u == nil || u.Scheme == "" || u.Host == "" {
return rawURL
}
base := path.Base(u.Path)
ext := strings.ToLower(path.Ext(base))
switch ext {
case ".jpg", ".jpeg", ".png", ".webp", ".gif":
default:
return rawURL
}
dir := path.Dir(u.Path)
if dir == "." || dir == "/" || !strings.HasSuffix(dir, "/thumb") {
return rawURL
}
u.Path = path.Join(dir, sourceID+".jpg")
u.RawQuery = ""
u.Fragment = ""
return u.String()
}
// copyFileAtomic 把 src 复制到 dst,先写 .part 再 rename。
func copyFileAtomic(src, dst string) error {
in, err := os.Open(src)
@@ -543,14 +889,14 @@ func copyFileAtomic(src, dst string) error {
return os.Rename(tmp, dst)
}
// BuildVideoID 给定 driveID + viewkey,按统一规则生成 catalog 中 videos.id。
// BuildVideoID 给定 driveID + 91 源视频 ID,按统一规则生成 catalog 中 videos.id。
// 与 scanner 用法一致:<kind>-<driveID>-<fileID>。
func BuildVideoID(driveID, viewkey string) string {
return buildVideoID(driveID, viewkey)
func BuildVideoID(driveID, sourceID string) string {
return buildVideoID(driveID, sourceID)
}
func buildVideoID(driveID, viewkey string) string {
return Kind + "-" + driveID + "-" + viewkey
func buildVideoID(driveID, sourceID string) string {
return Kind + "-" + driveID + "-" + sourceID
}
// detectVideoExt 从直链 URL 推断视频文件后缀。
+322 -30
View File
@@ -5,10 +5,12 @@ import (
"encoding/json"
"net/http"
"net/http/httptest"
"net/url"
"os"
"path/filepath"
"runtime"
"strings"
"sync/atomic"
"testing"
"time"
@@ -17,7 +19,7 @@ import (
// TestCrawlerRunOnceFullFlow 用一个伪 python 脚本 + httptest 服务器
// 把 Crawler.RunOnce 的完整流程跑一遍:脚本生成 JSON、下载视频和封面、入库、
// 重复运行跳过已存在的 viewkey
// 重复运行跳过已存在的 91 源视频 ID
func TestCrawlerRunOnceFullFlow(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("shell-based fake script only on unix")
@@ -28,16 +30,16 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
// 1. 假 HTTP 服务器:根据路径返回视频数据或封面数据
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case strings.Contains(r.URL.Path, "video1.mp4"):
case strings.Contains(r.URL.Path, "120001.mp4"):
w.Header().Set("Content-Type", "video/mp4")
_, _ = w.Write([]byte("FAKEVIDEO1"))
case strings.Contains(r.URL.Path, "video2.mp4"):
case strings.Contains(r.URL.Path, "120002.mp4"):
w.Header().Set("Content-Type", "video/mp4")
_, _ = w.Write([]byte("FAKEVIDEO2BYTES"))
case strings.Contains(r.URL.Path, "thumb1.jpg"):
case strings.Contains(r.URL.Path, "/thumb/120001.jpg"):
w.Header().Set("Content-Type", "image/jpeg")
_, _ = w.Write([]byte("\xff\xd8\xff\xe0fakejpg1"))
case strings.Contains(r.URL.Path, "thumb2.jpg"):
case strings.Contains(r.URL.Path, "/thumb/120002.jpg"):
w.Header().Set("Content-Type", "image/jpeg")
_, _ = w.Write([]byte("\xff\xd8\xff\xe0fakejpg2"))
default:
@@ -52,15 +54,15 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
videoEntries := []map[string]string{
{
"title": "Video One",
"thumb_url": srv.URL + "/thumbs/thumb1.jpg",
"video_url": srv.URL + "/videos/video1.mp4",
"thumb_url": srv.URL + "/thumb/not-120001.jpg",
"video_url": srv.URL + "/videos/120001.mp4",
"viewkey": "vk-001",
"detail_url": srv.URL + "/v.php?viewkey=vk-001",
},
{
"title": "Video Two",
"thumb_url": srv.URL + "/thumbs/thumb2.jpg",
"video_url": srv.URL + "/videos/video2.mp4",
"thumb_url": srv.URL + "/thumb/not-120002.jpg",
"video_url": srv.URL + "/videos/120002.mp4",
"viewkey": "vk-002",
"detail_url": srv.URL + "/v.php?viewkey=vk-002",
},
@@ -128,29 +130,28 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
// 5. 检查文件落盘
for _, item := range []struct {
viewkey string
sourceID string
size int64
thumbLen int
}{
{"vk-001", 10, 11},
{"vk-002", 15, 11},
{"120001", 10},
{"120002", 15},
} {
videoPath := filepath.Join(rootDir, "videos", item.viewkey+".mp4")
videoPath := filepath.Join(rootDir, "videos", item.sourceID+".mp4")
info, err := os.Stat(videoPath)
if err != nil {
t.Fatalf("video %s missing: %v", item.viewkey, err)
t.Fatalf("video %s missing: %v", item.sourceID, err)
}
if info.Size() != item.size {
t.Fatalf("video %s size = %d, want %d", item.viewkey, info.Size(), item.size)
t.Fatalf("video %s size = %d, want %d", item.sourceID, info.Size(), item.size)
}
thumbPath := filepath.Join(rootDir, "thumbs", item.viewkey+".jpg")
thumbPath := filepath.Join(rootDir, "thumbs", item.sourceID+".jpg")
if _, err := os.Stat(thumbPath); err != nil {
t.Fatalf("thumb %s missing: %v", item.viewkey, err)
t.Fatalf("thumb %s missing: %v", item.sourceID, err)
}
// 复制到 common thumbs 目录的副本,名字按 videoID 来
videoID := BuildVideoID(driveID, item.viewkey)
videoID := BuildVideoID(driveID, item.sourceID)
commonThumb := filepath.Join(commonThumbs, videoID+".jpg")
if _, err := os.Stat(commonThumb); err != nil {
t.Fatalf("common thumb %s missing: %v", commonThumb, err)
@@ -158,8 +159,8 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
}
// 6. 检查 catalog 入库
for _, viewkey := range []string{"vk-001", "vk-002"} {
videoID := BuildVideoID(driveID, viewkey)
for _, sourceID := range []string{"120001", "120002"} {
videoID := BuildVideoID(driveID, sourceID)
v, err := cat.GetVideo(context.Background(), videoID)
if err != nil {
t.Fatalf("GetVideo %s: %v", videoID, err)
@@ -167,8 +168,8 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
if v.DriveID != driveID {
t.Fatalf("video %s drive_id = %q want %q", videoID, v.DriveID, driveID)
}
if v.FileID != viewkey+".mp4" {
t.Fatalf("video %s file_id = %q want %q", videoID, v.FileID, viewkey+".mp4")
if v.FileID != sourceID+".mp4" {
t.Fatalf("video %s file_id = %q want %q", videoID, v.FileID, sourceID+".mp4")
}
if v.ThumbnailURL == "" {
t.Fatalf("video %s ThumbnailURL empty (cover should be ready)", videoID)
@@ -189,7 +190,7 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
}
}
// 7. 第二次 RunOnceviewkey 已存在 → 全部 skipped,无新文件下载
// 7. 第二次 RunOnce源视频 ID 已存在 → 全部 skipped,无新文件下载
newVideos = nil
res2, err := c.RunOnce(context.Background(), 15)
if err != nil {
@@ -201,7 +202,7 @@ func TestCrawlerRunOnceFullFlow(t *testing.T) {
if res2.Skipped != 2 {
t.Fatalf("second run Skipped = %d, want 2", res2.Skipped)
}
// 第二次运行时 catalog 里已经有 2 条,seen snapshot 应该写出 2 个 viewkey
// 第二次运行时 catalog 里已经有 2 条,seen snapshot 应该写出 2 个源视频 ID
if res2.SeenSnapshot != 2 {
t.Fatalf("second run SeenSnapshot = %d, want 2", res2.SeenSnapshot)
}
@@ -249,10 +250,10 @@ func TestCrawlerThumbDownloadFailureMarksStatusFailed(t *testing.T) {
// 假 HTTP 服务器:thumb 路径返回 500,video 正常返回字节。
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case strings.Contains(r.URL.Path, "video.mp4"):
case strings.Contains(r.URL.Path, "120101.mp4"):
w.Header().Set("Content-Type", "video/mp4")
_, _ = w.Write([]byte("FAKEVIDEO"))
case strings.Contains(r.URL.Path, "thumb.jpg"):
case strings.Contains(r.URL.Path, "120101.jpg"):
http.Error(w, "broken", http.StatusInternalServerError)
default:
http.NotFound(w, r)
@@ -263,8 +264,8 @@ func TestCrawlerThumbDownloadFailureMarksStatusFailed(t *testing.T) {
videoEntries := []map[string]string{
{
"title": "Thumb Failure Video",
"thumb_url": srv.URL + "/thumbs/thumb.jpg",
"video_url": srv.URL + "/videos/video.mp4",
"thumb_url": srv.URL + "/thumb/120101.jpg",
"video_url": srv.URL + "/videos/120101.mp4",
"viewkey": "vk-thumb-fail",
"detail_url": srv.URL + "/v.php?viewkey=vk-thumb-fail",
},
@@ -306,7 +307,7 @@ func TestCrawlerThumbDownloadFailureMarksStatusFailed(t *testing.T) {
t.Fatalf("expected 1 new video, got %d (failed=%d)", res.NewVideos, res.Failed)
}
got, err := cat.GetVideo(context.Background(), "spider91-"+driveID+"-vk-thumb-fail")
got, err := cat.GetVideo(context.Background(), "spider91-"+driveID+"-120101")
if err != nil {
t.Fatalf("get video: %v", err)
}
@@ -327,6 +328,297 @@ func TestCrawlerThumbDownloadFailureMarksStatusFailed(t *testing.T) {
}
}
func TestCrawlerUsesCrawlerVideoURLForFirstDownload(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("shell-based fake script only on unix")
}
tmp := t.TempDir()
var detailRequests int32
var originalRequests int32
var wrongRequests int32
var srv *httptest.Server
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case r.URL.Path == "/v.php":
atomic.AddInt32(&detailRequests, 1)
_, _ = w.Write([]byte(spider91DetailHTML(srv.URL + "/videos/856305.mp4?token=wrong")))
case r.URL.Path == "/videos/120201.mp4" && r.URL.Query().Get("token") == "original":
atomic.AddInt32(&originalRequests, 1)
w.Header().Set("Content-Type", "video/mp4")
_, _ = w.Write([]byte("ORIGINALVIDEO"))
case r.URL.Path == "/videos/856305.mp4":
atomic.AddInt32(&wrongRequests, 1)
w.Header().Set("Content-Type", "video/mp4")
_, _ = w.Write([]byte("WRONGVIDEO"))
case r.URL.Path == "/thumb/120201.jpg":
w.Header().Set("Content-Type", "image/jpeg")
_, _ = w.Write([]byte("\xff\xd8\xff\xe0thumb"))
default:
http.NotFound(w, r)
}
}))
defer srv.Close()
entry := map[string]string{
"title": "Use Original URL First",
"thumb_url": srv.URL + "/thumb/wrong-thumb.jpg",
"video_url": srv.URL + "/videos/120201.mp4?token=original",
"viewkey": "vk-use-original",
"detail_url": srv.URL + "/v.php?viewkey=vk-use-original",
}
cat, drv, scriptPath := seedCrawlerTestDeps(t, tmp, "use-original-drive", []map[string]string{entry})
c := NewCrawler(CrawlerConfig{
Driver: drv,
Catalog: cat,
PythonPath: "sh",
ScriptPath: scriptPath,
CommonThumbDir: filepath.Join(tmp, "previews", "thumbs"),
SpiderTimeout: 10 * time.Second,
DownloadTimeout: 10 * time.Second,
})
res, err := c.RunOnce(context.Background(), 1)
if err != nil {
t.Fatalf("RunOnce: %v", err)
}
if res.NewVideos != 1 || res.Failed != 0 {
t.Fatalf("result new=%d failed=%d, want 1/0", res.NewVideos, res.Failed)
}
if got := atomic.LoadInt32(&detailRequests); got != 0 {
t.Fatalf("detail requests = %d, want 0 (first download should use crawler URL)", got)
}
if got := atomic.LoadInt32(&originalRequests); got != 1 {
t.Fatalf("original URL requests = %d, want 1", got)
}
if got := atomic.LoadInt32(&wrongRequests); got != 0 {
t.Fatalf("wrong source URL requests = %d, want 0", got)
}
info, err := os.Stat(filepath.Join(drv.RootDir(), "videos", "120201.mp4"))
if err != nil {
t.Fatalf("original video missing: %v", err)
}
if info.Size() != int64(len("ORIGINALVIDEO")) {
t.Fatalf("original video size = %d, want %d", info.Size(), len("ORIGINALVIDEO"))
}
}
func TestCrawlerRefreshesVideoURLAfterExpiredDownload(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("shell-based fake script only on unix")
}
tmp := t.TempDir()
var detailRequests int32
var staleRequests int32
var freshRequests int32
var srv *httptest.Server
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case r.URL.Path == "/v.php":
n := atomic.AddInt32(&detailRequests, 1)
videoURL := srv.URL + "/videos/120202.mp4?token=stale"
if n > 1 {
videoURL = srv.URL + "/videos/120202.mp4?token=fresh"
}
_, _ = w.Write([]byte(spider91DetailHTML(videoURL)))
case r.URL.Path == "/videos/120202.mp4" && r.URL.Query().Get("token") == "stale":
atomic.AddInt32(&staleRequests, 1)
http.Error(w, "expired", http.StatusForbidden)
case r.URL.Path == "/videos/120202.mp4" && r.URL.Query().Get("token") == "fresh":
atomic.AddInt32(&freshRequests, 1)
w.Header().Set("Content-Type", "video/mp4")
_, _ = w.Write([]byte("REFRESHEDVIDEO"))
case r.URL.Path == "/thumb/120202.jpg":
w.Header().Set("Content-Type", "image/jpeg")
_, _ = w.Write([]byte("\xff\xd8\xff\xe0thumb"))
default:
http.NotFound(w, r)
}
}))
defer srv.Close()
entry := map[string]string{
"title": "Refresh After Expired Download",
"thumb_url": srv.URL + "/thumb/wrong-thumb.jpg",
"video_url": srv.URL + "/videos/120202.mp4?token=old",
"viewkey": "vk-refresh-after",
"detail_url": srv.URL + "/v.php?viewkey=vk-refresh-after",
}
cat, drv, scriptPath := seedCrawlerTestDeps(t, tmp, "refresh-after-drive", []map[string]string{entry})
c := NewCrawler(CrawlerConfig{
Driver: drv,
Catalog: cat,
PythonPath: "sh",
ScriptPath: scriptPath,
CommonThumbDir: filepath.Join(tmp, "previews", "thumbs"),
SpiderTimeout: 10 * time.Second,
DownloadTimeout: 10 * time.Second,
})
res, err := c.RunOnce(context.Background(), 1)
if err != nil {
t.Fatalf("RunOnce: %v", err)
}
if res.NewVideos != 1 || res.Failed != 0 {
t.Fatalf("result new=%d failed=%d, want 1/0", res.NewVideos, res.Failed)
}
if got := atomic.LoadInt32(&detailRequests); got < 2 {
t.Fatalf("detail requests = %d, want at least 2 (initial refresh + retry refresh)", got)
}
if got := atomic.LoadInt32(&staleRequests); got != 1 {
t.Fatalf("stale URL requests = %d, want 1", got)
}
if got := atomic.LoadInt32(&freshRequests); got != 1 {
t.Fatalf("fresh URL requests = %d, want 1", got)
}
info, err := os.Stat(filepath.Join(drv.RootDir(), "videos", "120202.mp4"))
if err != nil {
t.Fatalf("refreshed video missing: %v", err)
}
if info.Size() != int64(len("REFRESHEDVIDEO")) {
t.Fatalf("refreshed video size = %d, want %d", info.Size(), len("REFRESHEDVIDEO"))
}
}
func TestCrawlerRejectsRefreshedSourceIDMismatch(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("shell-based fake script only on unix")
}
tmp := t.TempDir()
var srv *httptest.Server
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case r.URL.Path == "/v.php":
_, _ = w.Write([]byte(spider91DetailHTML(srv.URL + "/videos/856305.mp4?token=fresh")))
case r.URL.Path == "/videos/1203058.mp4":
http.Error(w, "expired", http.StatusForbidden)
case r.URL.Path == "/videos/856305.mp4":
w.Header().Set("Content-Type", "video/mp4")
_, _ = w.Write([]byte("WRONGVIDEO"))
default:
http.NotFound(w, r)
}
}))
defer srv.Close()
entry := map[string]string{
"title": "Source ID Mismatch",
"thumb_url": srv.URL + "/thumb/1203058.jpg",
"video_url": srv.URL + "/videos/1203058.mp4?token=old",
"viewkey": "86fd91cce1f2e1a154cc",
"source_id": "1203058",
"detail_url": srv.URL + "/v.php?viewkey=86fd91cce1f2e1a154cc",
}
cat, drv, scriptPath := seedCrawlerTestDeps(t, tmp, "mismatch-drive", []map[string]string{entry})
c := NewCrawler(CrawlerConfig{
Driver: drv,
Catalog: cat,
PythonPath: "sh",
ScriptPath: scriptPath,
CommonThumbDir: filepath.Join(tmp, "previews", "thumbs"),
SpiderTimeout: 10 * time.Second,
DownloadTimeout: 10 * time.Second,
})
res, err := c.RunOnce(context.Background(), 1)
if err != nil {
t.Fatalf("RunOnce: %v", err)
}
if res.NewVideos != 0 || res.Failed != 1 {
t.Fatalf("result new=%d failed=%d, want 0/1", res.NewVideos, res.Failed)
}
if _, err := os.Stat(filepath.Join(drv.RootDir(), "videos", "1203058.mp4")); !os.IsNotExist(err) {
t.Fatalf("mismatched source file should not be written, stat err=%v", err)
}
if v, _ := cat.GetVideo(context.Background(), BuildVideoID(drv.ID(), "1203058")); v != nil {
t.Fatalf("mismatched video should not be inserted: %+v", v)
}
}
func TestSourceIDForItemRequiresNumericSourceID(t *testing.T) {
if got := sourceIDForItem(spiderVideoEntry{
Viewkey: "86fd91cce1f2e1a154cc",
VideoURL: "https://cdn.example/videos/1203058.mp4?token=x",
}); got != "1203058" {
t.Fatalf("sourceIDForItem(video url) = %q, want 1203058", got)
}
if got := sourceIDForItem(spiderVideoEntry{
Viewkey: "86fd91cce1f2e1a154cc",
ThumbURL: "https://img.example/thumb/1203058.jpg",
}); got != "1203058" {
t.Fatalf("sourceIDForItem(thumb url) = %q, want 1203058", got)
}
if got := sourceIDForItem(spiderVideoEntry{
Viewkey: "86fd91cce1f2e1a154cc",
SourceID: "not-numeric",
VideoURL: "https://cdn.example/videos/video.mp4",
}); got != "" {
t.Fatalf("sourceIDForItem(non numeric) = %q, want empty", got)
}
}
func TestNormalizeThumbURLForSource(t *testing.T) {
got := normalizeThumbURLForSource("https://img.example/thumb/856305.jpg?x=1#frag", "1203058")
want := "https://img.example/thumb/1203058.jpg"
if got != want {
t.Fatalf("normalizeThumbURLForSource = %q, want %q", got, want)
}
}
func TestSpider91ListURLForDetail(t *testing.T) {
got := spider91ListURLForDetail("https://www.91porn.com/view_video.php?viewkey=abc&page=5&c=furum&viewtype=basic&category=top")
want := "https://www.91porn.com/v.php?category=top&page=5&viewtype=basic"
if got != want {
t.Fatalf("spider91ListURLForDetail = %q, want %q", got, want)
}
if got := spider91ListURLForDetail("http://127.0.0.1/v.php?viewkey=abc&page=5&viewtype=basic&category=top"); got != "" {
t.Fatalf("spider91ListURLForDetail(localhost) = %q, want empty", got)
}
}
func TestSpider91CookieHeader(t *testing.T) {
got := spider91CookieHeader([]*http.Cookie{
{Name: "CLIPSHARE", Value: "abc"},
{Name: "ga", Value: "def"},
{Name: "mode", Value: "m"},
})
want := "mode=d; CLIPSHARE=abc; ga=def"
if got != want {
t.Fatalf("spider91CookieHeader = %q, want %q", got, want)
}
}
func spider91DetailHTML(videoURL string) string {
fragment := `<video><source src="` + videoURL + `" type="video/mp4"></video>`
return `document.write(strencode2("` + url.PathEscape(fragment) + `"));`
}
func seedCrawlerTestDeps(t *testing.T, tmp, driveID string, entries []map[string]string) (*catalog.Catalog, *Driver, string) {
t.Helper()
scriptPath := filepath.Join(tmp, driveID+"-fake.sh")
if err := os.WriteFile(scriptPath, []byte(buildFakeSpiderScript(entries)), 0o755); err != nil {
t.Fatalf("write script: %v", err)
}
cat, err := catalog.Open(filepath.Join(tmp, driveID+".db"))
if err != nil {
t.Fatalf("catalog: %v", err)
}
t.Cleanup(func() {
if err := cat.Close(); err != nil {
t.Fatalf("close catalog: %v", err)
}
})
drv := New(Config{ID: driveID, RootDir: filepath.Join(tmp, "spider91", driveID)})
if err := cat.UpsertDrive(context.Background(), &catalog.Drive{
ID: driveID, Kind: Kind, Name: driveID,
}); err != nil {
t.Fatalf("upsert drive: %v", err)
}
return cat, drv, scriptPath
}
// buildFakeSpiderScript 生成一个伪 python 脚本(其实是 sh)。
//
// 行为:
+4 -4
View File
@@ -4,7 +4,7 @@
// 与其它 drive 不同的是:
// - 数据来源不是云盘 API,而是 Python 子进程跑 spider_91porn.py 后下载到本地
// - StreamURL 直接返回本地文件路径,由 api.handleSpider91Video 用 http.ServeFile 服务
// - List/Stat 用于 GC 兜底(按 viewkey 列出 videos/ 目录)
// - List/Stat 用于 GC 兜底(按本地文件名列出 videos/ 目录)
package spider91
import (
@@ -77,12 +77,12 @@ func (d *Driver) ThumbsDir() string { return filepath.Join(d.rootDir, "thumbs")
// RootDir 返回 driver 的存储根。
func (d *Driver) RootDir() string { return d.rootDir }
// VideoPath 返回某个 viewkey 对应的视频文件绝对路径,并校验路径不会逃出 videos/ 目录。
// VideoPath 返回某个视频文件绝对路径,并校验路径不会逃出 videos/ 目录。
func (d *Driver) VideoPath(fileID string) (string, error) {
return safeJoin(d.VideosDir(), fileID)
}
// ThumbPath 返回某个 viewkey 对应的封面文件绝对路径。
// ThumbPath 返回某个封面文件绝对路径。
func (d *Driver) ThumbPath(fileID string) (string, error) {
return safeJoin(d.ThumbsDir(), fileID)
}
@@ -117,7 +117,7 @@ func (d *Driver) List(ctx context.Context, dirID string) ([]drives.Entry, error)
return out, nil
}
// Stat 查询单个 viewkey 视频文件的元数据。
// Stat 查询单个视频文件的元数据。
func (d *Driver) Stat(ctx context.Context, fileID string) (*drives.Entry, error) {
path, err := d.VideoPath(fileID)
if err != nil {