Files
91/backend/internal/drives/spider91/crawler.go
T
2026-06-03 20:35:53 +08:00

1071 lines
33 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package spider91
import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"net"
"net/http"
"net/url"
"os"
"os/exec"
"path"
"path/filepath"
"regexp"
"strings"
"sync"
"time"
"github.com/video-site/backend/internal/catalog"
"github.com/video-site/backend/internal/mediaasset"
"golang.org/x/net/proxy"
)
// 默认 author/tag 标签,便于在前端筛选 spider91 来源的视频。
const DefaultAuthor = "91porn"
const DefaultTag = "91porn"
// DefaultTargetNew 是凌晨任务默认的"凑够这么多新视频"目标数。
const DefaultTargetNew = 15
// 视频下载、列表页请求的 UA 沿用爬虫脚本里那一套,避免触发 Cloudflare 风控。
const downloadUA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
// CrawlerConfig 是 Crawler 的依赖注入。
type CrawlerConfig struct {
// Driver 是已挂载的 spider91 drivercrawler 用它的 VideoPath / ThumbPath 写入文件。
Driver *Driver
// Catalog 用于查重和入库。
Catalog *catalog.Catalog
// PythonPath 是用来跑爬虫脚本的解释器,通常是 "python3"。
PythonPath string
// ScriptPath 是 spider_91porn.py 的绝对路径。
ScriptPath string
// WorkDir 是跑 Python 时的 cwd;为空表示沿用当前进程工作目录。
WorkDir string
// CommonThumbDir 是 backend 的 data/previews/thumbs 目录;
// crawler 会把封面再复制一份到 <CommonThumbDir>/<videoID>.jpg
// 让 /p/thumb/{videoID} 路由命中本地文件。
CommonThumbDir string
// HTTPClient 用于下载视频和封面;为空时使用内置默认 client。
HTTPClient *http.Client
// ProxyURL 可选的下载代理 URL(如 "http://127.0.0.1:7890")。
// 不为空则用它作为 HTTP/HTTPS 代理;为空则走 http.ProxyFromEnvironment(读 HTTPS_PROXY / HTTP_PROXY / NO_PROXY)。
// 91porn CDN 节点位于海外,国内服务器直连通常很慢,需要走代理。
ProxyURL string
// SpiderTimeout 限制单次爬虫脚本运行时间。
SpiderTimeout time.Duration
// DownloadTimeout 限制单条视频/封面下载的耗时。
DownloadTimeout time.Duration
// OnNewVideo 是新视频成功入库后的回调,用于触发预览视频 worker。
OnNewVideo func(v *catalog.Video)
}
// Crawler 把 Python 爬虫产出包装成 catalog 入库流程。
type Crawler struct {
cfg CrawlerConfig
// runMu 保证同一个 Crawler 实例不会并发跑两次。
runMu sync.Mutex
}
// NewCrawler 构造 Crawler。
func NewCrawler(cfg CrawlerConfig) *Crawler {
if cfg.SpiderTimeout <= 0 {
cfg.SpiderTimeout = 15 * time.Minute
}
if cfg.DownloadTimeout <= 0 {
cfg.DownloadTimeout = 30 * time.Minute
}
if cfg.HTTPClient == nil {
transport := &http.Transport{
Proxy: http.ProxyFromEnvironment,
ResponseHeaderTimeout: 60 * time.Second,
MaxIdleConns: 10,
IdleConnTimeout: 90 * time.Second,
}
if err := configureExplicitProxy(transport, cfg.ProxyURL); err != nil {
log.Printf("[spider91] invalid configured proxy URL, falling back to env: %v", err)
}
cfg.HTTPClient = &http.Client{
// 不限制总下载时长,靠 ctx 控制;只挡 dial / handshake / header
Timeout: 0,
Transport: transport,
}
}
return &Crawler{cfg: cfg}
}
func configureExplicitProxy(transport *http.Transport, raw string) error {
proxyURL := strings.TrimSpace(raw)
if proxyURL == "" {
return nil
}
u, err := url.Parse(proxyURL)
if err != nil || u.Scheme == "" || u.Host == "" {
return fmt.Errorf("invalid proxy URL")
}
switch strings.ToLower(u.Scheme) {
case "http", "https":
transport.Proxy = http.ProxyURL(u)
transport.DialContext = nil
return nil
case "socks5", "socks5h":
dialContext, err := socksProxyDialContext(u)
if err != nil {
return err
}
transport.Proxy = nil
transport.DialContext = dialContext
return nil
default:
return fmt.Errorf("unsupported proxy scheme %q", u.Scheme)
}
}
func socksProxyDialContext(proxyURL *url.URL) (func(context.Context, string, string) (net.Conn, error), error) {
var auth *proxy.Auth
if proxyURL.User != nil {
username := proxyURL.User.Username()
password, _ := proxyURL.User.Password()
auth = &proxy.Auth{User: username, Password: password}
}
dialer, err := proxy.SOCKS5("tcp", proxyURL.Host, auth, &net.Dialer{Timeout: 60 * time.Second})
if err != nil {
return nil, err
}
remoteDNS := strings.EqualFold(proxyURL.Scheme, "socks5h")
return func(ctx context.Context, network, addr string) (net.Conn, error) {
target := addr
if !remoteDNS {
resolved, err := resolveSocksTarget(ctx, addr)
if err != nil {
return nil, err
}
target = resolved
}
if ctxDialer, ok := dialer.(proxy.ContextDialer); ok {
return ctxDialer.DialContext(ctx, network, target)
}
type result struct {
conn net.Conn
err error
}
ch := make(chan result, 1)
go func() {
conn, err := dialer.Dial(network, target)
ch <- result{conn: conn, err: err}
}()
select {
case <-ctx.Done():
return nil, ctx.Err()
case res := <-ch:
return res.conn, res.err
}
}, nil
}
func resolveSocksTarget(ctx context.Context, addr string) (string, error) {
host, port, err := net.SplitHostPort(addr)
if err != nil || net.ParseIP(host) != nil {
return addr, nil
}
ips, err := net.DefaultResolver.LookupIPAddr(ctx, host)
if err != nil {
return "", err
}
ip := selectSocksTargetIP(ips)
if ip == nil {
return "", fmt.Errorf("resolve %s: no address", host)
}
return net.JoinHostPort(ip.String(), port), nil
}
func selectSocksTargetIP(ips []net.IPAddr) net.IP {
for _, addr := range ips {
if ip4 := addr.IP.To4(); ip4 != nil {
return ip4
}
}
for _, addr := range ips {
if addr.IP != nil {
return addr.IP
}
}
return nil
}
// CrawlResult 汇总一次 RunOnce 的结果。
type CrawlResult struct {
// TargetNew 是本次 RunOnce 的目标新增数(来自 drive.Credentials.target_new)。
TargetNew int
// TotalEntries 是 Python 输出 JSON 里的视频条数(已被 spider 端去重过的新视频)。
TotalEntries int
// NewVideos 是真正下载完并入库的新视频数。
NewVideos int
// Skipped 是 Go 侧二次校验时发现已存在的(理论上 Python 侧已经过滤过,正常情况下应为 0)。
Skipped int
// Failed 是下载或入库失败的条数。
Failed int
// SeenSnapshot 调用 Python 时实际写出的已知视频 ID 数量。
SeenSnapshot int
StartedAt time.Time
FinishedAt time.Time
OutputJSON string
SeenFile string
}
// spiderVideoEntry 对应 spider_91porn.py 输出 JSON 中的单条视频。
type spiderVideoEntry struct {
Title string `json:"title"`
ThumbURL string `json:"thumb_url"`
VideoURL string `json:"video_url"`
Viewkey string `json:"viewkey"`
SourceID string `json:"source_id"`
DetailURL string `json:"detail_url"`
}
// RunOnce 执行一次"跑爬虫 → 下载 → 入库"流程:
// 1. 从 catalog 拉取本 drive 已存在的 91 源视频 ID 列表,写到临时文件
// 2. 启动 Python 爬虫(--target-new + --seen-viewkeys-file + --stream-output),
// Python 每解析出一个 video 直链就把 entry 当作一行 JSON 写到 stdout。
// 3. Go 端 bufio.Scanner 按行读:每行立即下载视频和封面、入库。
// 这样 "Python 翻页找下一个" 与 "Go 下载当前一个" 在时间上重叠,缩短整轮耗时;
// 更重要的是不会让前几个下载耽误后面签名链接 e= 过期。
// 4. 全部消费完 + 子进程退出 → 返回 CrawlResult。预览视频不在此处入队,
// 由调用方 (App.runSpider91Crawl) 在 RunOnce 后统一调 enqueueDriveGeneration。
//
// targetNew <= 0 会被规范化成 spider91DefaultTargetNew15)。
func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, error) {
c.runMu.Lock()
defer c.runMu.Unlock()
if c.cfg.Driver == nil {
return nil, errors.New("spider91 crawler: driver not set")
}
if c.cfg.Catalog == nil {
return nil, errors.New("spider91 crawler: catalog not set")
}
if strings.TrimSpace(c.cfg.PythonPath) == "" || strings.TrimSpace(c.cfg.ScriptPath) == "" {
return nil, errors.New("spider91 crawler: python_path / script_path required")
}
if _, err := os.Stat(c.cfg.ScriptPath); err != nil {
return nil, fmt.Errorf("spider91 crawler: script not found: %w", err)
}
if targetNew <= 0 {
targetNew = DefaultTargetNew
}
if err := c.cfg.Driver.Init(ctx); err != nil {
return nil, fmt.Errorf("spider91 crawler: driver init: %w", err)
}
result := &CrawlResult{TargetNew: targetNew, StartedAt: time.Now()}
defer func() { result.FinishedAt = time.Now() }()
// 1. 准备 .crawl/ 目录 + 已知源视频 ID 列表
//
// 关键:路径必须用绝对路径,因为 Python 子进程的 cwd 我们设成了脚本所在目录
// (为了让 Python 用 site-packages 里的 requests 等),传相对路径会被 Python
// 当作相对它自己的 cwd 来解释,落在错的目录下,Go 这边再回头找又找不到。
rootDir, err := filepath.Abs(c.cfg.Driver.RootDir())
if err != nil {
return result, fmt.Errorf("spider91 crawler: abs root dir: %w", err)
}
crawlDir := filepath.Join(rootDir, ".crawl")
if err := os.MkdirAll(crawlDir, 0o755); err != nil {
return result, fmt.Errorf("spider91 crawler: mkdir crawl: %w", err)
}
timestamp := time.Now().UTC().Format("20060102T150405Z")
outputPath := filepath.Join(crawlDir, fmt.Sprintf("target-%d-%s.json", targetNew, timestamp))
seenPath := filepath.Join(crawlDir, fmt.Sprintf("seen-%s.txt", timestamp))
result.OutputJSON = outputPath
result.SeenFile = seenPath
seenCount, err := c.writeSeenViewkeys(ctx, seenPath)
if err != nil {
return result, fmt.Errorf("spider91 crawler: build seen list: %w", err)
}
result.SeenSnapshot = seenCount
// 2-3. 启动 Python 爬虫(流式 stdout 协议),并边读边处理。
//
// 协议:Python 每解析出一个 video 的直链就把 entry JSON 写到 stdout 一行,
// 立即 flush;本端 bufio.Scanner 收到一行就立即 processOne 下载视频和封面。
// 这样把 "Python 等所有视频解析完 + Go 顺序下载 N 个" 重叠成 "Python 翻页找下一个的同时
// Go 在下载当前一个",缩短总耗时;更重要的是把每条直链 e= 过期时间窗用满 ——
// 不会因为 Go 在下前面 7 个时让后面 8 个的签名超时。
cmd, stdout, err := c.startSpiderTargetNew(ctx, targetNew, seenPath, outputPath)
if err != nil {
return result, fmt.Errorf("spider91 crawler: spider start: %w", err)
}
scanner := bufio.NewScanner(stdout)
scanner.Buffer(make([]byte, 64*1024), 4*1024*1024) // 单条 entry 远小于 4 MB;保险加大上限
for scanner.Scan() {
if err := ctx.Err(); err != nil {
_ = cmd.Process.Kill()
break
}
line := strings.TrimSpace(scanner.Text())
if line == "" {
continue
}
var item spiderVideoEntry
if jerr := json.Unmarshal([]byte(line), &item); jerr != nil {
log.Printf("[spider91] drive=%s stdout parse: %v line=%q", c.cfg.Driver.ID(), jerr, line)
continue
}
result.TotalEntries++
sourceID := sourceIDForItem(item)
if sourceID == "" || strings.TrimSpace(item.VideoURL) == "" {
result.Failed++
continue
}
if result.NewVideos >= targetNew {
// Python 侧已用 target_new 控制;这里再兜底防止脚本异常多输出
break
}
videoID := buildVideoID(c.cfg.Driver.ID(), sourceID)
if existing, _ := c.cfg.Catalog.GetVideo(ctx, videoID); existing != nil {
result.Skipped++
continue
}
if perr := c.processOne(ctx, videoID, item); perr != nil {
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s failed: %v", c.cfg.Driver.ID(), item.Viewkey, sourceID, perr)
result.Failed++
continue
}
result.NewVideos++
}
if scerr := scanner.Err(); scerr != nil {
log.Printf("[spider91] drive=%s stdout scan: %v", c.cfg.Driver.ID(), scerr)
}
if werr := cmd.Wait(); werr != nil {
// 子进程被我们 Kill 是预期;其它错误(exit code != 0)记录日志但不当致命错误,
// 因为流式模式下 stdout 已读完,能拿到的视频已经处理。
if ctx.Err() == nil {
log.Printf("[spider91] drive=%s spider exit: %v", c.cfg.Driver.ID(), werr)
}
}
return result, nil
}
// writeSeenViewkeys 把当前 drive 下已入库的 91 源视频 ID 写到 path,供 Python 脚本读取。
//
// 注意:不能用 ListVideoFileIDsByDrive(按 drive_id 查),因为 spider91
// 视频被 spider91migrate 迁移到 PikPak 后 drive_id 已经不再是这个 drive。
// 改用 ListSpider91Viewkeys:它按 video.id 前缀("spider91-<driveID>-")查,
// 不受迁移影响。函数名保留历史叫法,实际返回的是 ID 后缀;新数据使用 mp4 源 ID。
func (c *Crawler) writeSeenViewkeys(ctx context.Context, path string) (int, error) {
seenIDs, err := c.cfg.Catalog.ListSpider91Viewkeys(ctx, c.cfg.Driver.ID())
if err != nil {
return 0, err
}
seen := make(map[string]struct{}, len(seenIDs))
for _, id := range seenIDs {
id = strings.TrimSpace(id)
if id == "" {
continue
}
seen[id] = struct{}{}
}
tmp := path + ".part"
f, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o644)
if err != nil {
return 0, err
}
for id := range seen {
if _, err := f.WriteString(id + "\n"); err != nil {
_ = f.Close()
_ = os.Remove(tmp)
return 0, err
}
}
if err := f.Close(); err != nil {
_ = os.Remove(tmp)
return 0, err
}
if err := os.Rename(tmp, path); err != nil {
_ = os.Remove(tmp)
return 0, err
}
return len(seen), nil
}
// runSpiderTargetNew 启动 Python 子进程(--target-new + --seen-viewkeys-file
// + --stream-output)。返回 cmd 和 stdout 的 reader;调用方按行 JSON 消费 stdout
// 每读到一行就立即 processOne,下完再读下一行。Python 的日志被引到 stderr
// 由本函数转发到 backend log,不影响 stdout 的 JSONL 协议。
//
// 使用方负责调 cmd.Wait(),并 close stdout reader。
func (c *Crawler) startSpiderTargetNew(ctx context.Context, targetNew int, seenPath, outputPath string) (*exec.Cmd, io.ReadCloser, error) {
args := []string{
c.cfg.ScriptPath,
"--target-new", fmt.Sprintf("%d", targetNew),
"--seen-viewkeys-file", seenPath,
"--output", outputPath,
"--no-resume",
"--quiet",
"--stream-output",
}
// 子进程的 ctx 走外层 ctx 即可,不再额外加 SpiderTimeout —— 流式模式下
// 单个视频的下载在 Go 端做超时控制(DownloadTimeout);爬虫脚本主要时间在
// 列表/详情页 + 网络等待,整轮上限通过外层 ctx 控制更准确。
cmd := exec.CommandContext(ctx, c.cfg.PythonPath, args...)
if c.cfg.WorkDir != "" {
cmd.Dir = c.cfg.WorkDir
}
if proxyURL := strings.TrimSpace(c.cfg.ProxyURL); proxyURL != "" {
cmd.Env = append(os.Environ(),
"HTTP_PROXY="+proxyURL,
"HTTPS_PROXY="+proxyURL,
"http_proxy="+proxyURL,
"https_proxy="+proxyURL,
"NO_PROXY=",
"no_proxy=",
)
}
stdout, err := cmd.StdoutPipe()
if err != nil {
return nil, nil, fmt.Errorf("stdout pipe: %w", err)
}
stderr, err := cmd.StderrPipe()
if err != nil {
_ = stdout.Close()
return nil, nil, fmt.Errorf("stderr pipe: %w", err)
}
log.Printf("[spider91] drive=%s exec %s --target-new=%d --seen=%s --output=%s",
c.cfg.Driver.ID(), c.cfg.ScriptPath, targetNew, seenPath, outputPath)
if err := cmd.Start(); err != nil {
_ = stdout.Close()
_ = stderr.Close()
return nil, nil, fmt.Errorf("start: %w", err)
}
// stderr 转发到 backend log。子进程退出时 reader 自动 EOFgoroutine 自然结束。
go forwardSpiderLog(c.cfg.Driver.ID(), stderr)
return cmd, stdout, nil
}
// forwardSpiderLog 把 Python stderr 逐行转发到 backend log,便于调试。
func forwardSpiderLog(driveID string, r io.Reader) {
scanner := bufio.NewScanner(r)
scanner.Buffer(make([]byte, 64*1024), 1024*1024)
for scanner.Scan() {
line := scanner.Text()
if strings.TrimSpace(line) == "" {
continue
}
log.Printf("[spider91:py] drive=%s %s", driveID, line)
}
}
// processOne 处理单个 91 源视频:下载视频 + 封面 + 复制封面 + 入库。
// 任一步失败会清理已写入的临时文件,不留半成品。
func (c *Crawler) processOne(ctx context.Context, videoID string, item spiderVideoEntry) error {
viewkey := item.Viewkey
sourceID := sourceIDForItem(item)
if sourceID == "" {
return errors.New("empty numeric source id")
}
videoURL := strings.TrimSpace(item.VideoURL)
videoSourceID := sourceIDFromVideoURL(videoURL)
if videoSourceID == "" {
return fmt.Errorf("video url has no numeric source id: %s", videoURL)
}
if videoSourceID != sourceID {
return fmt.Errorf("video source id mismatch: got %s want %s", videoSourceID, sourceID)
}
thumbURL := normalizeThumbURLForSource(item.ThumbURL, sourceID)
// 视频文件后缀按直链 URL 真实后缀来定,避免直链返回的不是 mp4 时存错容器。
videoExt := detectVideoExt(videoURL)
videoFile := sourceID + videoExt
// 封面后缀同理,但 91porn 的封面绝大多数是 jpg;URL 提示其它格式时尊重之。
thumbFile := sourceID + detectThumbExt(thumbURL)
videoPath, err := c.cfg.Driver.VideoPath(videoFile)
if err != nil {
return err
}
thumbPath, err := c.cfg.Driver.ThumbPath(thumbFile)
if err != nil {
return err
}
// 视频先下载(必须);失败直接退出。
videoSize, err := c.downloadVideoAtomicWithRefresh(ctx, item, videoPath, videoURL, sourceID)
if err != nil {
return fmt.Errorf("download video: %w", err)
}
// 封面下载失败不致命,视频本身仍入库;下方在 UpsertVideo 后会把
// thumbnail_status 显式标 'failed'spider91 drive 的 thumb worker 按设计
// 不处理 spider91 视频,没人能"兜底")。
thumbReady := false
if strings.TrimSpace(thumbURL) != "" {
thumbCtx, cancel := c.downloadAttemptContext(ctx)
_, err := c.downloadAtomic(thumbCtx, thumbURL, thumbPath, item.DetailURL)
cancel()
if err != nil {
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s thumb download failed: %v", c.cfg.Driver.ID(), viewkey, sourceID, err)
} else {
thumbReady = true
}
}
// 把封面复制到 backend 的标准 thumbs 目录,让 /p/thumb/{videoID} 直接命中。
if thumbReady && c.cfg.CommonThumbDir != "" {
if err := os.MkdirAll(c.cfg.CommonThumbDir, 0o755); err != nil {
log.Printf("[spider91] drive=%s mkdir common thumbs: %v", c.cfg.Driver.ID(), err)
thumbReady = false
} else {
dst := mediaasset.ThumbnailPathInDir(c.cfg.CommonThumbDir, videoID)
if err := copyFileAtomic(thumbPath, dst); err != nil {
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s copy thumb to common dir: %v", c.cfg.Driver.ID(), viewkey, sourceID, err)
thumbReady = false
}
}
}
title := strings.TrimSpace(item.Title)
if title == "" {
title = sourceID
}
tags := []string{DefaultTag}
if matched, err := c.cfg.Catalog.MatchTags(ctx, title+" "+DefaultAuthor); err == nil {
tags = mergeCatalogTags(tags, matched)
} else {
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s match tags: %v", c.cfg.Driver.ID(), viewkey, sourceID, err)
}
// 入库
now := time.Now()
v := &catalog.Video{
ID: videoID,
DriveID: c.cfg.Driver.ID(),
FileID: videoFile,
FileName: videoFile,
Title: title,
Author: DefaultAuthor,
Tags: tags,
Ext: strings.TrimPrefix(videoExt, "."),
Quality: "HD",
Size: videoSize,
PreviewStatus: "pending",
PublishedAt: now,
CreatedAt: now,
UpdatedAt: now,
}
if thumbReady {
// 设了 ThumbnailURL 后 thumb worker 会跳过这条视频,
// 不再尝试用 ffmpeg 抽帧(封面已经是网站原图)。
v.ThumbnailURL = "/p/thumb/" + v.ID
}
if err := c.cfg.Catalog.UpsertVideo(ctx, v); err != nil {
// 入库失败 → 把刚下载的文件清理掉,避免占盘且下次还要清
_ = os.Remove(videoPath)
_ = os.Remove(thumbPath)
return fmt.Errorf("upsert video: %w", err)
}
if !thumbReady {
// 网站封面下载失败的视频:spider91 drive 的 thumb worker 按设计不
// 处理 spider91 视频(封面应是网站原图直接保存),所以没人接手。
// 显式标 'failed' 让 CountVideosNeedingThumbnail 排除(条件 status
// != 'failed'),避免后续封面补队列一直重复捞到这条视频。
_ = c.cfg.Catalog.UpdateVideoMeta(ctx, v.ID, catalog.VideoMetaPatch{
ThumbnailStatus: "failed",
})
}
if c.cfg.OnNewVideo != nil {
c.cfg.OnNewVideo(v)
}
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s ok title=%q size=%d", c.cfg.Driver.ID(), viewkey, sourceID, v.Title, v.Size)
return nil
}
func (c *Crawler) downloadVideoAtomicWithRefresh(ctx context.Context, item spiderVideoEntry, dst, firstURL, expectedSourceID string) (int64, error) {
videoURL := strings.TrimSpace(firstURL)
if videoURL == "" {
videoURL = strings.TrimSpace(item.VideoURL)
}
var lastErr error
for attempt := 1; attempt <= 3; attempt++ {
attemptCtx, cancel := c.downloadAttemptContext(ctx)
size, err := c.downloadAtomic(attemptCtx, videoURL, dst, item.DetailURL)
cancel()
if err == nil {
return size, nil
}
lastErr = err
if ctx.Err() != nil || !shouldRefreshSpider91VideoURL(err) {
return 0, err
}
fresh, refreshErr := c.resolveFreshVideoURL(ctx, item)
if refreshErr != nil {
return 0, fmt.Errorf("%w; refresh video url: %v", err, refreshErr)
}
if fresh == "" || fresh == videoURL {
return 0, err
}
freshSourceID := sourceIDFromVideoURL(fresh)
if freshSourceID == "" {
return 0, fmt.Errorf("%w; refreshed video url has no numeric source id: %s", err, fresh)
}
if expectedSourceID != "" && freshSourceID != expectedSourceID {
return 0, fmt.Errorf("%w; refreshed video source id mismatch: got %s want %s", err, freshSourceID, expectedSourceID)
}
_ = os.Remove(dst + ".part")
log.Printf("[spider91] drive=%s viewkey=%s source_id=%s download attempt=%d failed (%v); refreshed video url and retrying",
c.cfg.Driver.ID(), item.Viewkey, expectedSourceID, attempt, err)
videoURL = fresh
}
return 0, lastErr
}
func (c *Crawler) downloadAttemptContext(ctx context.Context) (context.Context, context.CancelFunc) {
if c.cfg.DownloadTimeout <= 0 {
return ctx, func() {}
}
return context.WithTimeout(ctx, c.cfg.DownloadTimeout)
}
// downloadAtomic 下载 url 到 dst,先写到 dst.part 再 rename,避免半截文件。
// 返回最终文件大小。
func (c *Crawler) downloadAtomic(ctx context.Context, src, dst, referer string) (int64, error) {
if strings.TrimSpace(src) == "" {
return 0, errors.New("empty url")
}
if _, err := url.Parse(src); err != nil {
return 0, fmt.Errorf("parse url: %w", err)
}
if err := os.MkdirAll(filepath.Dir(dst), 0o755); err != nil {
return 0, err
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, src, nil)
if err != nil {
return 0, err
}
req.Header.Set("User-Agent", downloadUA)
if referer != "" {
req.Header.Set("Referer", referer)
}
resp, err := c.cfg.HTTPClient.Do(req)
if err != nil {
return 0, err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return 0, &downloadHTTPError{StatusCode: resp.StatusCode}
}
tmp := dst + ".part"
out, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o644)
if err != nil {
return 0, err
}
written, copyErr := io.Copy(out, resp.Body)
closeErr := out.Close()
if copyErr != nil {
_ = os.Remove(tmp)
return 0, copyErr
}
if closeErr != nil {
_ = os.Remove(tmp)
return 0, closeErr
}
if written <= 0 {
_ = os.Remove(tmp)
return 0, errors.New("empty body")
}
if err := os.Rename(tmp, dst); err != nil {
_ = os.Remove(tmp)
return 0, err
}
return written, nil
}
type downloadHTTPError struct {
StatusCode int
}
func (e *downloadHTTPError) Error() string {
if e == nil {
return "http error"
}
return fmt.Sprintf("http %d", e.StatusCode)
}
func shouldRefreshSpider91VideoURL(err error) bool {
if err == nil {
return false
}
if errors.Is(err, context.Canceled) {
return false
}
if errors.Is(err, context.DeadlineExceeded) {
return true
}
var httpErr *downloadHTTPError
if errors.As(err, &httpErr) {
switch httpErr.StatusCode {
case http.StatusForbidden, http.StatusNotFound, http.StatusGone, http.StatusRequestedRangeNotSatisfiable,
http.StatusTooManyRequests, http.StatusInternalServerError, http.StatusBadGateway, http.StatusServiceUnavailable, http.StatusGatewayTimeout:
return true
default:
return false
}
}
text := strings.ToLower(err.Error())
return strings.Contains(text, "unexpected eof") ||
strings.Contains(text, "connection reset") ||
strings.Contains(text, "connection refused") ||
strings.Contains(text, "broken pipe") ||
strings.Contains(text, "server closed") ||
strings.Contains(text, "timeout")
}
func (c *Crawler) resolveFreshVideoURL(ctx context.Context, item spiderVideoEntry) (string, error) {
detailURL := strings.TrimSpace(item.DetailURL)
if detailURL == "" {
return "", errors.New("empty detail url")
}
cookieHeader := "mode=d"
if warmURL := spider91ListURLForDetail(detailURL); warmURL != "" {
if cookies, err := c.fetchSpider91WarmCookies(ctx, warmURL, detailURL); err == nil {
cookieHeader = spider91CookieHeader(cookies)
} else {
log.Printf("[spider91] drive=%s viewkey=%s warm session failed: %v", c.cfg.Driver.ID(), item.Viewkey, err)
}
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, detailURL, nil)
if err != nil {
return "", err
}
req.Header.Set("User-Agent", downloadUA)
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9")
req.Header.Set("Cookie", cookieHeader)
resp, err := c.cfg.HTTPClient.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return "", &downloadHTTPError{StatusCode: resp.StatusCode}
}
body, err := io.ReadAll(io.LimitReader(resp.Body, 4*1024*1024))
if err != nil {
return "", err
}
videoURL := parseSpider91VideoURL(string(body))
if videoURL == "" {
return "", errors.New("video url not found in detail page")
}
return videoURL, nil
}
func (c *Crawler) fetchSpider91WarmCookies(ctx context.Context, warmURL, referer string) ([]*http.Cookie, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, warmURL, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", downloadUA)
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9")
req.Header.Set("Cookie", "mode=d")
if referer != "" {
req.Header.Set("Referer", referer)
}
resp, err := c.cfg.HTTPClient.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 64*1024))
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return nil, &downloadHTTPError{StatusCode: resp.StatusCode}
}
return resp.Cookies(), nil
}
func spider91ListURLForDetail(detailURL string) string {
u, err := url.Parse(strings.TrimSpace(detailURL))
if err != nil || u == nil || u.Scheme == "" || u.Host == "" {
return ""
}
if !strings.Contains(strings.ToLower(u.Host), "91porn.com") {
return ""
}
q := u.Query()
page := strings.TrimSpace(q.Get("page"))
category := strings.TrimSpace(q.Get("category"))
viewtype := strings.TrimSpace(q.Get("viewtype"))
if page == "" || category == "" || viewtype == "" {
return ""
}
listURL := *u
listURL.Path = "/v.php"
listQuery := url.Values{}
listQuery.Set("category", category)
listQuery.Set("viewtype", viewtype)
listQuery.Set("page", page)
listURL.RawQuery = listQuery.Encode()
listURL.Fragment = ""
return listURL.String()
}
func spider91CookieHeader(cookies []*http.Cookie) string {
values := []string{"mode=d"}
seen := map[string]bool{"mode": true}
for _, cookie := range cookies {
if cookie == nil || strings.TrimSpace(cookie.Name) == "" || seen[cookie.Name] {
continue
}
seen[cookie.Name] = true
values = append(values, cookie.Name+"="+cookie.Value)
}
return strings.Join(values, "; ")
}
var (
strencode2RE = regexp.MustCompile(`strencode2\(["']([^"']+)["']\)`)
srcAttrRE = regexp.MustCompile(`src=['"]([^'"]+)['"]`)
mp4URLRE = regexp.MustCompile(`https?://[^\s"'<>]+\.mp4[^\s"'<>]*`)
)
func parseSpider91VideoURL(html string) string {
if html == "" {
return ""
}
if match := strencode2RE.FindStringSubmatch(html); len(match) == 2 {
if decoded, err := url.PathUnescape(match[1]); err == nil {
if src := srcAttrRE.FindStringSubmatch(decoded); len(src) == 2 {
return normalizeHTTPURLSlashes(src[1])
}
}
}
if match := mp4URLRE.FindString(html); match != "" {
lower := strings.ToLower(match)
if !strings.Contains(lower, "kwai") && !strings.Contains(lower, "ad-") {
return match
}
}
return ""
}
func normalizeHTTPURLSlashes(rawURL string) string {
u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u == nil || u.Scheme == "" || u.Host == "" {
return rawURL
}
for strings.Contains(u.Path, "//") {
u.Path = strings.ReplaceAll(u.Path, "//", "/")
}
return u.String()
}
func sourceIDForItem(item spiderVideoEntry) string {
if id := sanitizeSourceID(item.SourceID); isNumericSourceID(id) {
return id
}
if id := sourceIDFromVideoURL(item.VideoURL); id != "" {
return id
}
if id := sourceIDFromThumbURL(item.ThumbURL); id != "" {
return id
}
return ""
}
func sourceIDFromVideoURL(rawURL string) string {
u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u == nil {
return ""
}
base := path.Base(u.Path)
ext := strings.ToLower(path.Ext(base))
switch ext {
case ".mp4", ".m4v", ".mov", ".webm", ".mkv", ".avi", ".flv":
default:
return ""
}
id := sanitizeSourceID(strings.TrimSuffix(base, ext))
if !isNumericSourceID(id) {
return ""
}
return id
}
func sourceIDFromThumbURL(rawURL string) string {
u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u == nil {
return ""
}
base := path.Base(u.Path)
ext := strings.ToLower(path.Ext(base))
switch ext {
case ".jpg", ".jpeg", ".png", ".webp", ".gif":
default:
return ""
}
id := sanitizeSourceID(strings.TrimSuffix(base, ext))
if !isNumericSourceID(id) {
return ""
}
return id
}
func sanitizeSourceID(raw string) string {
raw = strings.TrimSpace(raw)
if raw == "" {
return ""
}
var b strings.Builder
for _, r := range raw {
if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '_' || r == '-' {
b.WriteRune(r)
}
}
return b.String()
}
func isNumericSourceID(id string) bool {
if id == "" {
return false
}
for _, r := range id {
if r < '0' || r > '9' {
return false
}
}
return true
}
func normalizeThumbURLForSource(rawURL, sourceID string) string {
sourceID = sanitizeSourceID(sourceID)
if strings.TrimSpace(rawURL) == "" || sourceID == "" {
return rawURL
}
u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u == nil || u.Scheme == "" || u.Host == "" {
return rawURL
}
base := path.Base(u.Path)
ext := strings.ToLower(path.Ext(base))
switch ext {
case ".jpg", ".jpeg", ".png", ".webp", ".gif":
default:
return rawURL
}
dir := path.Dir(u.Path)
if dir == "." || dir == "/" || !strings.HasSuffix(dir, "/thumb") {
return rawURL
}
u.Path = path.Join(dir, sourceID+".jpg")
u.RawQuery = ""
u.Fragment = ""
return u.String()
}
// copyFileAtomic 把 src 复制到 dst,先写 .part 再 rename。
func copyFileAtomic(src, dst string) error {
in, err := os.Open(src)
if err != nil {
return err
}
defer in.Close()
if err := os.MkdirAll(filepath.Dir(dst), 0o755); err != nil {
return err
}
tmp := dst + ".part"
out, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o644)
if err != nil {
return err
}
if _, err := io.Copy(out, in); err != nil {
_ = out.Close()
_ = os.Remove(tmp)
return err
}
if err := out.Close(); err != nil {
_ = os.Remove(tmp)
return err
}
return os.Rename(tmp, dst)
}
func mergeCatalogTags(lists ...[]string) []string {
out := []string{}
seen := map[string]bool{}
for _, list := range lists {
for _, tag := range list {
tag = strings.TrimSpace(tag)
if tag == "" {
continue
}
key := strings.ToLower(tag)
if seen[key] {
continue
}
seen[key] = true
out = append(out, tag)
}
}
return out
}
// BuildVideoID 给定 driveID + 91 源视频 ID,按统一规则生成 catalog 中 videos.id。
// 与 scanner 用法一致:<kind>-<driveID>-<fileID>。
func BuildVideoID(driveID, sourceID string) string {
return buildVideoID(driveID, sourceID)
}
func buildVideoID(driveID, sourceID string) string {
return Kind + "-" + driveID + "-" + sourceID
}
// detectVideoExt 从直链 URL 推断视频文件后缀。
//
// 91porn 直链路径形如 https://.../mp43/xxxx.mp4?st=...path.Ext("xxxx.mp4") = ".mp4"。
// 但任何爬虫都可能拿到 .flv / .m3u8 / 没扩展名等情况;这里维护一个白名单:
// - .mp4 / .webm / .mkv / .mov / .m4v / .flv / .avi → 直接用
// - .m3u8 / .ts → 是流媒体清单,不能直接当单文件视频保存,回退到 .mp4,让上层察觉到下载结果异常
// - 其它 → .mp4 兜底
func detectVideoExt(rawURL string) string {
u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u == nil {
return ".mp4"
}
base := path.Base(u.Path)
ext := strings.ToLower(path.Ext(base))
switch ext {
case ".mp4", ".webm", ".mkv", ".mov", ".m4v", ".flv", ".avi":
return ext
}
return ".mp4"
}
// detectThumbExt 从封面 URL 推断后缀。默认 .jpg。
func detectThumbExt(rawURL string) string {
u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u == nil {
return ".jpg"
}
base := path.Base(u.Path)
ext := strings.ToLower(path.Ext(base))
switch ext {
case ".jpg", ".jpeg", ".png", ".webp", ".gif":
return ext
}
return ".jpg"
}