mirror of
https://github.com/nianzhibai/91.git
synced 2026-06-15 00:44:30 +08:00
c1355385e1
Redesign crawler management around imported Python scripts instead of built-in crawler storage. Crawler scripts now declare CRAWLER_NAME, imports validate metadata, crawler IDs are generated internally, and deleted crawler scripts are detached without deleting already imported videos. Add backend support for file and URL script imports, dry-run testing, metadata parsing, safer job paths, original filename preservation, and crawler listing that ignores detached script records. Remove the legacy built-in Spider91 script path flow and hidden Python/config JSON fields from the crawler API. Rework the admin crawler page into an independent crawler console with script import, dry-run testing, status metrics, spider iconography, and simplified controls. Update docs, examples, installer checks, Docker/release packaging, and tests for the new protocol.
1037 lines
29 KiB
Go
1037 lines
29 KiB
Go
package scriptcrawler
|
|
|
|
import (
|
|
"bufio"
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net"
|
|
"net/http"
|
|
"net/url"
|
|
"os"
|
|
"os/exec"
|
|
"path"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/video-site/backend/internal/catalog"
|
|
"github.com/video-site/backend/internal/mediaasset"
|
|
"golang.org/x/net/proxy"
|
|
)
|
|
|
|
// Package-wide defaults.
const (
	// DefaultTargetNew is the number of new videos RunOnce aims for when the
	// caller passes a non-positive target.
	DefaultTargetNew = 10

	// defaultUserAgent is the User-Agent header set on media download requests.
	defaultUserAgent = "Mozilla/5.0 (compatible; video-site-91-scriptcrawler/1.0)"
)
|
|
|
|
type CrawlerConfig struct {
|
|
Driver *Driver
|
|
Catalog *catalog.Catalog
|
|
SourceKind string
|
|
PythonPath string
|
|
ScriptPath string
|
|
WorkDir string
|
|
CommonThumbDir string
|
|
ProxyURL string
|
|
ConfigJSON string
|
|
HTTPClient *http.Client
|
|
DownloadTimeout time.Duration
|
|
OnProgress func(CrawlProgress)
|
|
}
|
|
|
|
type Crawler struct {
|
|
cfg CrawlerConfig
|
|
runMu sync.Mutex
|
|
}
|
|
|
|
func NewCrawler(cfg CrawlerConfig) *Crawler {
|
|
if strings.TrimSpace(cfg.PythonPath) == "" {
|
|
cfg.PythonPath = "python3"
|
|
}
|
|
if cfg.DownloadTimeout <= 0 {
|
|
cfg.DownloadTimeout = 30 * time.Minute
|
|
}
|
|
if cfg.HTTPClient == nil {
|
|
transport := &http.Transport{
|
|
Proxy: http.ProxyFromEnvironment,
|
|
ResponseHeaderTimeout: 60 * time.Second,
|
|
MaxIdleConns: 10,
|
|
IdleConnTimeout: 90 * time.Second,
|
|
}
|
|
if err := configureExplicitProxy(transport, cfg.ProxyURL); err != nil {
|
|
log.Printf("[scriptcrawler] invalid configured proxy URL, falling back to env: %v", err)
|
|
}
|
|
cfg.HTTPClient = &http.Client{Transport: transport}
|
|
}
|
|
return &Crawler{cfg: cfg}
|
|
}
|
|
|
|
// CrawlResult summarizes one RunOnce invocation.
type CrawlResult struct {
	// TargetNew is the effective target of new videos. TotalEntries counts item
	// events handled; NewVideos, Skipped and Failed split those by outcome.
	// SeenSnapshot is the number of already-known source IDs handed to the script.
	TargetNew, TotalEntries, NewVideos, Skipped, Failed, SeenSnapshot int

	// StartedAt and FinishedAt bracket the run.
	StartedAt, FinishedAt time.Time

	// JobFile and SeenFile are the paths of the job description and of the
	// seen-ID list written for the script.
	JobFile, SeenFile string
}
|
|
|
|
// CrawlProgress is the snapshot passed to CrawlerConfig.OnProgress during a run.
type CrawlProgress struct {
	// These mirror the CrawlResult counters at the moment of the callback.
	TargetNew, TotalEntries, NewVideos, Skipped, Failed, SeenSnapshot int

	// Checked and Emitted are taken from the script's progress events; Emitted
	// is also incremented for every item event received.
	Checked, Emitted int

	// Message is the text of the most recent progress or done event.
	Message string
}
|
|
|
|
type Job struct {
|
|
Protocol string `json:"protocol"`
|
|
Mode string `json:"mode"`
|
|
RunID string `json:"run_id"`
|
|
CrawlerID string `json:"crawler_id"`
|
|
TargetNew int `json:"target_new"`
|
|
SeenSourceIDsFile string `json:"seen_source_ids_file"`
|
|
OutputDir string `json:"output_dir"`
|
|
Config json.RawMessage `json:"config"`
|
|
Network JobNetwork `json:"network"`
|
|
}
|
|
|
|
// JobNetwork is the network section of a Job.
type JobNetwork struct {
	// ProxyURL is the configured proxy, omitted from the JSON when empty.
	ProxyURL string `json:"proxy_url,omitempty"`
}
|
|
|
|
type Event struct {
|
|
Type string `json:"type"`
|
|
Item Item `json:"item"`
|
|
SourceID string `json:"source_id,omitempty"`
|
|
Title string `json:"title,omitempty"`
|
|
MediaURL string `json:"media_url,omitempty"`
|
|
MediaLocalFile string `json:"media_local_file,omitempty"`
|
|
ThumbnailURL string `json:"thumbnail_url,omitempty"`
|
|
ThumbnailLocalFile string `json:"thumbnail_local_file,omitempty"`
|
|
DetailURL string `json:"detail_url,omitempty"`
|
|
Author string `json:"author,omitempty"`
|
|
Tags []string `json:"tags,omitempty"`
|
|
Category string `json:"category,omitempty"`
|
|
Quality string `json:"quality,omitempty"`
|
|
DurationSeconds int `json:"duration_seconds,omitempty"`
|
|
Description string `json:"description,omitempty"`
|
|
PublishedAt string `json:"published_at,omitempty"`
|
|
Headers map[string]string `json:"headers,omitempty"`
|
|
MediaHeaders map[string]string `json:"media_headers,omitempty"`
|
|
ThumbnailHeaders map[string]string `json:"thumbnail_headers,omitempty"`
|
|
Checked int `json:"checked,omitempty"`
|
|
Emitted int `json:"emitted,omitempty"`
|
|
Message string `json:"message,omitempty"`
|
|
Stats json.RawMessage `json:"stats,omitempty"`
|
|
}
|
|
|
|
type Item struct {
|
|
SourceID string `json:"source_id,omitempty"`
|
|
Title string `json:"title"`
|
|
MediaURL string `json:"media_url,omitempty"`
|
|
MediaLocalFile string `json:"media_local_file,omitempty"`
|
|
ThumbnailURL string `json:"thumbnail_url,omitempty"`
|
|
ThumbnailLocalFile string `json:"thumbnail_local_file,omitempty"`
|
|
DetailURL string `json:"detail_url,omitempty"`
|
|
Author string `json:"author,omitempty"`
|
|
Tags []string `json:"tags,omitempty"`
|
|
Category string `json:"category,omitempty"`
|
|
Quality string `json:"quality,omitempty"`
|
|
DurationSeconds int `json:"duration_seconds,omitempty"`
|
|
Description string `json:"description,omitempty"`
|
|
PublishedAt string `json:"published_at,omitempty"`
|
|
Headers map[string]string `json:"headers,omitempty"`
|
|
MediaHeaders map[string]string `json:"media_headers,omitempty"`
|
|
ThumbnailHeaders map[string]string `json:"thumbnail_headers,omitempty"`
|
|
Media MediaRef `json:"media,omitempty"`
|
|
Thumbnail MediaRef `json:"thumbnail,omitempty"`
|
|
}
|
|
|
|
// MediaRef points at one media asset: a file already on disk (LocalFile), a
// remote URL to download, and the extra request headers for that download.
type MediaRef struct {
	URL       string            `json:"url,omitempty"`
	LocalFile string            `json:"local_file,omitempty"`
	Headers   map[string]string `json:"headers,omitempty"`
}
|
|
|
|
func (e Event) normalizedItem() Item {
|
|
item := e.Item
|
|
if strings.TrimSpace(item.SourceID) == "" {
|
|
item.SourceID = e.SourceID
|
|
}
|
|
if strings.TrimSpace(item.Title) == "" {
|
|
item.Title = e.Title
|
|
}
|
|
if strings.TrimSpace(item.MediaURL) == "" {
|
|
item.MediaURL = e.MediaURL
|
|
}
|
|
if strings.TrimSpace(item.MediaLocalFile) == "" {
|
|
item.MediaLocalFile = e.MediaLocalFile
|
|
}
|
|
if strings.TrimSpace(item.ThumbnailURL) == "" {
|
|
item.ThumbnailURL = e.ThumbnailURL
|
|
}
|
|
if strings.TrimSpace(item.ThumbnailLocalFile) == "" {
|
|
item.ThumbnailLocalFile = e.ThumbnailLocalFile
|
|
}
|
|
if strings.TrimSpace(item.DetailURL) == "" {
|
|
item.DetailURL = e.DetailURL
|
|
}
|
|
if strings.TrimSpace(item.Author) == "" {
|
|
item.Author = e.Author
|
|
}
|
|
if len(item.Tags) == 0 && len(e.Tags) > 0 {
|
|
item.Tags = e.Tags
|
|
}
|
|
if strings.TrimSpace(item.Category) == "" {
|
|
item.Category = e.Category
|
|
}
|
|
if strings.TrimSpace(item.Quality) == "" {
|
|
item.Quality = e.Quality
|
|
}
|
|
if item.DurationSeconds == 0 {
|
|
item.DurationSeconds = e.DurationSeconds
|
|
}
|
|
if strings.TrimSpace(item.Description) == "" {
|
|
item.Description = e.Description
|
|
}
|
|
if strings.TrimSpace(item.PublishedAt) == "" {
|
|
item.PublishedAt = e.PublishedAt
|
|
}
|
|
if len(item.Headers) == 0 && len(e.Headers) > 0 {
|
|
item.Headers = e.Headers
|
|
}
|
|
if len(item.MediaHeaders) == 0 && len(e.MediaHeaders) > 0 {
|
|
item.MediaHeaders = e.MediaHeaders
|
|
}
|
|
if len(item.ThumbnailHeaders) == 0 && len(e.ThumbnailHeaders) > 0 {
|
|
item.ThumbnailHeaders = e.ThumbnailHeaders
|
|
}
|
|
return item
|
|
}
|
|
|
|
func (item Item) hasPayload() bool {
|
|
return strings.TrimSpace(item.Title) != "" ||
|
|
strings.TrimSpace(item.SourceID) != "" ||
|
|
strings.TrimSpace(item.MediaURL) != "" ||
|
|
strings.TrimSpace(item.MediaLocalFile) != "" ||
|
|
strings.TrimSpace(item.Media.URL) != "" ||
|
|
strings.TrimSpace(item.Media.LocalFile) != ""
|
|
}
|
|
|
|
func (c *Crawler) RunOnce(ctx context.Context, targetNew int) (*CrawlResult, error) {
|
|
c.runMu.Lock()
|
|
defer c.runMu.Unlock()
|
|
|
|
if c.cfg.Driver == nil {
|
|
return nil, errors.New("scriptcrawler: driver not set")
|
|
}
|
|
if c.cfg.Catalog == nil {
|
|
return nil, errors.New("scriptcrawler: catalog not set")
|
|
}
|
|
if strings.TrimSpace(c.cfg.ScriptPath) == "" {
|
|
return nil, errors.New("scriptcrawler: script_path is required")
|
|
}
|
|
if _, err := os.Stat(c.cfg.ScriptPath); err != nil {
|
|
return nil, fmt.Errorf("scriptcrawler: script not found: %w", err)
|
|
}
|
|
if targetNew <= 0 {
|
|
targetNew = DefaultTargetNew
|
|
}
|
|
if err := c.cfg.Driver.Init(ctx); err != nil {
|
|
return nil, fmt.Errorf("scriptcrawler: driver init: %w", err)
|
|
}
|
|
|
|
result := &CrawlResult{TargetNew: targetNew, StartedAt: time.Now()}
|
|
defer func() { result.FinishedAt = time.Now() }()
|
|
emit := func(p CrawlProgress) {
|
|
if c.cfg.OnProgress == nil {
|
|
return
|
|
}
|
|
p.TargetNew = result.TargetNew
|
|
p.TotalEntries = result.TotalEntries
|
|
p.NewVideos = result.NewVideos
|
|
p.Skipped = result.Skipped
|
|
p.Failed = result.Failed
|
|
p.SeenSnapshot = result.SeenSnapshot
|
|
c.cfg.OnProgress(p)
|
|
}
|
|
emit(CrawlProgress{})
|
|
|
|
crawlDir, err := filepath.Abs(c.cfg.Driver.CrawlDir())
|
|
if err != nil {
|
|
return result, fmt.Errorf("scriptcrawler: resolve crawl dir: %w", err)
|
|
}
|
|
if err := os.MkdirAll(crawlDir, 0o755); err != nil {
|
|
return result, err
|
|
}
|
|
runID := time.Now().UTC().Format("20060102T150405Z")
|
|
seenPath := filepath.Join(crawlDir, "seen-"+runID+".txt")
|
|
jobPath := filepath.Join(crawlDir, "job-"+runID+".json")
|
|
result.SeenFile = seenPath
|
|
result.JobFile = jobPath
|
|
|
|
seenCount, err := c.writeSeenSourceIDs(ctx, seenPath)
|
|
if err != nil {
|
|
return result, fmt.Errorf("scriptcrawler: build seen list: %w", err)
|
|
}
|
|
result.SeenSnapshot = seenCount
|
|
emit(CrawlProgress{})
|
|
|
|
if err := c.writeJobFile(jobPath, runID, targetNew, seenPath); err != nil {
|
|
return result, fmt.Errorf("scriptcrawler: write job: %w", err)
|
|
}
|
|
|
|
cmd, stdout, err := c.startScript(ctx, jobPath, targetNew)
|
|
if err != nil {
|
|
return result, fmt.Errorf("scriptcrawler: start: %w", err)
|
|
}
|
|
scanner := bufio.NewScanner(stdout)
|
|
scanner.Buffer(make([]byte, 64*1024), 4*1024*1024)
|
|
progress := CrawlProgress{}
|
|
for scanner.Scan() {
|
|
if err := ctx.Err(); err != nil {
|
|
_ = cmd.Process.Kill()
|
|
break
|
|
}
|
|
line := strings.TrimSpace(scanner.Text())
|
|
if line == "" {
|
|
continue
|
|
}
|
|
var event Event
|
|
if err := json.Unmarshal([]byte(line), &event); err != nil {
|
|
log.Printf("[scriptcrawler] drive=%s stdout parse: %v line=%q", c.cfg.Driver.ID(), err, line)
|
|
continue
|
|
}
|
|
eventType := strings.ToLower(strings.TrimSpace(event.Type))
|
|
item := event.normalizedItem()
|
|
if eventType == "" && item.hasPayload() {
|
|
eventType = "item"
|
|
}
|
|
switch eventType {
|
|
case "item":
|
|
result.TotalEntries++
|
|
progress.Emitted++
|
|
emit(progress)
|
|
if result.NewVideos >= targetNew {
|
|
_ = cmd.Process.Kill()
|
|
break
|
|
}
|
|
added, err := c.processItem(ctx, item)
|
|
if err != nil {
|
|
log.Printf("[scriptcrawler] drive=%s item failed source_id=%q title=%q: %v", c.cfg.Driver.ID(), item.SourceID, item.Title, err)
|
|
result.Failed++
|
|
} else if added {
|
|
result.NewVideos++
|
|
} else {
|
|
result.Skipped++
|
|
}
|
|
emit(progress)
|
|
if result.NewVideos >= targetNew {
|
|
_ = cmd.Process.Kill()
|
|
break
|
|
}
|
|
case "progress":
|
|
if event.Checked > 0 {
|
|
progress.Checked = event.Checked
|
|
}
|
|
if event.Emitted > 0 {
|
|
progress.Emitted = event.Emitted
|
|
}
|
|
progress.Message = event.Message
|
|
emit(progress)
|
|
case "done":
|
|
progress.Message = event.Message
|
|
emit(progress)
|
|
case "":
|
|
log.Printf("[scriptcrawler] drive=%s missing event type line=%q", c.cfg.Driver.ID(), line)
|
|
default:
|
|
log.Printf("[scriptcrawler] drive=%s unknown event type=%q", c.cfg.Driver.ID(), event.Type)
|
|
}
|
|
}
|
|
if err := scanner.Err(); err != nil {
|
|
log.Printf("[scriptcrawler] drive=%s stdout scan: %v", c.cfg.Driver.ID(), err)
|
|
}
|
|
if err := cmd.Wait(); err != nil && ctx.Err() == nil && result.NewVideos < targetNew {
|
|
log.Printf("[scriptcrawler] drive=%s script exit: %v", c.cfg.Driver.ID(), err)
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func (c *Crawler) writeSeenSourceIDs(ctx context.Context, path string) (int, error) {
|
|
seenIDs, err := c.cfg.Catalog.ListCrawlerSourceIDs(ctx, c.sourceKind(), c.cfg.Driver.ID())
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
seen := make(map[string]struct{}, len(seenIDs))
|
|
for _, id := range seenIDs {
|
|
id = strings.TrimSpace(id)
|
|
if id != "" {
|
|
seen[id] = struct{}{}
|
|
}
|
|
}
|
|
tmp := path + ".part"
|
|
f, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o644)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
for id := range seen {
|
|
if _, err := f.WriteString(id + "\n"); err != nil {
|
|
_ = f.Close()
|
|
_ = os.Remove(tmp)
|
|
return 0, err
|
|
}
|
|
}
|
|
if err := f.Close(); err != nil {
|
|
_ = os.Remove(tmp)
|
|
return 0, err
|
|
}
|
|
if err := os.Rename(tmp, path); err != nil {
|
|
_ = os.Remove(tmp)
|
|
return 0, err
|
|
}
|
|
return len(seen), nil
|
|
}
|
|
|
|
func (c *Crawler) writeJobFile(path, runID string, targetNew int, seenPath string) error {
|
|
cfg := json.RawMessage([]byte("{}"))
|
|
if raw := strings.TrimSpace(c.cfg.ConfigJSON); raw != "" {
|
|
if !json.Valid([]byte(raw)) {
|
|
return errors.New("config_json must be valid JSON")
|
|
}
|
|
cfg = json.RawMessage(raw)
|
|
}
|
|
outputDir, err := filepath.Abs(c.cfg.Driver.OutputDir())
|
|
if err != nil {
|
|
return fmt.Errorf("resolve output dir: %w", err)
|
|
}
|
|
job := Job{
|
|
Protocol: "crawler.v1",
|
|
Mode: "crawl",
|
|
RunID: runID,
|
|
CrawlerID: c.cfg.Driver.ID(),
|
|
TargetNew: targetNew,
|
|
SeenSourceIDsFile: seenPath,
|
|
OutputDir: outputDir,
|
|
Config: cfg,
|
|
Network: JobNetwork{ProxyURL: strings.TrimSpace(c.cfg.ProxyURL)},
|
|
}
|
|
data, err := json.MarshalIndent(job, "", " ")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
tmp := path + ".part"
|
|
if err := os.WriteFile(tmp, data, 0o600); err != nil {
|
|
return err
|
|
}
|
|
return os.Rename(tmp, path)
|
|
}
|
|
|
|
func (c *Crawler) startScript(ctx context.Context, jobPath string, targetNew int) (*exec.Cmd, io.ReadCloser, error) {
|
|
cmd := exec.CommandContext(ctx, c.cfg.PythonPath, c.cfg.ScriptPath, "--job", jobPath)
|
|
if strings.TrimSpace(c.cfg.WorkDir) != "" {
|
|
cmd.Dir = c.cfg.WorkDir
|
|
}
|
|
if proxyURL := strings.TrimSpace(c.cfg.ProxyURL); proxyURL != "" {
|
|
cmd.Env = append(os.Environ(),
|
|
"HTTP_PROXY="+proxyURL,
|
|
"HTTPS_PROXY="+proxyURL,
|
|
"http_proxy="+proxyURL,
|
|
"https_proxy="+proxyURL,
|
|
"NO_PROXY=",
|
|
"no_proxy=",
|
|
)
|
|
}
|
|
stdout, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
stderr, err := cmd.StderrPipe()
|
|
if err != nil {
|
|
_ = stdout.Close()
|
|
return nil, nil, err
|
|
}
|
|
log.Printf("[scriptcrawler] drive=%s exec %s --job=%s target_new=%d", c.cfg.Driver.ID(), c.cfg.ScriptPath, jobPath, targetNew)
|
|
if err := cmd.Start(); err != nil {
|
|
_ = stdout.Close()
|
|
_ = stderr.Close()
|
|
return nil, nil, err
|
|
}
|
|
go forwardScriptLog(c.cfg.Driver.ID(), stderr)
|
|
return cmd, stdout, nil
|
|
}
|
|
|
|
// forwardScriptLog copies r to the process log line by line, prefixing each
// non-blank line with the drive ID. It returns when r is exhausted or fails.
//
// Lines are limited to 1 MiB. If a longer line is encountered, scanning cannot
// continue, so the condition is logged and the rest of r is read and discarded.
// Draining matters when r is a script's stderr pipe: if nobody reads it, the
// script can block on a full pipe and never finish.
func forwardScriptLog(driveID string, r io.Reader) {
	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 64*1024), 1024*1024)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		log.Printf("[scriptcrawler:script] drive=%s %s", driveID, line)
	}
	// Only an over-long line needs handling; other read errors (for example
	// the pipe being closed once the process has been reaped) just end the
	// forwarding.
	if err := scanner.Err(); errors.Is(err, bufio.ErrTooLong) {
		log.Printf("[scriptcrawler:script] drive=%s stderr line too long, discarding remaining output", driveID)
		_, _ = io.Copy(io.Discard, r)
	}
}
|
|
|
|
func (c *Crawler) processItem(ctx context.Context, item Item) (bool, error) {
|
|
item, sourceID, err := normalizeItemForImport(item)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
videoID := BuildVideoIDForKind(c.sourceKind(), c.cfg.Driver.ID(), sourceID)
|
|
if deleted, err := c.cfg.Catalog.IsVideoDeleted(ctx, videoID); err != nil {
|
|
return false, err
|
|
} else if deleted {
|
|
return false, nil
|
|
}
|
|
if existing, _ := c.cfg.Catalog.GetVideo(ctx, videoID); existing != nil {
|
|
return false, nil
|
|
}
|
|
videoExt := detectVideoExt(item.Media.URL, item.Media.LocalFile)
|
|
videoFile := sourceID + videoExt
|
|
videoPath, err := c.cfg.Driver.VideoPath(videoFile)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
size, err := c.materializeMedia(ctx, item.Media, videoPath, item.DetailURL, true)
|
|
if err != nil {
|
|
return false, fmt.Errorf("video: %w", err)
|
|
}
|
|
|
|
thumbReady := false
|
|
if item.Thumbnail.URL != "" || item.Thumbnail.LocalFile != "" {
|
|
thumbFile := sourceID + detectThumbExt(item.Thumbnail.URL, item.Thumbnail.LocalFile)
|
|
thumbPath, err := c.cfg.Driver.ThumbPath(thumbFile)
|
|
if err == nil {
|
|
if _, err := c.materializeMedia(ctx, item.Thumbnail, thumbPath, item.DetailURL, false); err != nil {
|
|
log.Printf("[scriptcrawler] drive=%s source_id=%s thumbnail failed: %v", c.cfg.Driver.ID(), sourceID, err)
|
|
} else if c.cfg.CommonThumbDir != "" {
|
|
if err := os.MkdirAll(c.cfg.CommonThumbDir, 0o755); err != nil {
|
|
log.Printf("[scriptcrawler] drive=%s common thumbs mkdir: %v", c.cfg.Driver.ID(), err)
|
|
} else if err := copyFileAtomic(thumbPath, mediaasset.ThumbnailPathInDir(c.cfg.CommonThumbDir, videoID)); err != nil {
|
|
log.Printf("[scriptcrawler] drive=%s source_id=%s copy thumbnail: %v", c.cfg.Driver.ID(), sourceID, err)
|
|
} else {
|
|
thumbReady = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
now := time.Now()
|
|
title := strings.TrimSpace(item.Title)
|
|
if title == "" {
|
|
title = sourceID
|
|
}
|
|
author := strings.TrimSpace(item.Author)
|
|
if author == "" {
|
|
author = c.cfg.Driver.ID()
|
|
}
|
|
tags := cleanStringList(item.Tags)
|
|
if matched, err := c.cfg.Catalog.MatchTags(ctx, title+" "+author+" "+strings.Join(tags, " ")); err == nil {
|
|
tags = mergeStringLists(tags, matched)
|
|
}
|
|
publishedAt := now
|
|
if parsed := parsePublishedAt(item.PublishedAt); !parsed.IsZero() {
|
|
publishedAt = parsed
|
|
}
|
|
quality := strings.TrimSpace(item.Quality)
|
|
if quality == "" {
|
|
quality = "HD"
|
|
}
|
|
v := &catalog.Video{
|
|
ID: videoID,
|
|
DriveID: c.cfg.Driver.ID(),
|
|
FileID: videoFile,
|
|
FileName: videoFile,
|
|
Title: title,
|
|
Author: author,
|
|
Tags: tags,
|
|
DurationSeconds: item.DurationSeconds,
|
|
Size: size,
|
|
Ext: strings.TrimPrefix(videoExt, "."),
|
|
Quality: quality,
|
|
Category: strings.TrimSpace(item.Category),
|
|
Description: strings.TrimSpace(item.Description),
|
|
PreviewStatus: "pending",
|
|
PublishedAt: publishedAt,
|
|
CreatedAt: now,
|
|
UpdatedAt: now,
|
|
}
|
|
if thumbReady {
|
|
v.ThumbnailURL = "/p/thumb/" + v.ID
|
|
}
|
|
if err := c.cfg.Catalog.UpsertVideo(ctx, v); err != nil {
|
|
_ = os.Remove(videoPath)
|
|
return false, err
|
|
}
|
|
log.Printf("[scriptcrawler] drive=%s source_id=%s ok title=%q size=%d", c.cfg.Driver.ID(), sourceID, title, size)
|
|
return true, nil
|
|
}
|
|
|
|
func (c *Crawler) materializeMedia(ctx context.Context, ref MediaRef, dst, referer string, required bool) (int64, error) {
|
|
if local := strings.TrimSpace(ref.LocalFile); local != "" {
|
|
return c.copyLocalOutput(local, dst)
|
|
}
|
|
if rawURL := strings.TrimSpace(ref.URL); rawURL != "" {
|
|
attemptCtx, cancel := c.downloadAttemptContext(ctx)
|
|
defer cancel()
|
|
return c.downloadAtomic(attemptCtx, ref, dst, referer)
|
|
}
|
|
if required {
|
|
return 0, errors.New("missing url or local_file")
|
|
}
|
|
return 0, nil
|
|
}
|
|
|
|
func (c *Crawler) copyLocalOutput(src, dst string) (int64, error) {
|
|
outputRoot, err := filepath.Abs(c.cfg.Driver.OutputDir())
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
srcAbs, err := filepath.Abs(src)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if srcAbs != outputRoot && !strings.HasPrefix(srcAbs, outputRoot+string(os.PathSeparator)) {
|
|
return 0, errors.New("local_file must be inside job output_dir")
|
|
}
|
|
info, err := os.Stat(srcAbs)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if info.IsDir() || info.Size() == 0 {
|
|
return 0, errors.New("local_file is empty or directory")
|
|
}
|
|
return info.Size(), copyFileAtomic(srcAbs, dst)
|
|
}
|
|
|
|
func (c *Crawler) downloadAttemptContext(ctx context.Context) (context.Context, context.CancelFunc) {
|
|
if c.cfg.DownloadTimeout <= 0 {
|
|
return ctx, func() {}
|
|
}
|
|
return context.WithTimeout(ctx, c.cfg.DownloadTimeout)
|
|
}
|
|
|
|
func (c *Crawler) downloadAtomic(ctx context.Context, ref MediaRef, dst, referer string) (int64, error) {
|
|
src := strings.TrimSpace(ref.URL)
|
|
if src == "" {
|
|
return 0, errors.New("empty url")
|
|
}
|
|
if _, err := url.Parse(src); err != nil {
|
|
return 0, fmt.Errorf("parse url: %w", err)
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(dst), 0o755); err != nil {
|
|
return 0, err
|
|
}
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, src, nil)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
req.Header.Set("User-Agent", defaultUserAgent)
|
|
if referer != "" {
|
|
req.Header.Set("Referer", referer)
|
|
}
|
|
for k, v := range ref.Headers {
|
|
k = strings.TrimSpace(k)
|
|
if k == "" {
|
|
continue
|
|
}
|
|
req.Header.Set(k, v)
|
|
}
|
|
resp, err := c.cfg.HTTPClient.Do(req)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
return 0, fmt.Errorf("http %d", resp.StatusCode)
|
|
}
|
|
tmp := dst + ".part"
|
|
out, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o644)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
written, copyErr := io.Copy(out, resp.Body)
|
|
closeErr := out.Close()
|
|
if copyErr != nil {
|
|
_ = os.Remove(tmp)
|
|
return 0, copyErr
|
|
}
|
|
if closeErr != nil {
|
|
_ = os.Remove(tmp)
|
|
return 0, closeErr
|
|
}
|
|
if written <= 0 {
|
|
_ = os.Remove(tmp)
|
|
return 0, errors.New("empty body")
|
|
}
|
|
if err := os.Rename(tmp, dst); err != nil {
|
|
_ = os.Remove(tmp)
|
|
return 0, err
|
|
}
|
|
return written, nil
|
|
}
|
|
|
|
func configureExplicitProxy(transport *http.Transport, raw string) error {
|
|
proxyURL := strings.TrimSpace(raw)
|
|
if proxyURL == "" {
|
|
return nil
|
|
}
|
|
u, err := url.Parse(proxyURL)
|
|
if err != nil || u.Scheme == "" || u.Host == "" {
|
|
return fmt.Errorf("invalid proxy URL")
|
|
}
|
|
switch strings.ToLower(u.Scheme) {
|
|
case "http", "https":
|
|
transport.Proxy = http.ProxyURL(u)
|
|
transport.DialContext = nil
|
|
return nil
|
|
case "socks5", "socks5h":
|
|
dialContext, err := socksProxyDialContext(u)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
transport.Proxy = nil
|
|
transport.DialContext = dialContext
|
|
return nil
|
|
default:
|
|
return fmt.Errorf("unsupported proxy scheme %q", u.Scheme)
|
|
}
|
|
}
|
|
|
|
func socksProxyDialContext(proxyURL *url.URL) (func(context.Context, string, string) (net.Conn, error), error) {
|
|
var auth *proxy.Auth
|
|
if proxyURL.User != nil {
|
|
username := proxyURL.User.Username()
|
|
password, _ := proxyURL.User.Password()
|
|
auth = &proxy.Auth{User: username, Password: password}
|
|
}
|
|
dialer, err := proxy.SOCKS5("tcp", proxyURL.Host, auth, &net.Dialer{Timeout: 60 * time.Second})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
remoteDNS := strings.EqualFold(proxyURL.Scheme, "socks5h")
|
|
return func(ctx context.Context, network, addr string) (net.Conn, error) {
|
|
target := addr
|
|
if !remoteDNS {
|
|
resolved, err := resolveSocksTarget(ctx, addr)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
target = resolved
|
|
}
|
|
if ctxDialer, ok := dialer.(proxy.ContextDialer); ok {
|
|
return ctxDialer.DialContext(ctx, network, target)
|
|
}
|
|
type result struct {
|
|
conn net.Conn
|
|
err error
|
|
}
|
|
ch := make(chan result, 1)
|
|
go func() {
|
|
conn, err := dialer.Dial(network, target)
|
|
ch <- result{conn: conn, err: err}
|
|
}()
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil, ctx.Err()
|
|
case res := <-ch:
|
|
return res.conn, res.err
|
|
}
|
|
}, nil
|
|
}
|
|
|
|
// resolveSocksTarget resolves the host part of a "host:port" address locally
// and returns "ip:port", preferring an IPv4 address when one is available.
// Addresses that cannot be split into host and port, or whose host is already
// an IP literal, are returned unchanged.
func resolveSocksTarget(ctx context.Context, addr string) (string, error) {
	host, port, splitErr := net.SplitHostPort(addr)
	if splitErr != nil {
		return addr, nil
	}
	if net.ParseIP(host) != nil {
		return addr, nil
	}

	candidates, err := net.DefaultResolver.LookupIPAddr(ctx, host)
	if err != nil {
		return "", err
	}
	for i := range candidates {
		if v4 := candidates[i].IP.To4(); v4 != nil {
			return net.JoinHostPort(v4.String(), port), nil
		}
	}
	// No IPv4 address: fall back to the first result, whatever family it is.
	if len(candidates) > 0 && candidates[0].IP != nil {
		return net.JoinHostPort(candidates[0].IP.String(), port), nil
	}
	return "", fmt.Errorf("resolve %s: no address", host)
}
|
|
|
|
func normalizeItemForImport(item Item) (Item, string, error) {
|
|
item.Title = strings.TrimSpace(item.Title)
|
|
if item.Title == "" {
|
|
return item, "", errors.New("title is required")
|
|
}
|
|
item.DetailURL = strings.TrimSpace(item.DetailURL)
|
|
item.Author = strings.TrimSpace(item.Author)
|
|
item.Category = strings.TrimSpace(item.Category)
|
|
item.Quality = strings.TrimSpace(item.Quality)
|
|
item.Description = strings.TrimSpace(item.Description)
|
|
item.PublishedAt = strings.TrimSpace(item.PublishedAt)
|
|
item.MediaURL = strings.TrimSpace(item.MediaURL)
|
|
item.MediaLocalFile = strings.TrimSpace(item.MediaLocalFile)
|
|
item.ThumbnailURL = strings.TrimSpace(item.ThumbnailURL)
|
|
item.ThumbnailLocalFile = strings.TrimSpace(item.ThumbnailLocalFile)
|
|
|
|
if strings.TrimSpace(item.Media.URL) == "" {
|
|
item.Media.URL = item.MediaURL
|
|
}
|
|
if strings.TrimSpace(item.Media.LocalFile) == "" {
|
|
item.Media.LocalFile = item.MediaLocalFile
|
|
}
|
|
if len(item.Media.Headers) == 0 {
|
|
if len(item.MediaHeaders) > 0 {
|
|
item.Media.Headers = item.MediaHeaders
|
|
} else if len(item.Headers) > 0 {
|
|
item.Media.Headers = item.Headers
|
|
}
|
|
}
|
|
if strings.TrimSpace(item.Thumbnail.URL) == "" {
|
|
item.Thumbnail.URL = item.ThumbnailURL
|
|
}
|
|
if strings.TrimSpace(item.Thumbnail.LocalFile) == "" {
|
|
item.Thumbnail.LocalFile = item.ThumbnailLocalFile
|
|
}
|
|
if len(item.Thumbnail.Headers) == 0 {
|
|
if len(item.ThumbnailHeaders) > 0 {
|
|
item.Thumbnail.Headers = item.ThumbnailHeaders
|
|
} else if len(item.Headers) > 0 {
|
|
item.Thumbnail.Headers = item.Headers
|
|
}
|
|
}
|
|
|
|
item.Media.URL = strings.TrimSpace(item.Media.URL)
|
|
item.Media.LocalFile = strings.TrimSpace(item.Media.LocalFile)
|
|
item.Thumbnail.URL = strings.TrimSpace(item.Thumbnail.URL)
|
|
item.Thumbnail.LocalFile = strings.TrimSpace(item.Thumbnail.LocalFile)
|
|
if item.Media.URL == "" && item.Media.LocalFile == "" {
|
|
return item, "", errors.New("media_url is required")
|
|
}
|
|
|
|
sourceID := normalizeSourceID(item.SourceID)
|
|
if sourceID == "" {
|
|
sourceID = generatedSourceID(item)
|
|
}
|
|
return item, sourceID, nil
|
|
}
|
|
|
|
// normalizeSourceID reduces raw to a conservative identifier: ASCII letters,
// digits, '_', '-' and '.' are kept, every run of other characters becomes a
// single '_', leading and trailing '.', '_' and '-' are stripped, and the
// result is capped at 160 bytes (and stripped again). It returns "" when
// nothing usable remains.
func normalizeSourceID(raw string) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return ""
	}

	const edgeChars = "._-"
	const maxLen = 160

	var out strings.Builder
	inFiller := false // true while inside a run of replaced characters
	for _, r := range trimmed {
		switch {
		case 'a' <= r && r <= 'z',
			'A' <= r && r <= 'Z',
			'0' <= r && r <= '9',
			r == '_', r == '-', r == '.':
			out.WriteRune(r)
			inFiller = false
		case !inFiller:
			out.WriteByte('_')
			inFiller = true
		}
	}

	id := strings.Trim(out.String(), edgeChars)
	if len(id) > maxLen {
		// The string is pure ASCII at this point, so byte slicing is safe.
		id = strings.Trim(id[:maxLen], edgeChars)
	}
	return id
}
|
|
|
|
func generatedSourceID(item Item) string {
|
|
signature := strings.Join([]string{
|
|
item.Title,
|
|
stableURLKey(item.DetailURL),
|
|
stableURLKey(item.Media.URL),
|
|
strings.TrimSpace(item.Media.LocalFile),
|
|
}, "\n")
|
|
sum := sha256.Sum256([]byte(signature))
|
|
return "auto-" + hex.EncodeToString(sum[:])[:24]
|
|
}
|
|
|
|
// stableURLKey reduces a URL to a form suitable for identity hashing.
//
// The fragment is always dropped. The query string is dropped too, unless the
// URL has no path or the query contains "viewkey=" (case-insensitive). Input
// that is not an absolute URL with a host is returned trimmed but otherwise
// unchanged.
func stableURLKey(raw string) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return ""
	}

	parsed, err := url.Parse(trimmed)
	if err != nil || parsed == nil {
		return trimmed
	}
	if parsed.Scheme == "" || parsed.Host == "" {
		return trimmed
	}

	parsed.Fragment = ""

	hasQuery := parsed.RawQuery != ""
	hasPath := strings.TrimSpace(parsed.Path) != ""
	hasViewKey := strings.Contains(strings.ToLower(parsed.RawQuery), "viewkey=")
	if hasQuery && hasPath && !hasViewKey {
		parsed.RawQuery = ""
	}
	return parsed.String()
}
|
|
|
|
func (c *Crawler) sourceKind() string {
|
|
if c == nil {
|
|
return Kind
|
|
}
|
|
if v := strings.TrimSpace(c.cfg.SourceKind); v != "" {
|
|
return v
|
|
}
|
|
return Kind
|
|
}
|
|
|
|
func BuildVideoID(driveID, sourceID string) string {
|
|
return BuildVideoIDForKind(Kind, driveID, sourceID)
|
|
}
|
|
|
|
func BuildVideoIDForKind(kind, driveID, sourceID string) string {
|
|
if kind = strings.TrimSpace(kind); kind == "" {
|
|
kind = Kind
|
|
}
|
|
return kind + "-" + driveID + "-" + sourceID
|
|
}
|
|
|
|
func detectVideoExt(rawURL, localFile string) string {
|
|
if ext := mediaExt(localFile, true); ext != "" {
|
|
return ext
|
|
}
|
|
if ext := mediaExt(rawURL, true); ext != "" {
|
|
return ext
|
|
}
|
|
return ".mp4"
|
|
}
|
|
|
|
func detectThumbExt(rawURL, localFile string) string {
|
|
if ext := mediaExt(localFile, false); ext != "" {
|
|
return ext
|
|
}
|
|
if ext := mediaExt(rawURL, false); ext != "" {
|
|
return ext
|
|
}
|
|
return ".jpg"
|
|
}
|
|
|
|
// mediaExt returns the lower-cased extension of raw (a URL or file path) when
// it is one of the recognized video extensions (video == true) or image
// extensions (video == false), and "" otherwise. For input that parses as a
// URL, only the path component is considered, so query strings and fragments
// are ignored.
func mediaExt(raw string, video bool) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return ""
	}

	candidate := raw
	if parsed, err := url.Parse(trimmed); err == nil && parsed != nil && parsed.Path != "" {
		candidate = parsed.Path
	}
	ext := strings.ToLower(path.Ext(candidate))

	recognized := []string{".jpg", ".jpeg", ".png", ".webp", ".gif"}
	if video {
		recognized = []string{".mp4", ".webm", ".mkv", ".mov", ".m4v", ".flv", ".avi"}
	}
	for _, known := range recognized {
		if ext == known {
			return ext
		}
	}
	return ""
}
|
|
|
|
// parsePublishedAt interprets a publication timestamp supplied by a script.
//
// A plain integer is a Unix time: values above 100000000000 are taken as
// milliseconds, anything else as seconds. Otherwise the value is tried as
// RFC 3339, "2006-01-02" and "2006-01-02 15:04:05", in that order. The zero
// time is returned for blank or unparseable input.
func parsePublishedAt(raw string) time.Time {
	value := strings.TrimSpace(raw)
	if value == "" {
		return time.Time{}
	}

	if n, err := strconv.ParseInt(value, 10, 64); err == nil {
		const millisThreshold = 100000000000
		if n > millisThreshold {
			return time.UnixMilli(n)
		}
		return time.Unix(n, 0)
	}

	layouts := [...]string{time.RFC3339, "2006-01-02", "2006-01-02 15:04:05"}
	for i := range layouts {
		parsed, err := time.Parse(layouts[i], value)
		if err == nil {
			return parsed
		}
	}
	return time.Time{}
}
|
|
|
|
// cleanStringList returns the entries of in trimmed, with blanks removed and
// case-insensitive duplicates dropped. The first spelling of each entry is
// kept and order is preserved. The result is never nil.
func cleanStringList(in []string) []string {
	cleaned := make([]string, 0, len(in))
	known := make(map[string]struct{}, len(in))
	for i := range in {
		entry := strings.TrimSpace(in[i])
		if entry == "" {
			continue
		}
		folded := strings.ToLower(entry)
		if _, dup := known[folded]; !dup {
			known[folded] = struct{}{}
			cleaned = append(cleaned, entry)
		}
	}
	return cleaned
}
|
|
|
|
// mergeStringLists concatenates lists in order, trimming every entry, skipping
// blanks and dropping case-insensitive duplicates. The first spelling of each
// entry is kept. The result is never nil.
func mergeStringLists(lists ...[]string) []string {
	merged := make([]string, 0)
	known := make(map[string]struct{})

	// add appends entry unless it is blank or already present.
	add := func(entry string) {
		entry = strings.TrimSpace(entry)
		if entry == "" {
			return
		}
		folded := strings.ToLower(entry)
		if _, dup := known[folded]; dup {
			return
		}
		known[folded] = struct{}{}
		merged = append(merged, entry)
	}

	for i := range lists {
		for j := range lists[i] {
			add(lists[i][j])
		}
	}
	return merged
}
|
|
|
|
// copyFileAtomic copies src to dst, creating dst's parent directory if needed.
//
// The data is written to dst+".part" and renamed onto dst once complete, so
// dst never holds a partially copied file. The temporary file is removed on
// every failure, including a failed rename.
func copyFileAtomic(src, dst string) error {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	if err := os.MkdirAll(filepath.Dir(dst), 0o755); err != nil {
		return err
	}

	tmp := dst + ".part"
	out, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o644)
	if err != nil {
		return err
	}

	// Single cleanup path: the first error from copy, close or rename wins.
	_, err = io.Copy(out, in)
	if closeErr := out.Close(); err == nil {
		err = closeErr
	}
	if err == nil {
		err = os.Rename(tmp, dst)
	}
	if err != nil {
		_ = os.Remove(tmp)
		return err
	}
	return nil
}
|