Files
91/examples/crawlers/simple_crawler.py
T
nianzhibai c1355385e1 feat(crawler): simplify script crawler workflow
Redesign crawler management around imported Python scripts instead of built-in crawler storage. Crawler scripts now declare CRAWLER_NAME, imports validate metadata, crawler IDs are generated internally, and deleted crawler scripts are detached without deleting already imported videos.

Add backend support for file and URL script imports, dry-run testing, metadata parsing, safer job paths, original filename preservation, and crawler listing that ignores detached script records. Remove the legacy built-in Spider91 script path flow and hidden Python/config JSON fields from the crawler API.

Rework the admin crawler page into an independent crawler console with script import, dry-run testing, status metrics, spider iconography, and simplified controls. Update docs, examples, installer checks, Docker/release packaging, and tests for the new protocol.
2026-06-10 14:27:16 +08:00

51 lines
1.3 KiB
Python

#!/usr/bin/env python3
import argparse
import json
import sys
# Display name this crawler script declares for itself.
# NOTE(review): the importing side is expected to read this constant as script
# metadata — confirm against the import/validation code.
CRAWLER_NAME = "Demo Crawler"
def load_seen(path):
    """Return the set of source IDs listed in the file at *path*.

    The file is read as UTF-8 text with one ID per line. Surrounding
    whitespace is stripped from each line and blank lines are ignored.

    Args:
        path: Location of the seen-IDs file. ``None`` or an empty string
            means "no file" and yields an empty set.

    Returns:
        A set of the non-empty, stripped lines; an empty set when no path
        is given or the file does not exist.
    """
    # A job may omit the path or carry a JSON null for it. open(None) raises
    # TypeError rather than FileNotFoundError, so handle "no path" explicitly.
    if path is None or path == "":
        return set()
    try:
        with open(path, "r", encoding="utf-8") as f:
            stripped_lines = (line.strip() for line in f)
            # Consume the generator while the file is still open.
            return {entry for entry in stripped_lines if entry}
    except FileNotFoundError:
        # First run: nothing has been recorded as seen yet.
        return set()
def _emit(event):
    """Write one event to stdout as a single JSON line and flush it."""
    print(json.dumps(event, ensure_ascii=False), flush=True)


def main(argv=None):
    """Run the demo crawler for the job file given by ``--job``.

    The job file is parsed as JSON. Its optional ``seen_source_ids_file``
    entry names a file of already-seen source IDs. The crawler knows a single
    hard-coded video: if its ID is already in the seen set, only a ``done``
    event with ``emitted: 0`` is written; otherwise one ``item`` event is
    written followed by a ``done`` event with ``emitted: 1``. Every event is
    one JSON line on stdout.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--job", required=True)
    args = parser.parse_args(argv)

    with open(args.job, "r", encoding="utf-8") as f:
        job = json.load(f)

    # dict.get's default only applies to a missing key; a JSON null comes back
    # as None, so normalise both cases to "" (meaning: no seen-IDs file).
    seen = load_seen(job.get("seen_source_ids_file") or "")

    source_id = "demo-video-1"
    if source_id in seen:
        # Already seen: report completion without emitting anything.
        _emit({"type": "done", "stats": {"emitted": 0}})
        return

    _emit(
        {
            "type": "item",
            "source_id": source_id,
            "title": "Demo Video",
            "media_url": "https://example.test/video/demo-video-1.mp4",
            "thumbnail_url": "https://example.test/thumb/demo-video-1.jpg",
            "headers": {
                "Referer": "https://example.test/",
            },
        }
    )
    _emit({"type": "done", "stats": {"emitted": 1}})
if __name__ == "__main__":
    # Script entry point: run the crawler, and on any ordinary exception put a
    # one-line summary on stderr before letting the error propagate.
    try:
        main()
    except Exception as error:
        sys.stderr.write(f"crawler failed: {error}\n")
        sys.stderr.flush()
        raise