mirror of
https://github.com/nianzhibai/91.git
synced 2026-06-15 00:44:30 +08:00
c1355385e1
Redesign crawler management around imported Python scripts instead of built-in crawler storage. Crawler scripts now declare CRAWLER_NAME, imports validate metadata, crawler IDs are generated internally, and deleted crawler scripts are detached without deleting already imported videos. Add backend support for file and URL script imports, dry-run testing, metadata parsing, safer job paths, original filename preservation, and crawler listing that ignores detached script records. Remove the legacy built-in Spider91 script path flow and hidden Python/config JSON fields from the crawler API. Rework the admin crawler page into an independent crawler console with script import, dry-run testing, status metrics, spider iconography, and simplified controls. Update docs, examples, installer checks, Docker/release packaging, and tests for the new protocol.
51 lines
1.3 KiB
Python
51 lines
1.3 KiB
Python
#!/usr/bin/env python3
|
|
import argparse
|
|
import json
|
|
import sys
|
|
|
|
# Human-readable name this crawler script declares for itself.
# NOTE(review): presumably read by the host application when the script is
# imported; confirm the exact validation rules against the importer.
CRAWLER_NAME = "Demo Crawler"
|
|
|
|
|
|
def load_seen(path):
    """Return the set of already-seen source IDs stored in ``path``.

    The file is read as UTF-8 text, one ID per line. Surrounding whitespace
    is stripped from every line and blank lines are ignored.

    Args:
        path: Location of the seen-IDs file. May be empty or ``None`` when
            the job does not supply one.

    Returns:
        A set of the non-empty, stripped lines. The set is empty when
        ``path`` is falsy or the file does not exist.
    """
    # A job can omit the file or set it to null. open(None) raises TypeError
    # (not FileNotFoundError), so treat any falsy path as "nothing seen yet".
    if not path:
        return set()
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Strip each line once, then drop the ones that end up empty.
            return {entry for entry in map(str.strip, f) if entry}
    except FileNotFoundError:
        return set()
|
|
|
|
|
|
def main():
    """Run one demo crawl described by the JSON file given via ``--job``.

    Writes newline-delimited JSON events to stdout. If the hard-coded demo
    source ID is already listed in the job's seen-IDs file, only a ``done``
    event with ``emitted: 0`` is written. Otherwise one ``item`` event for
    the demo video is written, followed by ``done`` with ``emitted: 1``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--job", required=True)
    job_path = cli.parse_args().job

    with open(job_path, "r", encoding="utf-8") as job_file:
        job = json.load(job_file)

    def emit(payload, **dump_options):
        # One JSON document per line, flushed after every event.
        print(json.dumps(payload, **dump_options), flush=True)

    def emit_done(count):
        emit({"type": "done", "stats": {"emitted": count}})

    source_id = "demo-video-1"
    already_seen = load_seen(job.get("seen_source_ids_file", ""))
    if source_id in already_seen:
        emit_done(0)
        return

    item = {
        "type": "item",
        "source_id": source_id,
        "title": "Demo Video",
        "media_url": "https://example.test/video/demo-video-1.mp4",
        "thumbnail_url": "https://example.test/thumb/demo-video-1.jpg",
        "headers": {
            "Referer": "https://example.test/",
        },
    }
    # The item event is serialized with ensure_ascii=False, so any non-ASCII
    # text would be written as-is rather than as \u escapes.
    emit(item, ensure_ascii=False)
    emit_done(1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the crawl. On any Exception, write a one-line
    # summary to stderr and then re-raise so the original error (and its
    # traceback) still propagates.
    try:
        main()
    except Exception as err:
        summary = f"crawler failed: {err}"
        print(summary, file=sys.stderr, flush=True)
        raise
|