Scraping the Posts Under a cnblogs Collection
Complete code (save as spider.py)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import time

import requests
import html2text
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urljoin, urlparse

BASE_URL = "http://www.rzrgm.cn/xxx/collections/11111"
HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/123 Safari/537.36")
}
SAVE_DIR = "blog"
IMG_DIR = os.path.join(SAVE_DIR, "images")
os.makedirs(IMG_DIR, exist_ok=True)

# HTML -> Markdown converter
h = html2text.HTML2Text()
h.ignore_links = False  # keep hyperlinks in the output
h.wrap_links = False    # don't wrap link text across lines
h.mark_code = True      # mark code blocks with [code]...[/code]
# ------------------ utils ------------------
def safe_name(title: str) -> str:
    """Strip characters that are illegal in Windows filenames."""
    return re.sub(r'[\\/:*?"<>|]', '_', title) or "untitled"

def fetch(url: str) -> str:
    resp = requests.get(url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding  # guess the encoding from the body
    return resp.text

def download_img(src: str) -> str:
    # Normalize the scheme: handle protocol-relative URLs ("//img...")
    # and upgrade plain HTTP image links to HTTPS.
    if src.startswith("//"):
        src = "https:" + src
    elif src.startswith("http://"):
        src = "https://" + src[len("http://"):]
    fname = os.path.basename(urlparse(src).path) or "img"
    local_path = os.path.join(IMG_DIR, fname)
    if not os.path.exists(local_path):
        try:
            data = requests.get(src, headers=HEADERS, timeout=15).content
            with open(local_path, "wb") as f:
                f.write(data)
        except Exception as e:
            print(f"Image download failed: {src} ({e})")
            return src  # fall back to the remote URL
    return os.path.relpath(local_path, SAVE_DIR)
# ------------------ main logic ------------------
def collect_links():
    html = fetch(BASE_URL)
    soup = BeautifulSoup(html, "lxml")
    links = []
    for idx, a in enumerate(soup.select("a.entrylistItemTitle"), 1):
        title = a.get_text(strip=True)
        href = a["href"]
        if not href.startswith("http"):
            href = urljoin(BASE_URL, href)
        links.append((f"{idx:03d}", title, href))
    return links

def post2md(prefix: str, title: str, url: str):
    html = fetch(url)
    soup = BeautifulSoup(html, "lxml")
    body = soup.find("div", id="cnblogs_post_body") or soup
    # Localize images: rewrite each src to point at the downloaded copy
    for img in body.select("img"):
        src = img.get("src")
        if src:
            img["src"] = download_img(src)
    md_body = h.handle(str(body))
    meta = f"Original: <{url}>\n\n---\n\n"
    filename = f"{prefix}_{safe_name(title)}.md"
    with open(os.path.join(SAVE_DIR, filename), "w", encoding="utf-8") as f:
        f.write(meta + md_body)
def main():
    links = collect_links()
    print(f"Found {len(links)} posts")
    for prefix, title, url in tqdm(links, desc="Downloading"):
        outfile = os.path.join(SAVE_DIR, f"{prefix}_{safe_name(title)}.md")
        if os.path.exists(outfile):
            continue  # already downloaded; re-runs only fetch what's missing
        try:
            post2md(prefix, title, url)
            time.sleep(1)  # be polite: throttle requests
        except Exception as e:
            tqdm.write(f"Failed: {url} - {e}")
    print(f"All done! Files saved to → {os.path.abspath(SAVE_DIR)}")

if __name__ == "__main__":
    main()
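Note that collect_links() reads only the first page of the collection listing. If a collection spans several pages, a minimal paged variant could look like the sketch below. The ?page=N query parameter is an assumption, not a confirmed part of the site's URL scheme, so verify it against the "next page" links on your own collection before relying on it:

# Hypothetical paged collector -- the "?page=N" parameter is an assumption;
# check how your collection actually paginates before using this.
def collect_links_paged(max_pages: int = 20):
    links, idx = [], 0
    for page in range(1, max_pages + 1):
        soup = BeautifulSoup(fetch(f"{BASE_URL}?page={page}"), "lxml")
        anchors = soup.select("a.entrylistItemTitle")
        if not anchors:  # empty page -> we've run past the last one
            break
        for a in anchors:
            idx += 1
            href = a["href"]
            if not href.startswith("http"):
                href = urljoin(BASE_URL, href)
            links.append((f"{idx:03d}", a.get_text(strip=True), href))
    return links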
Usage
- Install the dependencies:

  pip install requests beautifulsoup4 html2text tqdm

- Run the script:

  python spider.py
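Before the first full run, it can help to confirm that the CSS selector (a.entrylistItemTitle) actually matches your collection page. A quick check, reusing the helpers from spider.py (assumes spider.py is in the current directory):

# Print the first few discovered posts; an empty result means the
# selector doesn't match this page and needs adjusting.
from spider import collect_links

links = collect_links()
print(f"{len(links)} links found")
for prefix, title, url in links[:5]:
    print(prefix, title, url)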
Sample output
blog/
├── 001_aaa.md
├── 002_bbb.md
├── 003_ccc.md
└── images/
    ├── kkksc01.png
    ├── kkksc02.png
    └── kkksc03.png
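Because main() skips any post whose Markdown file already exists, the script is safe to re-run after an interruption: only missing posts are fetched. A small sketch for listing what is still missing, again reusing spider.py's helpers:

# List collection entries with no local Markdown file yet; these are
# exactly the posts the next run of spider.py will fetch.
import os
from spider import SAVE_DIR, collect_links, safe_name

missing = [
    (title, url)
    for prefix, title, url in collect_links()
    if not os.path.exists(os.path.join(SAVE_DIR, f"{prefix}_{safe_name(title)}.md"))
]
print(f"{len(missing)} post(s) still missing")
for title, url in missing:
    print("-", title, url)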