Scraping the Posts Under a cnblogs Collection
Complete code (save as spider.py)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import time

import requests
import html2text
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urljoin, urlparse

BASE_URL = "http://www.rzrgm.cn/xxx/collections/11111"
HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/123 Safari/537.36")
}
SAVE_DIR = "blog"
IMG_DIR = os.path.join(SAVE_DIR, "images")
os.makedirs(IMG_DIR, exist_ok=True)

# HTML -> Markdown converter
h = html2text.HTML2Text()
h.ignore_links = False  # keep hyperlinks in the output
h.wrap_links = False    # don't wrap link text across lines
h.mark_code = True      # mark code blocks with [code]...[/code]
# ------------------ utils ------------------
def safe_name(title: str) -> str:
    """Strip characters that are illegal in Windows filenames."""
    return re.sub(r'[\\/:*?"<>|]', '_', title) or "untitled"

def fetch(url: str) -> str:
    resp = requests.get(url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding  # guess the encoding from the body
    return resp.text

def download_img(src: str) -> str:
    # Normalize the scheme: handle protocol-relative URLs ("//img...")
    # and upgrade plain HTTP image links to HTTPS.
    if src.startswith("//"):
        src = "https:" + src
    elif src.startswith("http://"):
        src = "https://" + src[len("http://"):]
    fname = os.path.basename(urlparse(src).path) or "img"
    local_path = os.path.join(IMG_DIR, fname)
    if not os.path.exists(local_path):
        try:
            data = requests.get(src, headers=HEADERS, timeout=15).content
            with open(local_path, "wb") as f:
                f.write(data)
        except Exception as e:
            print(f"Image download failed: {src} ({e})")
            return src  # fall back to the remote URL
    return os.path.relpath(local_path, SAVE_DIR)
# ------------------ main logic ------------------
def collect_links():
    html = fetch(BASE_URL)
    soup = BeautifulSoup(html, "lxml")
    links = []
    for idx, a in enumerate(soup.select("a.entrylistItemTitle"), 1):
        title = a.get_text(strip=True)
        href = a["href"]
        if not href.startswith("http"):
            href = urljoin(BASE_URL, href)
        links.append((f"{idx:03d}", title, href))
    return links

def post2md(prefix: str, title: str, url: str):
    html = fetch(url)
    soup = BeautifulSoup(html, "lxml")
    body = soup.find("div", id="cnblogs_post_body") or soup
    # Localize images: rewrite each src to point at the downloaded copy
    for img in body.select("img"):
        src = img.get("src")
        if src:
            img["src"] = download_img(src)
    md_body = h.handle(str(body))
    meta = f"Original: <{url}>\n\n---\n\n"
    filename = f"{prefix}_{safe_name(title)}.md"
    with open(os.path.join(SAVE_DIR, filename), "w", encoding="utf-8") as f:
        f.write(meta + md_body)
def main():
    links = collect_links()
    print(f"Found {len(links)} posts")
    for prefix, title, url in tqdm(links, desc="Downloading"):
        outfile = os.path.join(SAVE_DIR, f"{prefix}_{safe_name(title)}.md")
        if os.path.exists(outfile):
            continue  # already downloaded; re-runs only fetch what's missing
        try:
            post2md(prefix, title, url)
            time.sleep(1)  # be polite: throttle requests
        except Exception as e:
            tqdm.write(f"Failed: {url} - {e}")
    print(f"All done! Files saved to → {os.path.abspath(SAVE_DIR)}")

if __name__ == "__main__":
    main()
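Note that collect_links() reads only the first page of the collection listing. If a collection spans several pages, a minimal paged variant could look like the sketch below. The ?page=N query parameter is an assumption, not a confirmed part of the site's URL scheme, so verify it against the "next page" links on your own collection before relying on it:

# Hypothetical paged collector -- the "?page=N" parameter is an assumption;
# check how your collection actually paginates before using this.
def collect_links_paged(max_pages: int = 20):
    links, idx = [], 0
    for page in range(1, max_pages + 1):
        soup = BeautifulSoup(fetch(f"{BASE_URL}?page={page}"), "lxml")
        anchors = soup.select("a.entrylistItemTitle")
        if not anchors:  # empty page -> we've run past the last one
            break
        for a in anchors:
            idx += 1
            href = a["href"]
            if not href.startswith("http"):
                href = urljoin(BASE_URL, href)
            links.append((f"{idx:03d}", a.get_text(strip=True), href))
    return links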
Usage
- Install the dependencies:

  pip install requests beautifulsoup4 html2text tqdm

- Run the script:

  python spider.py
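Before the first full run, it can help to confirm that the CSS selector (a.entrylistItemTitle) actually matches your collection page. A quick check, reusing the helpers from spider.py (assumes spider.py is in the current directory):

# Print the first few discovered posts; an empty result means the
# selector doesn't match this page and needs adjusting.
from spider import collect_links

links = collect_links()
print(f"{len(links)} links found")
for prefix, title, url in links[:5]:
    print(prefix, title, url)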
Sample output
blog/
├── 001_aaa.md
├── 002_bbb.md
├── 003_ccc.md
└── images/
    ├── kkksc01.png
    ├── kkksc02.png
    └── kkksc03.png
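Because main() skips any post whose Markdown file already exists, the script is safe to re-run after an interruption: only missing posts are fetched. A small sketch for listing what is still missing, again reusing spider.py's helpers:

# List collection entries with no local Markdown file yet; these are
# exactly the posts the next run of spider.py will fetch.
import os
from spider import SAVE_DIR, collect_links, safe_name

missing = [
    (title, url)
    for prefix, title, url in collect_links()
    if not os.path.exists(os.path.join(SAVE_DIR, f"{prefix}_{safe_name(title)}.md"))
]
print(f"{len(missing)} post(s) still missing")
for title, url in missing:
    print("-", title, url)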