# Word segmentation with jieba
import jieba
from collections import Counter
# Custom merge rules: map alternative names of a character to one canonical
# name. Extend the mapping with more aliases as needed for the Liaozhai text.
merge_map = {
    # Example: "寧公子" and "寧采臣" are assumed to refer to the same person.
    "寧公子": "寧采臣",
    "小倩": "聶小倩",
    # Add further alias -> canonical-name mappings here.
}
# Read the Liaozhai text; the file is opened as UTF-8.
with open("liao_zhai.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Tokenize the whole text with jieba.
words = jieba.lcut(text)
# Merge different names for the same character: a token found in merge_map is
# replaced by its mapped name, any other token is kept unchanged.
merged_words = [merge_map.get(word, word) for word in words]
# Count token frequencies and keep the 20 most common entries.
# NOTE(review): no filtering is applied here, so every token in merged_words
# is counted as-is.
word_count = Counter(merged_words)
top_20 = word_count.most_common(20)

# Print the result, one "token: count" pair per line.
for word, count in top_20:
    print(f"{word}: {count}")

# Web-page footer (not part of the script): 浙公網安備 33010602011771號