Word文檔內容批量替換腳本

word_replace_v1.py

# -*- coding: utf-8 -*-
"""
Word文檔批量替換腳本

功能說明：
1. 遞歸處理當前目錄及所有子目錄中的Word文檔(.docx格式)
2. 將文檔中的指定文本替換為新文本，同時保留原有格式
3. 支持處理段落和表格中的文本

使用方法：
1. 基本使用：python word_replace.py --old "原文本" --new "新文本"
2. 指定目錄：python word_replace.py --dir "/path/to/docs" --old "原文本" --new "新文本"
3. 查看幫助：python word_replace.py --help

注意事項：
- 需要安裝python-docx庫：pip install python-docx
- 僅支持.docx格式，不支持舊的.doc格式
- 會直接修改原文件，建議先備份重要文檔
- 如果文本跨多個格式塊(run)，可能無法完全替換

示例：
python word_replace.py --old "公司A" --new "公司B"
python word_replace.py --dir "./項目文檔" --old "2023年" --new "2024年"
"""

import os
import argparse
from docx import Document


def _iter_document_paragraphs(container, seen_cells=None):
    """
    Yield every paragraph under ``container``, descending into its tables.

    Args:
        container: Any object exposing ``paragraphs`` and ``tables`` (the
            document itself or a table cell).
        seen_cells: Set of cell XML elements already visited; shared across
            the recursion so each physical cell is walked only once.
    """
    if seen_cells is None:
        seen_cells = set()
    yield from container.paragraphs
    for table in container.tables:
        for row in table.rows:
            for cell in row.cells:
                # python-docx lists a merged cell once per grid position it
                # spans; the underlying element identifies the real cell so
                # its paragraphs are not rewritten several times.
                tc = cell._tc
                if tc in seen_cells:
                    continue
                seen_cells.add(tc)
                # Cells may themselves contain tables, so recurse.
                yield from _iter_document_paragraphs(cell, seen_cells)


def replace_in_docx(file_path, old_text, new_text):
    """
    Replace text in a single Word document, run by run, and save it in place.

    Only occurrences that lie entirely inside one run are replaced, which is
    what keeps the run's formatting intact. Body paragraphs, table cells and
    tables nested inside cells are all visited, each cell exactly once.

    Args:
        file_path: Path of the .docx file; it is overwritten on save.
        old_text: Text to search for; must not be empty.
        new_text: Text that replaces every occurrence of ``old_text``.

    Raises:
        ValueError: If ``old_text`` is empty (``str.replace`` would otherwise
            insert ``new_text`` between every character).
    """
    if not old_text:
        raise ValueError('old_text 不能為空')

    doc = Document(file_path)

    for paragraph in _iter_document_paragraphs(doc):
        # Cheap paragraph-level check before touching individual runs.
        if old_text not in paragraph.text:
            continue
        for run in paragraph.runs:
            if old_text in run.text:
                run.text = run.text.replace(old_text, new_text)

    doc.save(file_path)
    print(f'已處理: {file_path}')


def process_all_word_files(directory, old_text, new_text):
    """
    Walk ``directory`` recursively and run the replacement on every .docx file.

    Word's temporary lock files (names starting with ``~$``) are skipped
    because they are not real documents and would only be reported as
    failures. The extension check is case-insensitive so ``.DOCX`` files are
    included. A failure on one file is reported and does not stop the walk.

    Args:
        directory: Root directory to scan.
        old_text: Text to search for.
        new_text: Replacement text.
    """
    processed_count = 0
    error_count = 0

    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.startswith('~$') or not file.lower().endswith('.docx'):
                continue
            file_path = os.path.join(root, file)
            try:
                replace_in_docx(file_path, old_text, new_text)
            except Exception as e:
                # Best-effort batch run: record the failure and carry on.
                error_count += 1
                print(f'處理失敗 {file_path}: {e}')
            else:
                processed_count += 1

    print(f'\n處理完成！成功處理 {processed_count} 個文檔，失敗 {error_count} 個文檔')


def main():
    """
    Command-line entry point.

    Parses ``--dir``, ``--old``, ``--new`` and ``--dry-run``, validates them,
    then either lists the documents that would be processed (dry run) or
    performs the replacement over the whole directory tree.

    An empty ``--old`` is rejected through ``parser.error`` (exit status 2),
    and ``--dir`` must be an existing directory rather than any existing path.
    """
    parser = argparse.ArgumentParser(
        description='Word文檔批量替換腳本 - 遞歸替換所有Word文檔中的文本并保留格式',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
示例:
  %(prog)s --old "公司A" --new "公司B"
  %(prog)s --dir "./項目文檔" --old "2023年" --new "2024年"
  %(prog)s --dir "." --old "舊版本" --new "新版本" --dry-run

注意:
  - 需要安裝python-docx庫: pip install python-docx
  - 僅支持.docx格式，不支持舊的.doc格式
  - 默認會直接修改原文件，使用--dry-run參數可預覽而不實際修改
        '''
    )

    parser.add_argument('--dir', default='.',
                        help='要處理的目錄路徑（默認當前目錄）')
    parser.add_argument('--old', required=True,
                        help='要替換的舊文本（必填）')
    parser.add_argument('--new', required=True,
                        help='替換后的新文本（必填）')
    parser.add_argument('--dry-run', action='store_true',
                        help='預覽模式，顯示將要修改的文件但不實際執行替換')

    args = parser.parse_args()

    # An empty search string matches everywhere and would corrupt documents.
    if not args.old:
        parser.error('--old 不能為空')

    # A regular file passes os.path.exists but cannot be walked.
    if not os.path.isdir(args.dir):
        print(f"錯誤：目錄 '{args.dir}' 不存在或不是目錄")
        return

    # Dry run: only list the documents, skipping Word's "~$" lock files.
    if args.dry_run:
        print("預覽模式（不會實際修改文件）:")
        docx_files = []
        for root, _dirs, files in os.walk(args.dir):
            for file in files:
                if file.startswith('~$') or not file.lower().endswith('.docx'):
                    continue
                file_path = os.path.join(root, file)
                docx_files.append(file_path)
                print(f"  {file_path}")

        if docx_files:
            print(f"\n找到 {len(docx_files)} 個Word文檔，將替換文本: '{args.old}' -> '{args.new}'")
        else:
            print("未找到任何Word文檔(.docx)")
        return

    # Real run.
    print(f"開始處理目錄: {args.dir}")
    print(f"替換文本: '{args.old}' -> '{args.new}'")
    process_all_word_files(args.dir, args.old, args.new)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()

word_replace_v2.py

#!/usr/bin/env python3
"""
Word文檔批量替換腳本 - 增強版

功能說明：
1. 遞歸處理當前目錄及所有子目錄中的Word文檔(.docx格式)
2. 將文檔中的指定文本替換為新文本，同時保留原有格式
3. 支持處理段落和表格中的文本
4. 增強功能：能夠處理跨多個格式塊(run)的文本替換

使用方法：
1. 基本使用：python word_replace.py --old "原文本" --new "新文本"
2. 指定目錄：python word_replace.py --dir "/path/to/docs" --old "原文本" --new "新文本"
3. 查看幫助：python word_replace.py --help

注意事項：
- 需要安裝python-docx庫：pip install python-docx
- 僅支持.docx格式，不支持舊的.doc格式
- 會直接修改原文件，建議先備份重要文檔
- 增強版能夠處理跨多個格式塊的文本替換

示例：
python word_replace.py --old "公司A" --new "公司B"
python word_replace.py --dir "./項目文檔" --old "2023年" --new "2024年"
"""

import os
import argparse
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_COLOR_INDEX


def find_text_in_runs(paragraph, old_text):
    """
    Locate the first occurrence of ``old_text`` in a paragraph's runs.

    The search is done on the concatenation of the runs' own text rather than
    on ``paragraph.text``, so the returned offsets always line up with the
    runs that are reported. (``paragraph.text`` may contain text that is not
    held by any entry of ``paragraph.runs``, e.g. hyperlink text in recent
    python-docx releases.)

    Args:
        paragraph: Object exposing ``runs``, each run exposing ``text``.
        old_text: Text to look for.

    Returns:
        ``None`` if ``old_text`` is empty or not found. Otherwise a dict with:
        ``matching_runs`` - list of dicts, one per run that holds at least one
        character of the match, with keys ``run``, ``run_start``, ``run_end``
        (the run's span in the concatenated text) and ``text_start``,
        ``text_end`` (the part of the match inside that run);
        ``text_start`` / ``text_end`` - span of the whole match;
        ``old_text`` - the searched text.
    """
    # An empty needle "matches" at offset 0 and has no meaningful run span.
    if not old_text:
        return None

    runs = paragraph.runs
    run_texts = [run.text for run in runs]

    start_idx = ''.join(run_texts).find(old_text)
    if start_idx == -1:
        return None
    end_idx = start_idx + len(old_text)

    matching_runs = []
    run_start_idx = 0
    for run, text in zip(runs, run_texts):
        run_end_idx = run_start_idx + len(text)

        # Strict overlap test: a run matches only if it holds at least one
        # character of the match. Zero-length runs sitting on the match
        # boundary are therefore never reported.
        if run_start_idx < end_idx and start_idx < run_end_idx:
            matching_runs.append({
                'run': run,
                'run_start': run_start_idx,
                'run_end': run_end_idx,
                'text_start': max(start_idx, run_start_idx),
                'text_end': min(end_idx, run_end_idx)
            })

        run_start_idx = run_end_idx

    return {
        'matching_runs': matching_runs,
        'text_start': start_idx,
        'text_end': end_idx,
        'old_text': old_text
    }


def replace_text_in_paragraph(paragraph, old_text, new_text):
    """
    Replace every occurrence of ``old_text`` in a paragraph, across runs.

    Occurrences are located in the concatenation of the runs' text, scanning
    left to right without overlap. For each occurrence the replacement is
    written into the first run holding part of the match (so it takes that
    run's formatting) and the matched characters are removed from the other
    runs involved. Text before and after the match in those runs is kept.

    Runs whose text does not change are never assigned to; per the python-docx
    documentation, assigning ``run.text`` replaces all existing run content,
    so this leaves runs that carry no matched text untouched.

    Args:
        paragraph: Object exposing ``runs``, each run exposing a read/write
            ``text`` attribute.
        old_text: Text to replace; an empty string is treated as "no match".
        new_text: Replacement text.

    Returns:
        True if at least one occurrence was replaced, otherwise False.
    """
    if not old_text:
        return False

    runs = list(paragraph.runs)
    original_texts = [run.text for run in runs]
    full_text = ''.join(original_texts)

    # Collect all non-overlapping matches up front. Searching the original
    # text (not the edited one) keeps this finite even when new_text itself
    # contains old_text.
    spans = []
    pos = full_text.find(old_text)
    while pos != -1:
        spans.append((pos, pos + len(old_text)))
        pos = full_text.find(old_text, pos + len(old_text))
    if not spans:
        return False

    # Span of every run inside the concatenated text.
    bounds = []
    offset = 0
    for text in original_texts:
        bounds.append((offset, offset + len(text)))
        offset += len(text)

    # Apply matches right to left: edits only ever change text at or after
    # the match being applied, so offsets of earlier matches stay valid.
    new_texts = list(original_texts)
    for start, end in reversed(spans):
        is_first_run = True
        for i, (run_start, run_end) in enumerate(bounds):
            # Skip zero-length runs and runs that hold no matched character.
            if run_start == run_end or run_start >= end or run_end <= start:
                continue
            local_start = max(start, run_start) - run_start
            local_end = min(end, run_end) - run_start
            inserted = new_text if is_first_run else ''
            current = new_texts[i]
            new_texts[i] = current[:local_start] + inserted + current[local_end:]
            is_first_run = False

    for run, before, after in zip(runs, original_texts, new_texts):
        if after != before:
            run.text = after

    return True


def _iter_document_paragraphs(container, seen_cells=None):
    """
    Yield every paragraph under ``container``, descending into its tables.

    Args:
        container: Any object exposing ``paragraphs`` and ``tables`` (the
            document itself or a table cell).
        seen_cells: Set of cell XML elements already visited; shared across
            the recursion so each physical cell is walked only once.
    """
    if seen_cells is None:
        seen_cells = set()
    yield from container.paragraphs
    for table in container.tables:
        for row in table.rows:
            for cell in row.cells:
                # python-docx lists a merged cell once per grid position it
                # spans; the underlying element identifies the real cell so
                # it is neither replaced nor counted more than once.
                tc = cell._tc
                if tc in seen_cells:
                    continue
                seen_cells.add(tc)
                # Cells may themselves contain tables, so recurse.
                yield from _iter_document_paragraphs(cell, seen_cells)


def replace_in_docx(file_path, old_text, new_text):
    """
    Replace text in a single Word document, including text split across runs.

    Body paragraphs, table cells and tables nested inside cells are visited,
    each cell exactly once. The file is rewritten only when at least one
    paragraph was changed; otherwise it is reported as skipped.

    Args:
        file_path: Path of the .docx file; overwritten when changes are made.
        old_text: Text to search for; must not be empty.
        new_text: Replacement text.

    Raises:
        ValueError: If ``old_text`` is empty, since an empty search string
            is contained in every paragraph.
    """
    if not old_text:
        raise ValueError('old_text 不能為空')

    doc = Document(file_path)
    replaced_count = 0

    for paragraph in _iter_document_paragraphs(doc):
        # Counts paragraphs in which a replacement happened.
        if old_text in paragraph.text and \
                replace_text_in_paragraph(paragraph, old_text, new_text):
            replaced_count += 1

    if replaced_count > 0:
        doc.save(file_path)
        print(f'已處理: {file_path} (替換了 {replaced_count} 處)')
    else:
        print(f'跳過: {file_path} (未找到匹配文本)')


def process_all_word_files(directory, old_text, new_text):
    """
    Walk ``directory`` recursively and run the replacement on every .docx file.

    Word's temporary lock files (names starting with ``~$``) are skipped
    because they are not real documents and would only be reported as
    failures. The extension check is case-insensitive so ``.DOCX`` files are
    included. A failure on one file is reported and does not stop the walk.

    Args:
        directory: Root directory to scan.
        old_text: Text to search for.
        new_text: Replacement text.
    """
    processed_count = 0
    error_count = 0

    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.startswith('~$') or not file.lower().endswith('.docx'):
                continue
            file_path = os.path.join(root, file)
            try:
                replace_in_docx(file_path, old_text, new_text)
            except Exception as e:
                # Best-effort batch run: record the failure and carry on.
                error_count += 1
                print(f'處理失敗 {file_path}: {e}')
            else:
                processed_count += 1

    print(f'\n處理完成！成功處理 {processed_count} 個文檔，失敗 {error_count} 個文檔')


def main():
    """
    Command-line entry point of the enhanced (cross-run) script.

    Parses ``--dir``, ``--old``, ``--new`` and ``--dry-run``, validates them,
    then either lists the documents that would be processed (dry run) or
    performs the replacement over the whole directory tree.

    An empty ``--old`` is rejected through ``parser.error`` (exit status 2),
    and ``--dir`` must be an existing directory rather than any existing path.
    """
    parser = argparse.ArgumentParser(
        description='Word文檔批量替換腳本 - 增強版，支持跨格式塊文本替換',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
示例:
  %(prog)s --old "公司A" --new "公司B"
  %(prog)s --dir "./項目文檔" --old "2023年" --new "2024年"
  %(prog)s --dir "." --old "舊版本" --new "新版本" --dry-run

注意:
  - 需要安裝python-docx庫: pip install python-docx
  - 僅支持.docx格式，不支持舊的.doc格式
  - 默認會直接修改原文件，使用--dry-run參數可預覽而不實際修改
  - 增強版能夠處理跨多個格式塊的文本替換
        '''
    )

    parser.add_argument('--dir', default='.',
                        help='要處理的目錄路徑（默認當前目錄）')
    parser.add_argument('--old', required=True,
                        help='要替換的舊文本（必填）')
    parser.add_argument('--new', required=True,
                        help='替換后的新文本（必填）')
    parser.add_argument('--dry-run', action='store_true',
                        help='預覽模式，顯示將要修改的文件但不實際執行替換')

    args = parser.parse_args()

    # An empty search string matches everywhere and would corrupt documents.
    if not args.old:
        parser.error('--old 不能為空')

    # A regular file passes os.path.exists but cannot be walked.
    if not os.path.isdir(args.dir):
        print(f"錯誤：目錄 '{args.dir}' 不存在或不是目錄")
        return

    # Dry run: only list the documents, skipping Word's "~$" lock files.
    if args.dry_run:
        print("預覽模式（不會實際修改文件）:")
        docx_files = []
        for root, _dirs, files in os.walk(args.dir):
            for file in files:
                if file.startswith('~$') or not file.lower().endswith('.docx'):
                    continue
                file_path = os.path.join(root, file)
                docx_files.append(file_path)
                print(f"  {file_path}")

        if docx_files:
            print(f"\n找到 {len(docx_files)} 個Word文檔，將替換文本: '{args.old}' -> '{args.new}'")
        else:
            print("未找到任何Word文檔(.docx)")
        return

    # Real run.
    print(f"開始處理目錄: {args.dir}")
    print(f"替換文本: '{args.old}' -> '{args.new}'")
    print("增強模式：支持跨多個格式塊的文本替換")
    process_all_word_files(args.dir, args.old, args.new)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()

posted @ 2025-09-23 11:26 wanghongwei-dev 閱讀(59) 評論(0) 收藏舉報

刷新頁面返回頂部

http://www.rzrgm.cn/wanghongwei-dev

歡迎來到本博客。本博客多為部署和配置文檔，用於日常查詢，並用於記錄故障處理。

Word文檔內容批量替換腳本

公告