這是一個 Python 腳本,它會遍歷指定目錄下的所有 .srt 文件,移除其中不必要的英文字符、空行以及空格。該腳本會保留字幕索引和字幕時間線,並且字幕文字中只保留中文內容。它還會保留同一條字幕內各行之間的換行符,同時去掉字幕與字幕之間多餘的換行符(相鄰字幕之間只保留一個空行)。處理後的內容將被保存為新的 .srt 文件(原文件名加上 _cleaned 後綴)。
代碼:
import os
import re

# Matches every character that is NOT in the CJK range this script keeps.
_NON_CHINESE_RE = re.compile(r'[^\u4e00-\u9fa5]')

# Suffix given to generated files; used to avoid re-processing our own output.
_CLEANED_SUFFIX = '_cleaned.srt'


def is_index_line(line):
    """Return True if *line* consists solely of digits (an SRT cue index)."""
    return bool(re.match(r'^\d+$', line))


def is_time_line(line):
    """Return True if *line* contains the ``-->`` timeline separator."""
    return '-->' in line


def _clean_srt_lines(lines):
    """Return the cleaned output lines (without terminators) for raw SRT lines.

    Index lines and timelines are kept, subtitle text is reduced to its
    Chinese characters, and every cue is followed by exactly one empty
    entry so that joining with a newline yields blank-line-separated cues.
    """
    stripped = [raw.strip() for raw in lines]
    output = []
    pending_text = []
    # None: between cues; 'index': index written; 'text': timeline written.
    state = None

    for pos, line in enumerate(stripped):
        following = stripped[pos + 1] if pos + 1 < len(stripped) else ''
        # A digit-only line is an index only when a timeline follows it;
        # otherwise it is subtitle text (e.g. "2023") and must not be kept.
        starts_cue = is_index_line(line) and is_time_line(following)
        is_timeline = is_time_line(line)

        # A blank line, a new index, or a second timeline ends the open cue.
        # The text is flushed BEFORE the separator so the order is preserved.
        if state is not None and (
            not line or starts_cue or (is_timeline and state == 'text')
        ):
            output.extend(pending_text)
            output.append('')
            pending_text = []
            state = None

        if not line:
            continue
        if is_timeline:
            output.append(line)
            state = 'text'
        elif starts_cue:
            output.append(line)
            state = 'index'
        elif state == 'text':
            chinese_only = _NON_CHINESE_RE.sub('', line)
            if chinese_only:
                pending_text.append(chinese_only)

    if state is not None:
        output.extend(pending_text)
        output.append('')
    return output


def clean_srt_file(file_path):
    """Write a Chinese-only copy of the SRT file at *file_path*.

    The result is saved next to the input with ``_cleaned`` inserted before
    the extension. The input file is never modified.

    Args:
        file_path: Path of the UTF-8 encoded ``.srt`` file to clean.

    Returns:
        The path of the newly written file.
    """
    # utf-8-sig drops a leading BOM, which would otherwise hide the first
    # index line from the digit-only check; BOM-less files read the same.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()

    cleaned_lines = _clean_srt_lines(lines)

    # splitext only touches the real extension, unlike str.replace which
    # would also rewrite ".srt" inside directory names.
    root, ext = os.path.splitext(file_path)
    new_file_path = f'{root}_cleaned{ext}'
    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.write('\n'.join(cleaned_lines))

    print(f'Processed and saved cleaned file to: {new_file_path}')
    return new_file_path


def process_directory(directory_path):
    """Clean every ``.srt`` file directly inside *directory_path*.

    Files that already carry the ``_cleaned.srt`` suffix are skipped so that
    running the script twice does not produce ``_cleaned_cleaned.srt`` files.
    """
    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith('.srt') and not filename.endswith(_CLEANED_SUFFIX):
            clean_srt_file(os.path.join(directory_path, filename))


# Directory to process when the file is run as a script.
directory_path = r'C:\caijian\29-51'

if __name__ == '__main__':
    process_directory(directory_path)
代碼2(改進):
import os
import re

# Matches every character that is neither a kept Chinese character nor one
# of the punctuation marks the script preserves. Whitespace is not in the
# kept set, so a single substitution also removes all spaces.
_DISALLOWED_RE = re.compile(r'[^\u4e00-\u9fa5,。?!、《》()【】:",。?!]')

# Suffix given to generated files; used to avoid re-processing our own output.
_CLEANED_SUFFIX = '_cleaned.srt'


def is_index_line(line):
    """Return True if *line* consists solely of digits (an SRT cue index)."""
    return bool(re.match(r'^\d+$', line))


def is_time_line(line):
    """Return True if *line* contains the ``-->`` timeline separator."""
    return '-->' in line


def _clean_srt_lines(lines):
    """Return the cleaned output lines (without terminators) for raw SRT lines.

    Index lines and timelines are kept, subtitle text is reduced to Chinese
    characters plus the preserved punctuation, and every cue is followed by
    exactly one empty entry so that joining with a newline yields
    blank-line-separated cues.
    """
    stripped = [raw.strip() for raw in lines]
    output = []
    pending_text = []
    # None: between cues; 'index': index written; 'text': timeline written.
    state = None

    for pos, line in enumerate(stripped):
        following = stripped[pos + 1] if pos + 1 < len(stripped) else ''
        # A digit-only line is an index only when a timeline follows it;
        # otherwise it is subtitle text (e.g. "2023") and must not be kept.
        starts_cue = is_index_line(line) and is_time_line(following)
        is_timeline = is_time_line(line)

        # A blank line, a new index, or a second timeline ends the open cue.
        # The separator is written even when no text survived, so a cue
        # whose text was entirely removed still stays apart from the next.
        if state is not None and (
            not line or starts_cue or (is_timeline and state == 'text')
        ):
            output.extend(pending_text)
            output.append('')
            pending_text = []
            state = None

        if not line:
            continue
        if is_timeline:
            output.append(line)
            state = 'text'
        elif starts_cue:
            output.append(line)
            state = 'index'
        elif state == 'text':
            cleaned_line = _DISALLOWED_RE.sub('', line)
            if cleaned_line:
                pending_text.append(cleaned_line)

    if state is not None:
        output.extend(pending_text)
        output.append('')
    return output


def clean_and_save_srt(file_path):
    """Write a cleaned copy of the SRT file at *file_path*.

    Subtitle text keeps only Chinese characters and the preserved
    punctuation. The result is saved next to the input with ``_cleaned``
    inserted before the extension. The input file is never modified.

    Args:
        file_path: Path of the UTF-8 encoded ``.srt`` file to clean.

    Returns:
        The path of the newly written file.
    """
    # utf-8-sig drops a leading BOM, which would otherwise hide the first
    # index line from the digit-only check; BOM-less files read the same.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()

    cleaned_lines = _clean_srt_lines(lines)

    # splitext only touches the real extension, unlike str.replace which
    # would also rewrite ".srt" inside directory names.
    root, ext = os.path.splitext(file_path)
    new_file_path = f'{root}_cleaned{ext}'
    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.write('\n'.join(cleaned_lines))

    print(f'Processed and saved cleaned file to: {new_file_path}')
    return new_file_path


def process_directory(directory_path):
    """Clean every ``.srt`` file directly inside *directory_path*.

    Files that already carry the ``_cleaned.srt`` suffix are skipped so that
    running the script twice does not produce ``_cleaned_cleaned.srt`` files.
    """
    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith('.srt') and not filename.endswith(_CLEANED_SUFFIX):
            clean_and_save_srt(os.path.join(directory_path, filename))


# Directory to process when the file is run as a script.
directory_path = r'C:\caijian\29-51'

if __name__ == '__main__':
    process_directory(directory_path)
浙公網安備 33010602011771號