python2 | python3 | 文本清洗正則匹配
python3寫的清洗文本代碼在python2用不了,會出現各種編碼問題,經過痛苦的一晚上加班終于搞完了,記錄一下。
python2
# Pre-compiled patterns for the Python 2 flavour of clean_text.  Everything in
# this block is pure ASCII (non-ASCII characters are written as \u escapes) so
# it also loads from a Python 2 source file without a coding declaration.

# Weibo markup: "#topic#" spans, "<sina...>" tags and "@mention" runs.
# NOTE: "@[^ ]*" consumes everything up to the next ASCII space (or the end of
# the text when there is none).
_PY2_MARKUP_RE = re.compile(u"#[^#]*?#|<sina.*?>|@[^ ]*")

# http(s) links.  Inside the class, "*-;" is a range (0x2A-0x3B) and is what
# admits "+", ",", "-", ".", "/", digits, ":" and ";".
_PY2_URL_RE = re.compile(
    u"http[s]?://(?:[a-zA-Z]|[0-9]|[#$%*-;=?&@~.&+]|[!*,])+", re.IGNORECASE)

# Bracketed emoticon names such as "[smile]"; lazy so that two neighbouring
# emoticons do not swallow the text between them.
_PY2_BRACKET_RE = re.compile(u"\\[\\S+?\\]", re.UNICODE)

# Pictographic emoji: U+2600-U+2B55 plus every astral code point (the usual
# emoji blocks U+1F300-U+1F6FF and U+1F1E0-U+1F1FF all live in the latter).
try:
    _PY2_EMOJI_RE = re.compile(
        u"[\u2600-\u2b55\U00010000-\U0010ffff]+", re.UNICODE)
except re.error:
    # Narrow (UCS-2) Python 2 builds store astral characters as surrogate
    # pairs, which makes the range above invalid; match the pairs instead.
    _PY2_EMOJI_RE = re.compile(
        u"(?:[\u2600-\u2b55]|[\ud800-\udbff][\udc00-\udfff])+", re.UNICODE)

_PY2_SPACE_RE = re.compile(u"\\s+", re.UNICODE)

# Characters deleted by the punctuation pass, as a translate() table.
_PY2_PUNCT_TABLE = dict.fromkeys(map(ord, (
    # CJK / fullwidth: tilde, comma, double angle brackets, ideographic full
    # stop and comma, question mark, semicolon, colon, curly and fullwidth
    # quotes, lenticular and corner brackets, middle dot, exclamation mark,
    # commercial at, yen sign, ellipsis, parentheses, em dash.
    u"\uff5e\uff0c\u300a\u3002\u300b\u3001\uff1f\uff1b\uff1a\u2018\u2019"
    u"\uff02\u201c\u201d\u3010\u300c\u3011\u300d\u00b7\uff01\uff20\uffe5"
    u"\u2026\uff08\uff09\u2014"
    # ASCII punctuation.
    u"~,_<.>/?;:'\"[]{}`!@#$%^&*()-=+"
)))


def clean_text(content):
    """Strip Weibo noise from *content* and return the remaining text.

    Removed, in this order: ``#topic#`` spans, ``<sina...>`` tags and
    ``@mention`` runs; zero-width spaces (U+200B); http(s) links; bracketed
    emoticon names such as ``[smile]``; pictographic emoji; ASCII and
    CJK/fullwidth punctuation; and finally all whitespace.

    The input is decoded exactly once at the boundary, so both UTF-8 encoded
    byte strings and already-decoded text are accepted, and the same code
    runs on Python 2 and Python 3.

    :param content: UTF-8 encoded bytes or text (``unicode`` / ``str``).
    :return: the cleaned text as a text (unicode) string.
    :raises UnicodeDecodeError: if *content* is bytes that are not valid UTF-8.
    """
    if isinstance(content, bytes):  # ``bytes`` is ``str`` on Python 2
        content = content.decode("utf-8")

    query = _PY2_MARKUP_RE.sub(u"", content).replace(u"\u200b", u"")
    query = _PY2_URL_RE.sub(u"", query)
    query = _PY2_BRACKET_RE.sub(u"", query)
    query = _PY2_EMOJI_RE.sub(u"", query)
    query = query.translate(_PY2_PUNCT_TABLE)
    return _PY2_SPACE_RE.sub(u"", query)
python3
點擊查看代碼
# Pre-compiled patterns used by clean_text.

# Invisible zero-width characters, raw and as literal "\u200b"-style text.
_ZERO_WIDTH_RE = re.compile("[\u200b-\u200d]")
_ESCAPED_ZERO_WIDTH_RE = re.compile(r"\\u200[bcd]")

_SINA_TAG_RE = re.compile(r"<sina.*?>")
# Inside the class, "*-;" is a range (0x2A-0x3B) and is what admits "+", ",",
# "-", ".", "/", digits, ":" and ";".
_URL_RE = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[#$%*-;=?&@~.&+]|[!*,])+", re.IGNORECASE)
_EMAIL_RE = re.compile(
    r"[-a-z0-9_.]+@(?:[-a-z0-9]+\.)+[a-z]{2,6}", re.IGNORECASE)

# "@user" mentions, optionally introduced by the reply marker (simplified or
# traditional spelling) or the "//" repost marker, and ended by an ASCII or
# fullwidth colon, a space, or the end of the text.
_WEIBO_AT_RE = re.compile(
    r"(?:回复|回復)?(?://)?\s*@\S*?\s*(?::|\uff1a| |$)")

# Bracketed emoticon names; lazy so that two neighbouring emoticons do not
# swallow the text between them.
_BRACKET_RE = re.compile(r"\[\S+?\]")

# Pictographic emoji: U+2600-U+2B55 plus every astral code point (the usual
# emoji blocks U+1F300-U+1F6FF and U+1F1E0-U+1F1FF all live in the latter).
_EMOJI_RE = re.compile("[\u2600-\u2b55\U00010000-\U0010ffff]+")

_TOPIC_RE = re.compile(r"#[^#]*?#")
_WHITESPACE_RE = re.compile(r"\s+")

# Characters deleted when remove_puncts is set, as a str.translate() table.
_PUNCT_TABLE = dict.fromkeys(map(ord, (
    # CJK / fullwidth: tilde, comma, double angle brackets, ideographic full
    # stop and comma, question mark, semicolon, colon, curly and fullwidth
    # quotes, lenticular and corner brackets, middle dot, exclamation mark,
    # commercial at, yen sign, ellipsis, parentheses, em dash.
    "\uff5e\uff0c\u300a\u3002\u300b\u3001\uff1f\uff1b\uff1a\u2018\u2019"
    "\uff02\u201c\u201d\u3010\u300c\u3011\u300d\u00b7\uff01\uff20\uffe5"
    "\u2026\uff08\uff09\u2014"
    # ASCII punctuation.
    "~,_<.>/?;:'\"[]{}`!@#$%^&*()-=+"
)))


def clean_text(text, remove_url=True, email=True, weibo_at=True,
               stop_terms=("转发", "轉發"),
               emoji=True, weibo_topic=False, deduplicate_space=True,
               norm_url=False, norm_html=False, to_url=False,
               remove_puncts=False, remove_tags=True, t2s=False,
               expression_len=(1, 6), linesep2space=False):
    """Clean social-media text: URLs, e-mails, mentions, emoticons and more.

    Zero-width characters (U+200B-U+200D, raw or written out as the literal
    text ``\\u200b``) are always removed.  The optional steps then run in
    the order in which they are listed below, and the result is stripped of
    leading/trailing whitespace.

    :param text: input text.
    :param norm_html: (default off) decode HTML entities (``&amp;`` -> ``&``).
    :param remove_url: (default on) remove ``<sina...>`` tags and http(s)
        links.
    :param norm_url: (default off) decode percent-escapes (``%20`` -> space).
    :param email: (default on) remove e-mail addresses.
    :param weibo_at: (default on) replace ``@user`` mentions, including a
        leading reply / ``//`` repost marker, with a single space.
    :param emoji: (default on) remove ``[...]`` emoticon names and
        pictographic emoji characters.
    :param expression_len: ``(min, max)`` length of the text between the
        brackets for it to count as an emoticon; longer bracketed text is
        left alone.  Any other value (e.g. ``None``) removes bracketed text
        of any length.
    :param weibo_topic: (default off) remove ``#topic#`` spans.
    :param linesep2space: (default off) turn newlines into spaces.  Has no
        visible effect while ``deduplicate_space`` is on, because that step
        deletes the spaces again.
    :param deduplicate_space: (default on) delete every run of whitespace.
        Despite the name, runs are removed entirely, not collapsed to one
        space.
    :param stop_terms: a string, or an iterable of strings, removed verbatim
        from the text; ``None`` means no stop terms.  Defaults to the word
        for "repost" in simplified and traditional spelling.
    :param remove_puncts: (default off) remove ASCII and CJK/fullwidth
        punctuation.
    :param to_url: currently has no effect, other than conflicting with
        ``norm_url``.
    :param remove_tags: currently has no effect.
    :param t2s: currently has no effect.
    :return: the cleaned text.
    :raises ValueError: if both ``norm_url`` and ``to_url`` are set.
    :raises TypeError: if ``stop_terms`` is not iterable.
    """
    # Validate arguments up front with real exceptions; an assert would be
    # stripped when Python runs with -O.
    if norm_url and to_url:
        raise ValueError("norm_url和to_url是矛盾的設置")
    if stop_terms is None:
        stop_terms = ()
    elif isinstance(stop_terms, str):
        stop_terms = (stop_terms,)
    elif not hasattr(stop_terms, "__iter__"):
        raise TypeError("去除的詞語必須是一個可迭代對象")

    text = _ZERO_WIDTH_RE.sub("", text)
    text = _ESCAPED_ZERO_WIDTH_RE.sub("", text)

    if norm_html:
        import html
        text = html.unescape(text)

    if remove_url:
        # A regex that backtracks badly hangs rather than raises, so no
        # exception-driven fallback pattern is needed here.
        text = _SINA_TAG_RE.sub("", text)
        text = _URL_RE.sub("", text)

    if norm_url:
        from urllib.parse import unquote
        text = unquote(text)

    if email:
        text = _EMAIL_RE.sub("", text)

    if weibo_at:
        text = _WEIBO_AT_RE.sub(" ", text)

    if emoji:
        if isinstance(expression_len, (tuple, list)) and len(expression_len) == 2:
            # Bound the length so ordinary bracketed remarks are not mistaken
            # for emoticons.
            shortest, longest = expression_len
            text = re.sub(r"\[\S{%s,%s}?\]" % (shortest, longest), "", text)
        else:
            text = _BRACKET_RE.sub("", text)
        text = _EMOJI_RE.sub("", text)

    if weibo_topic:
        text = _TOPIC_RE.sub("", text)

    if linesep2space:
        text = text.replace("\n", " ")

    if deduplicate_space:
        text = _WHITESPACE_RE.sub("", text)

    for term in stop_terms:
        text = text.replace(term, "")

    if remove_puncts:
        text = text.translate(_PUNCT_TABLE)

    return text.strip()

浙公網安備 33010602011771號