python 字符相似度對比
1,字
將字符以逗號,句號為一個段落,再對段落內的字符同一字進行對比
2,詞
同一詞語進行對比
import difflib
def stri_similar(s1, s2):
    """Return a fast similarity estimate for two sequences.

    A ``difflib.SequenceMatcher`` is built over *s1* and *s2* with no junk
    filter, and its ``quick_ratio()`` is returned.  Per the difflib
    documentation this is a float in [0, 1] that is an upper bound on
    ``ratio()``.
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=s1, b=s2)
    return matcher.quick_ratio()
# Sample strings for experimenting with stri_similar.
data1, data2 = '你好啊', '你好'

# Disabled experiments, kept for reference.
# Character-by-character comparison (note: data2 is shorter than data1,
# so data2[i] would run out of range on the last index):
# for i in range(len(data1)):
#     s1 = data1[i]
#     s2 = data2[i]
#     print(stri_similar(s1, s2))
# Whole-string comparison:
# print(stri_similar(data1, data2))
#相似比對比
#順序對比法:兩個列表按順序對比。 缺點:相似度對比準確度低 ; 優點:計算量小
#遍歷對比法:一個列表遍歷另一個列表的全部,一段對比每一段。 缺點:計算量大 ; 優點:相似度對比更有準確度
# Sample segment lists for the position-by-position comparison
# (the inputs of the commented-out ``contrast(text, d)`` call).
d = [
    '長短搭配,',
    '盡量減少接頭,',
    '以節(jié)約鋼材。',
]
text = [
    '搭配,',
    '盡少接頭,',
    '以節(jié)約鋼材。',
]
def contrast(text, d):
    """Score how similar two equally long lists of text segments are.

    Segments are compared pairwise by position.  Within a pair, every
    character of the longer segment is tested for membership in the shorter
    one (on equal lengths the segment from ``d`` is the one iterated).  The
    pair's score is the fraction of characters found, and the result is the
    mean of all pair scores.

    Args:
        text: Sequence of segment strings.
        d: Sequence of segment strings; must have the same length as
            ``text``.

    Returns:
        The mean pair score as a float in [0, 1]; ``0.0`` when both lists
        are empty; ``None`` (after printing a message) when the two lists
        differ in length.
    """
    # Check the precondition explicitly instead of relying on a bare
    # ``except`` that also hid unrelated errors (e.g. division by zero)
    # behind the same "length mismatch" message.
    if len(text) != len(d):
        # Kept as print + None rather than raising so existing callers of
        # this best-effort helper do not start crashing.  The message text
        # is left byte-for-byte as it was.
        print('報(bào)錯(cuò):列表長度不一樣')
        return None
    if not text:
        # Nothing to compare; avoids dividing by len(text) == 0.
        return 0.0

    total = 0.0
    for candidate, reference in zip(text, d):
        if len(candidate) > len(reference):
            longer, shorter = candidate, reference
        else:
            longer, shorter = reference, candidate
        if not longer:
            # Both segments are empty: treat them as identical and avoid
            # dividing by a zero length.
            total += 1.0
            continue
        hits = sum(1 for ch in longer if ch in shorter)
        total += hits / len(longer)
    return total / len(text)
# contrast(text,d)
def contrast2(list1, list2):
    """Compare every segment of ``list1`` against every segment of ``list2``.

    For each segment of ``list1`` the ``contrast`` scores against all
    segments of ``list2`` are summed; those sums are added into a grand
    total, which is printed together with ``total / len(list1)``.

    NOTE(review): because the scores against *all* ``list2`` segments are
    summed (not maximised), the printed ratio can exceed 100% when segments
    share characters.  Confirm whether that is intended.

    Args:
        list1: Sequence of segment strings; its length is the divisor.
        list2: Sequence of segment strings to compare against.

    Returns:
        The ratio ``total / len(list1)`` that is also printed, or ``0.0``
        when ``list1`` is empty.
    """
    total = 0
    for segment in list1:
        # One-element lists make ``contrast`` score a single pair.
        total += sum(contrast([other], [segment]) for other in list2)

    # An empty list1 used to raise ZeroDivisionError; report zero instead.
    ratio = total / len(list1) if list1 else 0.0
    print('總量:{:.2f} 相似度:{:.2%}'.format(total, ratio))
    return ratio
# Demo input for the every-segment-against-every-segment comparison.
list1 = [
    '長短搭配,',
    '盡量減少接頭,',
    '以節(jié)約鋼材。',
]
list2 = [
    '長短搭配,',
    '盡量減少接頭,',
]
contrast2(list1, list2)
# Positional comparison of the same lists, left disabled:
# contrast(list1, list2)
# Module-level state shared by the vectorisation helpers:
#   word_list - the vocabulary that ``vector`` iterates over
#   s_vector  - the list ``vector`` appends its 0/1 flags to
word_list, s_vector = [], []
import jieba
# Two sample sentences; the second is the first without its final character.
s1, s2 = '我喜歡你啊', '我喜歡你'
def max_len(s1, s2):
    """Return whichever argument is longer; ``s2`` wins when lengths tie."""
    return s1 if len(s1) > len(s2) else s2
# Show the longer of the two sample sentences.
print(max_len(s1, s2))

# Approach 1: zero-padding
# Approach 2: "中和" (neutralise/blend; exact meaning not evident from the code)

# Print every token jieba produces for s1, one per line.
for token in jieba.cut(s1):
    print(token)
def merge(s1, s2):
    """Build the combined vocabulary of two sentences.

    Both sentences are tokenised with ``jieba.cut``.  The result starts with
    all tokens of ``s1`` in order, followed by each token of ``s2`` that is
    not already present.

    The vocabulary is built in a local list and returned; the module-level
    ``word_list`` is not modified, so callers that rely on it must store the
    return value there themselves.

    Args:
        s1: First sentence.
        s2: Second sentence.

    Returns:
        The merged list of tokens.
    """
    word_list = list(jieba.cut(s1))
    # Iterate tokens directly.  The previous ``for i, d in ...`` tried to
    # unpack each token string into two names, which raises ValueError for
    # any token that is not exactly two characters long.
    for token in jieba.cut(s2):
        if token not in word_list:
            word_list.append(token)
    return word_list
# 第一步
def vector(data):
    """Encode ``data`` as 0/1 flags over the module-level vocabulary.

    ``data`` is tokenised with ``jieba.cut``.  For every entry of the
    module-level ``word_list`` a ``1`` is produced when that entry is among
    the tokens and a ``0`` otherwise.  As before, the flags are appended to
    the module-level ``s_vector``, which therefore keeps growing across
    calls.

    Args:
        data: Sentence to encode.

    Returns:
        The list of flags produced by this call only.
    """
    # Tokenise once; the original re-ran jieba.cut for every vocabulary word.
    tokens = set(jieba.cut(data))
    flags = [1 if word in tokens else 0 for word in word_list]
    s_vector.extend(flags)
    return flags
# 第二步
def cosine_Similarity(s1, s2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Computes ``dot(s1, s2) / (|s1| * |s2|)``.

    Args:
        s1: Sequence of numbers.
        s2: Sequence of numbers with the same length as ``s1``.

    Returns:
        The cosine of the angle between the vectors, or ``0.0`` when either
        vector has zero magnitude (including two empty vectors).

    Raises:
        ValueError: If the vectors differ in length.
    """
    if len(s1) != len(s2):
        # Previously a longer s2 was silently truncated to len(s1).
        raise ValueError('cosine_Similarity requires vectors of equal length')

    dot = norm1_sq = norm2_sq = 0
    for a, b in zip(s1, s2):
        dot += a * b
        norm1_sq += a ** 2
        norm2_sq += b ** 2

    if not norm1_sq or not norm2_sq:
        # A zero vector has no direction; report "no similarity" instead of
        # dividing by zero.
        return 0.0
    return dot / ((norm1_sq ** 0.5) * (norm2_sq ** 0.5))
def Euclidean_distance(s1, s2):
    """Return the Euclidean distance between two equal-length vectors.

    Computes the square root of the sum of squared component differences.

    Args:
        s1: Sequence of numbers.
        s2: Sequence of numbers with the same length as ``s1``.

    Returns:
        The distance as a float (``0.0`` for two empty vectors).

    Raises:
        ValueError: If the vectors differ in length.
    """
    if len(s1) != len(s2):
        # Previously a longer s2 was silently truncated to len(s1).
        raise ValueError('Euclidean_distance requires vectors of equal length')

    squared_sum = sum((a - b) ** 2 for a, b in zip(s1, s2))
    return squared_sum ** 0.5
# Scratch check: a square root via ``** 0.5`` next to a plain square.
print((2 * 2) ** 0.5, 2 ** 2)

# Two vectors pointing in the same direction but of different magnitude.
s1 = [1] * 5
s2 = [3] * 5

# Distance between them ...
d = Euclidean_distance(s1, s2)
print(d)

# ... and the cosine of the angle between them.
d = cosine_Similarity(s1, s2)
print(d)

浙公網安備 33010602011771號