# 參考鏈接:https://zhuanlan.zhihu.com/p/88938220
# https://blog.csdn.net/yjw123456/article/details/107923566
# https://blog.csdn.net/betterzl/article/details/109983541
# 列表的交并差:https://blog.csdn.net/qdPython/article/details/118802922
# jieba的使用:https://blog.csdn.net/lukabruce/article/details/82351742
import random,jieba
class Similarity:
    """Compare two texts with several bag-of-words similarity measures.

    Both texts are tokenised with ``jieba.lcut``. The union of their tokens
    forms the vocabulary (``word_bag``), and each text is represented as a
    binary presence vector over that vocabulary (``wf1`` / ``wf2``).

    Despite the ``*_distance`` method names, every measure here grows as the
    texts become more alike (they are similarity scores).
    """

    def __init__(self, a: str, b: str):
        """Tokenise both texts and build their presence vectors.

        Args:
            a: First text; tokenised with ``jieba.lcut``.
            b: Second text; tokenised with ``jieba.lcut``.
        """
        # Tokenise both inputs.
        self.a = jieba.lcut(a)
        self.b = jieba.lcut(b)
        # Vocabulary: union of the tokens of both texts.
        self.word_bag = list(set(self.a).union(set(self.b)))
        print('詞袋:', self.word_bag)
        # Presence vectors aligned with ``word_bag``.
        self.wf1 = self.word_frequency(self.a)
        self.wf2 = self.word_frequency(self.b)
        print('a詞頻:', self.wf1)
        print('b詞頻:', self.wf2)

    def word_frequency(self, word) -> list:
        """Return a 0/1 vector marking which vocabulary entries occur in *word*.

        Note that this is a presence indicator, not an occurrence count: a
        token that appears several times still contributes ``1``.

        Args:
            word: Container of tokens to test each vocabulary entry against.

        Returns:
            A list with one entry per element of ``self.word_bag``: ``1`` if
            that entry is in *word*, otherwise ``0``.
        """
        return [1 if term in word else 0 for term in self.word_bag]

    def euclidean_distance(self) -> float:
        """Return the reciprocal of the Euclidean distance between the vectors.

        Returns:
            ``1 / d`` where ``d`` is the Euclidean distance between ``wf1``
            and ``wf2``. When the vectors are identical (``d == 0``) the
            result is ``float('inf')`` instead of raising ZeroDivisionError.
        """
        squared = sum((x - y) ** 2 for x, y in zip(self.wf1, self.wf2))
        if squared == 0:
            # Identical vectors: 1/d diverges, so report the limit.
            score = float('inf')
        else:
            score = 1 / (squared ** 0.5)
        print('歐氏距離:', score)
        return score

    def cosine_distance(self) -> float:
        """Return the cosine similarity of the two presence vectors.

        Returns:
            ``dot(wf1, wf2) / (|wf1| * |wf2|)``. If either vector has zero
            length (no tokens), the cosine is undefined and ``0.0`` is
            returned.
        """
        dot = sum(x * y for x, y in zip(self.wf1, self.wf2))
        norm1 = sum(x * x for x in self.wf1) ** 0.5
        norm2 = sum(y * y for y in self.wf2) ** 0.5
        if norm1 == 0 or norm2 == 0:
            score = 0.0
        else:
            # The denominator is the product of BOTH vector norms.
            score = dot / (norm1 * norm2)
        print('余弦距離:', score)
        return score

    def Jacard_distance(self) -> float:
        """Return the Jaccard index of the two token sets.

        Returns:
            ``|A & B| / |A | B|`` over the distinct tokens of each text, or
            ``0.0`` when both texts have no tokens.
        """
        tokens_a = set(self.a)
        tokens_b = set(self.b)
        union = tokens_a | tokens_b
        score = len(tokens_a & tokens_b) / len(union) if union else 0.0
        print('Jacard:', score)
        return score

    def hamming_distance(self) -> int:
        """Count the vocabulary positions where both vectors agree.

        This is the complement of the classical Hamming distance (which
        counts differing positions). The share of agreeing positions,
        normalised by the vocabulary size, is printed; the raw count is
        returned, as before.

        Returns:
            Number of indices ``i`` with ``wf1[i] == wf2[i]``.
        """
        matches = sum(1 for x, y in zip(self.wf1, self.wf2) if x == y)
        size = len(self.word_bag)
        # Normalise by the vector length; guard the empty-vocabulary case.
        ratio = matches / size if size else 0.0
        print('海明距離:', ratio)
        return matches