# simhash+汉明距离计算文本相似度

XAMPP下载 1116浏览

**由于最近需要做大规模的文本相似度的计算，所以用到了simhash+汉明距离来快速计算文本的相似度。**

**simhash的原理如下图：其中的weight采用的是jieba的tf-idf的结果。**

clipboard.png

**附上python3的源代码:**

import math

import jieba

import jieba.analyse

class SimHash(object):
    """Compute 64-bit SimHash fingerprints of text and compare them.

    A fingerprint is a string of ``HASH_BITS`` characters, each ``'0'`` or
    ``'1'``. Two texts are compared by counting the positions at which their
    fingerprints differ (see ``getDistince``).
    """

    # Width, in bits, of every fingerprint produced by this class.
    HASH_BITS = 64

    def __init__(self):
        pass

    def getBinStr(self, source):
        """Hash ``source`` into a ``HASH_BITS``-character binary string.

        Args:
            source: The keyword (a ``str``) to hash.

        Returns:
            A string of exactly ``HASH_BITS`` ``'0'``/``'1'`` characters.
            An empty ``source`` maps to the all-zero string so that the
            return type is the same on every path.
        """
        if source == "":
            return "0" * self.HASH_BITS

        multiplier = 1000003
        mask = 2 ** 128 - 1
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * multiplier) ^ ord(c)) & mask
        x ^= len(source)
        # x is masked to a non-negative value and XORed with a non-negative
        # length, so it can never be negative; no sign handling is needed.
        # Keep only the low HASH_BITS bits, left-padded with zeros.
        return format(x, "b").zfill(self.HASH_BITS)[-self.HASH_BITS:]

    def getWeight(self, source):
        """Return a placeholder weight: the code point of ``source``.

        ``ord`` only accepts a single character, so a longer string raises
        ``TypeError``.
        """
        # fake weight with keyword
        return ord(source)

    def unwrap_weight(self, arr):
        """Collapse signed values into a binary string.

        Args:
            arr: Iterable of values accepted by ``int()``.

        Returns:
            A string with ``'1'`` for every item whose integer value is
            positive and ``'0'`` for every other item.
        """
        return "".join("1" if int(item) > 0 else "0" for item in arr)

    def simHash(self, rawstr):
        """Compute the SimHash fingerprint of ``rawstr``.

        The text is segmented with ``jieba.cut``; up to 100 keywords and
        their weights are taken from ``jieba.analyse.extract_tags``. Each
        keyword is hashed with ``getBinStr``; for every bit position the
        keyword's weight (rounded up to an integer) is added when the bit
        is ``'1'`` and subtracted when it is ``'0'``. A position whose total
        is positive becomes ``'1'`` in the fingerprint, otherwise ``'0'``.

        Args:
            rawstr: The text to fingerprint.

        Returns:
            A string of ``HASH_BITS`` ``'0'``/``'1'`` characters. Text that
            is empty or yields no keywords produces the all-zero
            fingerprint, since every per-bit total is then zero.
        """
        empty_fingerprint = "0" * self.HASH_BITS
        if not rawstr:
            return empty_fingerprint

        seg = jieba.cut(rawstr)
        keywords = jieba.analyse.extract_tags(
            "|".join(seg), topK=100, withWeight=True
        )
        if not keywords:
            return empty_fingerprint

        # Accumulate the per-bit totals directly instead of building a
        # keyword-by-bit matrix and summing its columns afterwards.
        totals = [0] * self.HASH_BITS
        for keyword, weight in keywords:
            # The rounded weight is the same for every bit of the keyword.
            signed_weight = int(math.ceil(weight))
            for position, bit in enumerate(self.getBinStr(keyword)):
                if bit == "1":
                    totals[position] += signed_weight
                else:
                    totals[position] -= signed_weight

        return "".join("1" if total > 0 else "0" for total in totals)

    def getDistince(self, hashstr1, hashstr2):
        """Return the Hamming distance between two fingerprints.

        Args:
            hashstr1: First fingerprint; its length decides how many
                positions are compared.
            hashstr2: Second fingerprint.

        Returns:
            The number of positions at which the two strings differ.
            Characters of ``hashstr2`` beyond ``len(hashstr1)`` are ignored.

        Raises:
            IndexError: If ``hashstr2`` is shorter than ``hashstr1``.
        """
        return sum(
            1
            for index, char in enumerate(hashstr1)
            if char != hashstr2[index]
        )

if __name__ == "__main__":
    # Demo: fingerprint two nearly identical sentences and report whether
    # their Hamming distance falls within the similarity threshold.
    simhash = SimHash()
    s1 = 'I am very happy'
    s2 = 'I am very happu'

    hash1 = simhash.simHash(s1)
    hash2 = simhash.simHash(s2)
    distince = simhash.getDistince(hash1, hash2)

    # Fingerprints that differ in at most `value` bits count as similar.
    value = 5
    print("海明距离：", distince, "判定距离：", value, "是否相似：", distince <= value)