
Neural network optimization algorithms, GitHub

Date: 2023-05-06 17:23:48 | Reads: 38952 | Author: 835

I had heard that SimHash is powerful and very efficient at computing similarity over large numbers of documents. After digging into a few articles, the rough flow is as follows.

Roughly: segment the text into words, extract keywords with their term-frequency weights, hash each keyword into a bit string of the same fixed length, merge the weighted bits down into a single fingerprint (the dimensionality-reduction step), and finally compare fingerprints by Hamming distance.
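The merge step above can be sketched on toy data. The 4-bit hashes and integer weights below are made up for illustration; real SimHash uses 64-bit hashes and TF-IDF weights. Each keyword votes +weight on bit positions where its hash is 1 and -weight where it is 0, and the sign of each column sum becomes one bit of the final fingerprint:

```python
# Toy sketch of the SimHash merge (dimensionality-reduction) step.
# Hashes and weights are invented for illustration only.
hashes = ["1011", "1001", "0011"]   # per-keyword hash strings (assumed 4-bit)
weights = [3, 2, 1]                 # per-keyword weights (assumed)

cols = [0] * len(hashes[0])
for h, w in zip(hashes, weights):
    for i, bit in enumerate(h):
        # bit 1 votes +weight, bit 0 votes -weight
        cols[i] += w if bit == "1" else -w

# Binarize each column sum by its sign to get the fingerprint.
fingerprint = "".join("1" if c > 0 else "0" for c in cols)
print(fingerprint)  # → 1011
```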

# coding: utf-8
import math

import jieba
import jieba.analyse


class SimHash(object):

    def get_bin_str(self, source):
        """Hash a string into a fixed-length 64-bit binary string."""
        if source == "":
            return 0
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2
        return bin(x).replace("0b", "").zfill(64)[-64:]

    def unwrap_weight(self, arr):
        """Collapse a signed weight vector into a 0/1 string (kept from the original, unused below)."""
        ret = ""
        for item in arr:
            ret += "1" if int(item) > 0 else "0"
        return ret

    def simhash(self, rawstr):
        """Compute the 64-bit SimHash fingerprint of a raw string."""
        seg_list = jieba.cut(rawstr)
        # Extract keywords together with their TF-IDF weights.
        keywords = jieba.analyse.extract_tags("|".join(seg_list), topK=100, withWeight=True)
        ret = []
        for keyword, weight in keywords:
            binstr = self.get_bin_str(keyword)
            weight = math.ceil(weight)
            keylist = []
            for c in binstr:
                # Bit 1 votes +weight, bit 0 votes -weight.
                keylist.append(int(weight) if c == "1" else -int(weight))
            ret.append(keylist)
        # Sum each bit column over all keywords, then binarize by sign.
        rows, cols = len(ret), len(ret[0])
        result = []
        for i in range(cols):
            tmp = 0
            for j in range(rows):
                tmp += int(ret[j][i])
            result.append("1" if tmp > 0 else "0")
        return "".join(result)

    def get_distance(self, hashstr1, hashstr2):
        """Hamming distance between two equal-length binary strings."""
        length = 0
        for index, char in enumerate(hashstr1):
            if char != hashstr2[index]:
                length += 1
        return length


if __name__ == "__main__":
    simhash = SimHash()
    # To compare two files instead of the inline strings:
    # with open("a.txt", "r") as f:
    #     s1 = "".join(f.readlines())
    # with open("b.txt", "r") as f:
    #     s2 = "".join(f.readlines())
    s1 = "this is just test for simhash, here is the difference"
    s2 = "this is a test for simhash, here is the difference"
    hash1 = simhash.simhash(s1)
    hash2 = simhash.simhash(s2)
    distance = simhash.get_distance(hash1, hash2)
    value = 5  # empirical threshold: distance <= 5 counts as similar
    print("Hamming distance:", distance, "threshold:", value, "similar:", distance <= value)

Running this prints:

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/d0/d4zzr4n51m7_vj9ryfb633pc0000gn/.../jieba.cache
Loading model cost 0.764 seconds.
Prefix dict has been built successfully.
Hamming distance: 1  threshold: 5
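As a usage note, the character-by-character comparison in get_distance can be replaced by a common shortcut: interpret the two fingerprints as integers, XOR them, and count the set bits. This is equivalent for equal-length binary strings and is how SimHash distance is usually computed at scale:

```python
# Hamming distance via XOR: differing bit positions survive the XOR,
# so counting the set bits gives the distance directly.
def hamming(hashstr1: str, hashstr2: str) -> int:
    return bin(int(hashstr1, 2) ^ int(hashstr2, 2)).count("1")

print(hamming("10110", "10011"))  # → 2 (bits differ in two positions)
```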

References:
https://blog.csdn.net/gzt940726/article/details/80460419
https://blog.csdn.net/madujin/article/details/53152666
