首页 > 编程知识 正文

(学习笔记(3)-重叠社区发现Copra算法)

时间:2023-05-05 06:56:20 阅读:122296 作者:169

按照要求,阅读了重叠社区发现方向的文章 "Finding overlapping communities in networks by label propagation",并学习了其中提出的、可用于重叠社区发现的基于 LPA 的扩展算法 COPRA。简而言之,COPRA 算法向每个节点添加了标签列表,列表长度为算法的参数 v,每个节点最多可以有 v 个标签,也就是说,一个节点最多可以同时属于 v 个社区。此外,COPRA 不再使用 LPA 算法的退出条件,因为算法往往不会收敛到所有节点的标签都不再变化。因此,该算法跟踪每一轮计算结束后网络上剩余的标签集合的大小,如果连续两轮中该值不变化,则认为满足结束条件。(注意:"两轮不变"并不意味着之后一直不变,第三轮仍可能变化)

该算法仍然存在标签传播算法随机性强、鲁棒性差、容易将所有节点分配到同一个社区等缺点,还可能划分出无意义的社区;可行的纠正措施与 LPA 类似,例如约束标签的更新顺序与平局处理等(原文此处文字残缺)。

文中给出了比较详细的伪代码。在此基础上尝试实现了该算法;为了与 LPA 算法进行比较,这里暂时使用了 karate 等经典社区数据集,显示结果也在此基础上进行了一些改动。

# coding=utf-8
"""COPRA (Community Overlap PRopagation Algorithm) — overlapping community
detection via multi-label propagation.

Reconstructed from a garbled web extraction and ported to Python 3
(print() calls; time.perf_counter() instead of time.clock(), which was
removed in Python 3.8).  Each node carries up to `v` labels with belonging
coefficients; labels whose coefficient drops below 1/v are discarded on
every round.
"""
import copy
import random
import time

import numpy as np


def LoadAdjacentMatrixData(filename, vertices):
    """Read a tab-separated edge list and return a vertices x vertices
    0/1 adjacency matrix as a numpy array.

    NOTE(review): the original loader was destroyed by the extraction;
    this assumes one "x<TAB>y" pair per line with 0-based node ids —
    confirm against the actual data files (karate.txt etc.).
    """
    adj = [[0] * vertices for _ in range(vertices)]
    with open(filename) as fh:
        for line in fh:
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue  # skip blank / malformed lines
            x, y = int(parts[0]), int(parts[1])
            adj[x][y] = 1
            adj[y][x] = 1  # undirected graph
    return np.array(adj)


def Degree_Sorting(A, vertices):
    """Return (degree_s, neighbours, m).

    degree_s      -- [node, degree] pairs sorted by degree, descending
    neighbours[i] -- adjacency list of node i
    m             -- number of edges (each edge is seen twice in A)
    """
    degree_s = [[i, 0] for i in range(vertices)]
    neighbours = [[] for _ in range(vertices)]
    sums = 0
    for i in range(vertices):
        for j in range(vertices):
            if A[i][j] == 1:
                degree_s[i][1] += 1
                sums += 1
                neighbours[i].append(j)
    degree_s = sorted(degree_s, key=lambda e: e[1], reverse=True)
    # every undirected edge was counted twice above
    return degree_s, neighbours, sums // 2


def Propagate(x, old, new, neighbours, v, asynchronous):
    """One COPRA update of node x: merge the label/coefficient dicts of its
    neighbours, normalize, then drop labels with coefficient < 1/v.

    old, new     -- per-node dicts {label: coefficient}; new[x] is rewritten
    v            -- maximum number of labels (communities) per node
    asynchronous -- if True, copy the result back into `old` immediately so
                    nodes updated later in this round already see it
    """
    new[x] = {}
    # shuffle so ties between neighbour labels are broken randomly
    random.shuffle(neighbours[x])
    for eachpoint in neighbours[x]:
        for eachlable in old[eachpoint]:
            b = old[eachpoint][eachlable]
            if eachlable in new[x]:
                b += new[x][eachlable]
            new[x][eachlable] = b
    if asynchronous:
        old[x] = copy.deepcopy(new[x])
    Normalize(new[x])
    # threshold step: collect labels below 1/v, remembering the strongest
    # label as a fallback in case everything is removed
    maxc = 0
    maxv = 0
    t = []
    for each in new[x]:
        if new[x][each] < 1 / float(v):
            t.append(each)
        if new[x][each] > maxv:
            maxv = new[x][each]
            maxc = each
    for each in t:
        del new[x][each]
    if len(new[x]) == 0:
        # all labels fell below the threshold: keep only the strongest one
        new[x][maxc] = 1
    else:
        Normalize(new[x])


def Normalize(x):
    """Scale the coefficients of label dict x in place so they sum to 1."""
    sums = 0.0
    for each in x:
        sums += x[each]
    for each in x:
        if sums != 0:
            x[each] = x[each] / sums


def id_l(l):
    """Label keys of every node: [[labels of node 0], [labels of node 1], ...]."""
    return [id_x(each) for each in l]


def id_x(x):
    """The label keys of one node's label dict, as a list."""
    return [each for each in x]


def count(l):
    """Count on how many nodes each label occurs: {label: node count}."""
    counts = {}
    for eachpoint in l:
        for eachlable in eachpoint:
            counts[eachlable] = counts.get(eachlable, 0) + 1
    return counts


def mc(cs1, cs2):
    """Element-wise minimum of two label-count dicts (keys present in both)."""
    cs = {}
    for each in cs1:
        if each in cs2:
            cs[each] = min(cs1[each], cs2[each])
    return cs


def Modulartiy(A, coms, sums, vertices):
    """Newman modularity Q of the community partition `coms`.

    A -- adjacency matrix; sums -- number of edges m.
    NOTE(review): the original ran under Python 2 where the divisions
    floored; true division here is the intended formula.
    """
    Q = 0.0
    for eachc in coms:
        li = 0
        for eachp in coms[eachc]:
            for eachq in coms[eachc]:
                li += A[eachp][eachq]
        li /= 2  # internal edges were counted twice
        di = 0
        for eachp in coms[eachc]:
            for eachq in range(vertices):
                di += A[eachp][eachq]
        Q = Q + (li - (di * di) / (sums * 4))
    return Q / float(sums)


def ExtendQ(A, coms, sums, k, o):
    """Extended modularity EQ for overlapping communities.

    Uses the algebraically rearranged form from the article's 4-26 fix:
    accumulate the adjacency and degree products separately and divide by
    2m only at the end, avoiding float-precision loss.

    k -- [node, degree] pairs in any order (re-sorted by node id here)
    o -- o[p] = number of communities node p belongs to
    """
    s = float(2 * sums)
    k = sorted(k, key=lambda e: e[0])  # index degrees by node id
    at = 0
    kt = 0
    for eachc in coms:
        for eachp in coms[eachc]:
            for eachq in coms[eachc]:
                share = float(o[eachp] * o[eachq])
                at += A[eachp][eachq] / share
                kt += k[eachp][1] * k[eachq][1] / share
    return (at - kt / s) / s


def getcoms(degree_s, neighbours, sums, A, v, vertices):
    """Run COPRA and return {community label: [member nodes]}.

    Also prints the iteration count, timing, the raw communities, a flat
    cluster assignment and the Q / EQ quality scores.
    """
    label_new = [{} for i in range(vertices)]
    # every node starts with its own id as its only label
    label_old = [{i: 1} for i in range(vertices)]
    minl = {}
    oldmin = {}
    flag = True  # True = asynchronous propagation
    itera = 0  # iteration counter
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    while True:
        itera += 1
        for each in degree_s:
            Propagate(each[0], label_old, label_new, neighbours, v, flag)
        # stop condition: the per-label minimum node counts are unchanged
        # between two consecutive rounds
        if id_l(label_old) == id_l(label_new):
            minl = mc(minl, count(label_new))
        else:
            minl = count(label_new)
        if minl != oldmin:
            label_old = label_new
            oldmin = minl
        else:
            break
    print(itera, label_old)
    # group nodes by label; sub[c] tracks labels that always co-occur with c
    coms = {}
    sub = {}
    for each in range(vertices):
        ids = id_x(label_old[each])
        for eachc in ids:
            if eachc in coms and eachc in sub:
                coms[eachc].append(each)
                sub.update({eachc: set(sub[eachc]) & set(ids)})
            else:
                coms.update({eachc: [each]})
                sub.update({eachc: ids})
    print('lastiter', coms)
    # o[p] = number of communities node p belongs to
    o = [0 for i in range(vertices)]
    for eachid in range(vertices):
        for eachl in coms:
            if eachid in coms[eachl]:
                o[eachid] += 1
    # deduplicate: strip a community of members that belong to a community
    # it fully co-occurs with
    for each in sub:
        if len(sub[each]):
            for eachc in sub[each]:
                if eachc != each:
                    coms[eachc] = list(set(coms[eachc]) - set(coms[each]))
    # flat cluster assignment (1-based community numbers)
    clusterment = [0 for i in range(vertices)]
    a = 0
    for eachc in coms:
        if len(coms[eachc]) != 0:
            for e in coms[eachc]:
                clusterment[e] = a + 1
            a += 1
    degree_s = sorted(degree_s, key=lambda x: x[0], reverse=False)
    elapsed = time.perf_counter() - start
    print('t=', elapsed)
    print('result=', coms)
    print('clusterment=', clusterment)
    print('Q =', Modulartiy(A, coms, sums, vertices))
    print('EQ =', ExtendQ(A, coms, sums, degree_s, o))
    return coms


if __name__ == '__main__':
    # benchmark graphs: node counts and matching edge-list files
    # (swap in karate/football/books/dolphins to reproduce the article)
    vertices = [64, 128, 256, 512]
    txtlist = ['RN1.txt', 'RN2.txt', 'RN3.txt', 'RN4.txt']
    testv = [2, 3, 4, 5]  # values of the max-labels-per-node parameter v
    for i in range(len(txtlist)):
        print(txtlist[i], vertices[i])
        for ev in testv:
            print('v=', ev)
            A = LoadAdjacentMatrixData(txtlist[i], vertices[i])
            degree_s, neighbours, sums = Degree_Sorting(A, vertices[i])
            getcoms(degree_s, neighbours, sums, A, ev, vertices[i])

4-26日更新:
有bug,EQ函数的计算由于float精度的问题不能直接使用公式,需要先将公式变形再计算

def ExtendQ(A,coms,sums,k,o): #k-每个节点的度数 o-每个节点属于的社区数 s = float(2*sums) k = sorted(k, key=lambda x: x[0], reverse=False) at = 0 kt = 0 EQ = 0.0 for eachc in coms: for eachp in coms[eachc]: for eachq in coms[eachc]: at += A[eachp][eachq] / float(o[eachp]*o[eachq]) kt += k[eachp][1]*k[eachq][1] / float(o[eachp]*o[eachq]) EQ = at - kt / s return EQ/s

版权声明:该文观点仅代表作者本人。处理文章:请发送邮件至 三1五14八八95#扣扣.com 举报,一经查实,本站将立刻删除。