SNAP Library 2.0, User Reference
2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 #include "stdafx.h" 00002 #include "agmfit.h" 00003 #include "agm.h" 00004 00006 // AGM fitting 00007 00008 void TAGMFit::Save(TSOut& SOut) { 00009 G->Save(SOut); 00010 CIDNSetV.Save(SOut); 00011 EdgeComVH.Save(SOut); 00012 NIDComVH.Save(SOut); 00013 ComEdgesV.Save(SOut); 00014 PNoCom.Save(SOut); 00015 LambdaV.Save(SOut); 00016 NIDCIDPrH.Save(SOut); 00017 NIDCIDPrS.Save(SOut); 00018 MinLambda.Save(SOut); 00019 MaxLambda.Save(SOut); 00020 RegCoef.Save(SOut); 00021 BaseCID.Save(SOut); 00022 } 00023 00024 void TAGMFit::Load(TSIn& SIn, const int& RndSeed) { 00025 G = TUNGraph::Load(SIn); 00026 CIDNSetV.Load(SIn); 00027 EdgeComVH.Load(SIn); 00028 NIDComVH.Load(SIn); 00029 ComEdgesV.Load(SIn); 00030 PNoCom.Load(SIn); 00031 LambdaV.Load(SIn); 00032 NIDCIDPrH.Load(SIn); 00033 NIDCIDPrS.Load(SIn); 00034 MinLambda.Load(SIn); 00035 MaxLambda.Load(SIn); 00036 RegCoef.Load(SIn); 00037 BaseCID.Load(SIn); 00038 Rnd.PutSeed(RndSeed); 00039 } 00040 00041 // Randomly initialize bipartite community affiliation graph. 00042 void TAGMFit::RandomInitCmtyVV(const int InitComs, const double ComSzAlpha, const double MemAlpha, const int MinComSz, const int MaxComSz, const int MinMem, const int MaxMem) { 00043 TVec<TIntV> InitCmtyVV(InitComs, 0); 00044 TAGMUtil::GenCmtyVVFromPL(InitCmtyVV, G, G->GetNodes(), InitComs, ComSzAlpha, MemAlpha, MinComSz, MaxComSz, 00045 MinMem, MaxMem, Rnd); 00046 SetCmtyVV(InitCmtyVV); 00047 } 00048 00049 // For each (u, v) in edges, precompute C_uv (the set of communities u and v share). 00050 void TAGMFit::GetEdgeJointCom() { 00051 ComEdgesV.Gen(CIDNSetV.Len()); 00052 EdgeComVH.Gen(G->GetEdges()); 00053 for (TUNGraph::TNodeI SrcNI = G->BegNI(); SrcNI < G->EndNI(); SrcNI++) { 00054 int SrcNID = SrcNI.GetId(); 00055 for (int v = 0; v < SrcNI.GetDeg(); v++) { 00056 int DstNID = SrcNI.GetNbrNId(v); 00057 if (SrcNID >= DstNID) { continue; } 00058 TIntSet JointCom; 00059 IAssert(NIDComVH.IsKey(SrcNID)); 00060 IAssert(NIDComVH.IsKey(DstNID)); 00061 TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID), NIDComVH.GetDat(DstNID), JointCom); 00062 EdgeComVH.AddDat(TIntPr(SrcNID,DstNID),JointCom); 00063 for (int k = 0; k < JointCom.Len(); k++) { 00064 ComEdgesV[JointCom[k]]++; 00065 } 00066 } 00067 } 00068 IAssert(EdgeComVH.Len() == G->GetEdges()); 00069 } 00070 00071 // Set epsilon by the default value. 00072 void TAGMFit::SetDefaultPNoCom() { 00073 PNoCom = 1.0 / (double) G->GetNodes() / (double) G->GetNodes(); 00074 } 00075 00076 double TAGMFit::Likelihood(const TFltV& NewLambdaV, double& LEdges, double& LNoEdges) { 00077 IAssert(CIDNSetV.Len() == NewLambdaV.Len()); 00078 IAssert(ComEdgesV.Len() == CIDNSetV.Len()); 00079 LEdges = 0.0; LNoEdges = 0.0; 00080 for (int e = 0; e < EdgeComVH.Len(); e++) { 00081 TIntSet& JointCom = EdgeComVH[e]; 00082 double LambdaSum = SelectLambdaSum(NewLambdaV, JointCom); 00083 double Puv = 1 - exp(- LambdaSum); 00084 if (JointCom.Len() == 0) { Puv = PNoCom; } 00085 IAssert(! _isnan(log(Puv))); 00086 LEdges += log(Puv); 00087 } 00088 for (int k = 0; k < NewLambdaV.Len(); k++) { 00089 int MaxEk = CIDNSetV[k].Len() * (CIDNSetV[k].Len() - 1) / 2; 00090 int NotEdgesInCom = MaxEk - ComEdgesV[k]; 00091 if(NotEdgesInCom > 0) { 00092 if (LNoEdges >= TFlt::Mn + double(NotEdgesInCom) * NewLambdaV[k]) { 00093 LNoEdges -= double(NotEdgesInCom) * NewLambdaV[k]; 00094 } 00095 } 00096 } 00097 double LReg = 0.0; 00098 if (RegCoef > 0.0) { 00099 LReg = - RegCoef * TLinAlg::SumVec(NewLambdaV); 00100 } 00101 return LEdges + LNoEdges + LReg; 00102 } 00103 00104 double TAGMFit::Likelihood() { 00105 return Likelihood(LambdaV); 00106 } 00107 00108 // Step size search for updating P_c (which is parametarized by lambda). 00109 double TAGMFit::GetStepSizeByLineSearchForLambda(const TFltV& DeltaV, const TFltV& GradV, const double& Alpha, const double& Beta) { 00110 double StepSize = 1.0; 00111 double InitLikelihood = Likelihood(); 00112 IAssert(LambdaV.Len() == DeltaV.Len()); 00113 TFltV NewLambdaV(LambdaV.Len()); 00114 for (int iter = 0; ; iter++) { 00115 for (int i = 0; i < LambdaV.Len(); i++) { 00116 NewLambdaV[i] = LambdaV[i] + StepSize * DeltaV[i]; 00117 if (NewLambdaV[i] < MinLambda) { NewLambdaV[i] = MinLambda; } 00118 if (NewLambdaV[i] > MaxLambda) { NewLambdaV[i] = MaxLambda; } 00119 } 00120 if (Likelihood(NewLambdaV) < InitLikelihood + Alpha * StepSize * TLinAlg::DotProduct(GradV, DeltaV)) { 00121 StepSize *= Beta; 00122 } else { 00123 break; 00124 } 00125 } 00126 return StepSize; 00127 } 00128 00129 // Gradient descent for p_c while fixing community affiliation graph (CAG). 00130 int TAGMFit::MLEGradAscentGivenCAG(const double& Thres, const int& MaxIter, const TStr PlotNm) { 00131 int Edges = G->GetEdges(); 00132 TExeTm ExeTm; 00133 TFltV GradV(LambdaV.Len()); 00134 int iter = 0; 00135 TIntFltPrV IterLV, IterGradNormV; 00136 double GradCutOff = 1000; 00137 for (iter = 0; iter < MaxIter; iter++) { 00138 GradLogLForLambda(GradV); //if gradient is going out of the boundary, cut off 00139 for (int i = 0; i < LambdaV.Len(); i++) { 00140 if (GradV[i] < -GradCutOff) { GradV[i] = -GradCutOff; } 00141 if (GradV[i] > GradCutOff) { GradV[i] = GradCutOff; } 00142 if (LambdaV[i] <= MinLambda && GradV[i] < 0) { GradV[i] = 0.0; } 00143 if (LambdaV[i] >= MaxLambda && GradV[i] > 0) { GradV[i] = 0.0; } 00144 } 00145 double Alpha = 0.15, Beta = 0.2; 00146 if (Edges > Kilo(100)) { Alpha = 0.00015; Beta = 0.3;} 00147 double LearnRate = GetStepSizeByLineSearchForLambda(GradV, GradV, Alpha, Beta); 00148 if (TLinAlg::Norm(GradV) < Thres) { break; } 00149 for (int i = 0; i < LambdaV.Len(); i++) { 00150 double Change = LearnRate * GradV[i]; 00151 LambdaV[i] += Change; 00152 if(LambdaV[i] < MinLambda) { LambdaV[i] = MinLambda;} 00153 if(LambdaV[i] > MaxLambda) { LambdaV[i] = MaxLambda;} 00154 } 00155 if (! PlotNm.Empty()) { 00156 double L = Likelihood(); 00157 IterLV.Add(TIntFltPr(iter, L)); 00158 IterGradNormV.Add(TIntFltPr(iter, TLinAlg::Norm(GradV))); 00159 } 00160 } 00161 if (! PlotNm.Empty()) { 00162 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 00163 TGnuPlot::PlotValV(IterGradNormV, PlotNm + ".gradnorm_Q"); 00164 printf("MLE for Lambda completed with %d iterations(%s)\n",iter,ExeTm.GetTmStr()); 00165 } 00166 return iter; 00167 } 00168 00169 void TAGMFit::RandomInit(const int& MaxK) { 00170 CIDNSetV.Clr(); 00171 for (int c = 0; c < MaxK; c++) { 00172 CIDNSetV.Add(); 00173 int NC = Rnd.GetUniDevInt(G -> GetNodes()); 00174 TUNGraph::TNodeI NI = G -> GetRndNI(); 00175 CIDNSetV.Last().AddKey(NI.GetId()); 00176 for (int v = 0; v < NC; v++) { 00177 NI = G->GetNI(NI.GetNbrNId(Rnd.GetUniDevInt(NI.GetDeg()))); 00178 CIDNSetV.Last().AddKey(NI.GetId()); 00179 } 00180 } 00181 InitNodeData(); 00182 SetDefaultPNoCom(); 00183 } 00184 00185 // Initialize node community memberships using best neighborhood communities (see D. Gleich et al. KDD'12). 00186 void TAGMFit::NeighborComInit(const int InitComs) { 00187 CIDNSetV.Gen(InitComs); 00188 const int Edges = G->GetEdges(); 00189 TFltIntPrV NIdPhiV(G->GetNodes(), 0); 00190 TIntSet InvalidNIDS(G->GetNodes()); 00191 TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG 00192 TExeTm RunTm; 00193 //compute conductance of neighborhood community 00194 TIntV NIdV; 00195 G->GetNIdV(NIdV); 00196 for (int u = 0; u < NIdV.Len(); u++) { 00197 TIntSet NBCmty(G->GetNI(NIdV[u]).GetDeg() + 1); 00198 double Phi; 00199 if (G->GetNI(NIdV[u]).GetDeg() < 5) { //do not include nodes with too few degree 00200 Phi = 1.0; 00201 } else { 00202 TAGMUtil::GetNbhCom(G, NIdV[u], NBCmty); 00203 IAssert(NBCmty.Len() == G->GetNI(NIdV[u]).GetDeg() + 1); 00204 Phi = TAGMUtil::GetConductance(G, NBCmty, Edges); 00205 } 00206 NIdPhiV.Add(TFltIntPr(Phi, NIdV[u])); 00207 } 00208 NIdPhiV.Sort(true); 00209 printf("conductance computation completed [%s]\n", RunTm.GetTmStr()); 00210 fflush(stdout); 00211 //choose nodes with local minimum in conductance 00212 int CurCID = 0; 00213 for (int ui = 0; ui < NIdPhiV.Len(); ui++) { 00214 int UID = NIdPhiV[ui].Val2; 00215 fflush(stdout); 00216 if (InvalidNIDS.IsKey(UID)) { continue; } 00217 ChosenNIDV.Add(UID); //FOR DEBUG 00218 //add the node and its neighbors to the current community 00219 CIDNSetV[CurCID].AddKey(UID); 00220 TUNGraph::TNodeI NI = G->GetNI(UID); 00221 fflush(stdout); 00222 for (int e = 0; e < NI.GetDeg(); e++) { 00223 CIDNSetV[CurCID].AddKey(NI.GetNbrNId(e)); 00224 } 00225 //exclude its neighbors from the next considerations 00226 for (int e = 0; e < NI.GetDeg(); e++) { 00227 InvalidNIDS.AddKey(NI.GetNbrNId(e)); 00228 } 00229 CurCID++; 00230 fflush(stdout); 00231 if (CurCID >= InitComs) { break; } 00232 } 00233 if (InitComs > CurCID) { 00234 printf("%d communities needed to fill randomly\n", InitComs - CurCID); 00235 } 00236 //assign a member to zero-member community (if any) 00237 for (int c = 0; c < CIDNSetV.Len(); c++) { 00238 if (CIDNSetV[c].Len() == 0) { 00239 int ComSz = 10; 00240 for (int u = 0; u < ComSz; u++) { 00241 int UID = G->GetRndNI().GetId(); 00242 CIDNSetV[c].AddKey(UID); 00243 } 00244 } 00245 } 00246 InitNodeData(); 00247 SetDefaultPNoCom(); 00248 } 00249 00250 // Add epsilon community (base community which includes all nodes) into community affiliation graph. It means that we fit for epsilon. 00251 void TAGMFit::AddBaseCmty() { 00252 TVec<TIntV> CmtyVV; 00253 GetCmtyVV(CmtyVV); 00254 TIntV TmpV = CmtyVV[0]; 00255 CmtyVV.Add(TmpV); 00256 G->GetNIdV(CmtyVV[0]); 00257 IAssert(CIDNSetV.Len() + 1 == CmtyVV.Len()); 00258 SetCmtyVV(CmtyVV); 00259 InitNodeData(); 00260 BaseCID = 0; 00261 PNoCom = 0.0; 00262 } 00263 00264 void TAGMFit::InitNodeData() { 00265 TSnap::DelSelfEdges(G); 00266 NIDComVH.Gen(G->GetNodes()); 00267 for (TUNGraph::TNodeI NI = G->BegNI(); NI < G->EndNI(); NI++) { 00268 NIDComVH.AddDat(NI.GetId()); 00269 } 00270 TAGMUtil::GetNodeMembership(NIDComVH, CIDNSetV); 00271 GetEdgeJointCom(); 00272 LambdaV.Gen(CIDNSetV.Len()); 00273 for (int c = 0; c < CIDNSetV.Len(); c++) { 00274 int MaxE = (CIDNSetV[c].Len()) * (CIDNSetV[c].Len() - 1) / 2; 00275 if (MaxE < 2) { 00276 LambdaV[c] = MaxLambda; 00277 } 00278 else{ 00279 LambdaV[c] = -log((double) (MaxE - ComEdgesV[c]) / MaxE); 00280 } 00281 if (LambdaV[c] > MaxLambda) { LambdaV[c] = MaxLambda; } 00282 if (LambdaV[c] < MinLambda) { LambdaV[c] = MinLambda; } 00283 } 00284 NIDCIDPrS.Gen(G->GetNodes() * 10); 00285 for (int c = 0; c < CIDNSetV.Len(); c++) { 00286 for (TIntSet::TIter SI = CIDNSetV[c].BegI(); SI < CIDNSetV[c].EndI(); SI++) { 00287 NIDCIDPrS.AddKey(TIntPr(SI.GetKey(), c)); 00288 } 00289 } 00290 } 00291 00292 // After MCMC, NID leaves community CID. 00293 void TAGMFit::LeaveCom(const int& NID, const int& CID) { 00294 TUNGraph::TNodeI NI = G->GetNI(NID); 00295 for (int e = 0; e < NI.GetDeg(); e++) { 00296 int VID = NI.GetNbrNId(e); 00297 if (NIDComVH.GetDat(VID).IsKey(CID)) { 00298 TIntPr SrcDstNIDPr = TIntPr(TMath::Mn(NID,VID), TMath::Mx(NID,VID)); 00299 EdgeComVH.GetDat(SrcDstNIDPr).DelKey(CID); 00300 ComEdgesV[CID]--; 00301 } 00302 } 00303 CIDNSetV[CID].DelKey(NID); 00304 NIDComVH.GetDat(NID).DelKey(CID); 00305 NIDCIDPrS.DelKey(TIntPr(NID, CID)); 00306 } 00307 00308 // After MCMC, NID joins community CID. 00309 void TAGMFit::JoinCom(const int& NID, const int& JoinCID) { 00310 TUNGraph::TNodeI NI = G->GetNI(NID); 00311 for (int e = 0; e < NI.GetDeg(); e++) { 00312 int VID = NI.GetNbrNId(e); 00313 if (NIDComVH.GetDat(VID).IsKey(JoinCID)) { 00314 TIntPr SrcDstNIDPr = TIntPr(TMath::Mn(NID,VID), TMath::Mx(NID,VID)); 00315 EdgeComVH.GetDat(SrcDstNIDPr).AddKey(JoinCID); 00316 ComEdgesV[JoinCID]++; 00317 } 00318 } 00319 CIDNSetV[JoinCID].AddKey(NID); 00320 NIDComVH.GetDat(NID).AddKey(JoinCID); 00321 NIDCIDPrS.AddKey(TIntPr(NID, JoinCID)); 00322 } 00323 00324 // Sample transition: Choose among (join, leave, switch), and then sample (NID, CID). 00325 void TAGMFit::SampleTransition(int& NID, int& JoinCID, int& LeaveCID, double& DeltaL) { 00326 int Option = Rnd.GetUniDevInt(3); //0:Join 1:Leave 2:Switch 00327 if (NIDCIDPrS.Len() <= 1) { Option = 0; } //if there is only one node membership, only join is possible. 00328 int TryCnt = 0; 00329 const int MaxTryCnt = G->GetNodes(); 00330 DeltaL = TFlt::Mn; 00331 if (Option == 0) { 00332 do { 00333 JoinCID = Rnd.GetUniDevInt(CIDNSetV.Len()); 00334 NID = G->GetRndNId(); 00335 } while (TryCnt++ < MaxTryCnt && NIDCIDPrS.IsKey(TIntPr(NID, JoinCID))); 00336 if (TryCnt < MaxTryCnt) { //if successfully find a move 00337 DeltaL = SeekJoin(NID, JoinCID); 00338 } 00339 } 00340 else if (Option == 1) { 00341 do { 00342 TIntPr NIDCIDPr = NIDCIDPrS.GetKey(NIDCIDPrS.GetRndKeyId(Rnd)); 00343 NID = NIDCIDPr.Val1; 00344 LeaveCID = NIDCIDPr.Val2; 00345 } while (TryCnt++ < MaxTryCnt && LeaveCID == BaseCID); 00346 if (TryCnt < MaxTryCnt) {//if successfully find a move 00347 DeltaL = SeekLeave(NID, LeaveCID); 00348 } 00349 } 00350 else{ 00351 do { 00352 TIntPr NIDCIDPr = NIDCIDPrS.GetKey(NIDCIDPrS.GetRndKeyId(Rnd)); 00353 NID = NIDCIDPr.Val1; 00354 LeaveCID = NIDCIDPr.Val2; 00355 } while (TryCnt++ < MaxTryCnt && (NIDComVH.GetDat(NID).Len() == CIDNSetV.Len() || LeaveCID == BaseCID)); 00356 do { 00357 JoinCID = Rnd.GetUniDevInt(CIDNSetV.Len()); 00358 } while (TryCnt++ < G->GetNodes() && NIDCIDPrS.IsKey(TIntPr(NID, JoinCID))); 00359 if (TryCnt < MaxTryCnt) {//if successfully find a move 00360 DeltaL = SeekSwitch(NID, LeaveCID, JoinCID); 00361 } 00362 } 00363 } 00364 00366 void TAGMFit::RunMCMC(const int& MaxIter, const int& EvalLambdaIter, const TStr& PlotFPrx) { 00367 TExeTm IterTm, TotalTm; 00368 double PrevL = Likelihood(), DeltaL = 0; 00369 double BestL = PrevL; 00370 printf("initial likelihood = %f\n",PrevL); 00371 TIntFltPrV IterTrueLV, IterJoinV, IterLeaveV, IterAcceptV, IterSwitchV, IterLBV; 00372 TIntPrV IterTotMemV; 00373 TIntV IterV; 00374 TFltV BestLV; 00375 TVec<TIntSet> BestCmtySetV; 00376 int SwitchCnt = 0, LeaveCnt = 0, JoinCnt = 0, AcceptCnt = 0, ProbBinSz; 00377 int Nodes = G->GetNodes(), Edges = G->GetEdges(); 00378 TExeTm PlotTm; 00379 ProbBinSz = TMath::Mx(1000, G->GetNodes() / 10); //bin to compute probabilities 00380 IterLBV.Add(TIntFltPr(1, BestL)); 00381 00382 for (int iter = 0; iter < MaxIter; iter++) { 00383 IterTm.Tick(); 00384 int NID = -1; 00385 int JoinCID = -1, LeaveCID = -1; 00386 SampleTransition(NID, JoinCID, LeaveCID, DeltaL); //sample a move 00387 double OptL = PrevL; 00388 if (DeltaL > 0 || Rnd.GetUniDev() < exp(DeltaL)) { //if it is accepted 00389 IterTm.Tick(); 00390 if (LeaveCID > -1 && LeaveCID != BaseCID) { LeaveCom(NID, LeaveCID); } 00391 if (JoinCID > -1 && JoinCID != BaseCID) { JoinCom(NID, JoinCID); } 00392 if (LeaveCID > -1 && JoinCID > -1 && JoinCID != BaseCID && LeaveCID != BaseCID) { SwitchCnt++; } 00393 else if (LeaveCID > -1 && LeaveCID != BaseCID) { LeaveCnt++;} 00394 else if (JoinCID > -1 && JoinCID != BaseCID) { JoinCnt++;} 00395 AcceptCnt++; 00396 if ((iter + 1) % EvalLambdaIter == 0) { 00397 IterTm.Tick(); 00398 MLEGradAscentGivenCAG(0.01, 3); 00399 OptL = Likelihood(); 00400 } 00401 else{ 00402 OptL = PrevL + DeltaL; 00403 } 00404 if (BestL <= OptL && CIDNSetV.Len() > 0) { 00405 BestCmtySetV = CIDNSetV; 00406 BestLV = LambdaV; 00407 BestL = OptL; 00408 } 00409 } 00410 if (iter > 0 && (iter % ProbBinSz == 0) && PlotFPrx.Len() > 0) { 00411 IterLBV.Add(TIntFltPr(iter, OptL)); 00412 IterSwitchV.Add(TIntFltPr(iter, (double) SwitchCnt / (double) AcceptCnt)); 00413 IterLeaveV.Add(TIntFltPr(iter, (double) LeaveCnt / (double) AcceptCnt)); 00414 IterJoinV.Add(TIntFltPr(iter, (double) JoinCnt / (double) AcceptCnt)); 00415 IterAcceptV.Add(TIntFltPr(iter, (double) AcceptCnt / (double) ProbBinSz)); 00416 SwitchCnt = JoinCnt = LeaveCnt = AcceptCnt = 0; 00417 } 00418 PrevL = OptL; 00419 if ((iter + 1) % 10000 == 0) { 00420 printf("\r%d iterations completed [%.2f]", iter, (double) iter / (double) MaxIter); 00421 } 00422 } 00423 00424 // plot the likelihood and acceptance probabilities if the plot file name is given 00425 if (PlotFPrx.Len() > 0) { 00426 TGnuPlot GP1; 00427 GP1.AddPlot(IterLBV, gpwLinesPoints, "likelihood"); 00428 GP1.SetDataPlotFNm(PlotFPrx + ".likelihood.tab", PlotFPrx + ".likelihood.plt"); 00429 TStr TitleStr = TStr::Fmt(" N:%d E:%d", Nodes, Edges); 00430 GP1.SetTitle(PlotFPrx + ".likelihood" + TitleStr); 00431 GP1.SavePng(PlotFPrx + ".likelihood.png"); 00432 00433 TGnuPlot GP2; 00434 GP2.AddPlot(IterSwitchV, gpwLinesPoints, "Switch"); 00435 GP2.AddPlot(IterLeaveV, gpwLinesPoints, "Leave"); 00436 GP2.AddPlot(IterJoinV, gpwLinesPoints, "Join"); 00437 GP2.AddPlot(IterAcceptV, gpwLinesPoints, "Accept"); 00438 GP2.SetTitle(PlotFPrx + ".transition"); 00439 GP2.SetDataPlotFNm(PlotFPrx + "transition_prob.tab", PlotFPrx + "transition_prob.plt"); 00440 GP2.SavePng(PlotFPrx + "transition_prob.png"); 00441 } 00442 CIDNSetV = BestCmtySetV; 00443 LambdaV = BestLV; 00444 00445 InitNodeData(); 00446 MLEGradAscentGivenCAG(0.001, 100); 00447 printf("\nMCMC completed (best likelihood: %.2f) [%s]\n", BestL, TotalTm.GetTmStr()); 00448 } 00449 00450 // Returns \v QV, a vector of (1 - p_c) for each community c. 00451 void TAGMFit::GetQV(TFltV& OutV) { 00452 OutV.Gen(LambdaV.Len()); 00453 for (int i = 0; i < LambdaV.Len(); i++) { 00454 OutV[i] = exp(- LambdaV[i]); 00455 } 00456 } 00457 00458 // Remove empty communities. 00459 int TAGMFit::RemoveEmptyCom() { 00460 int DelCnt = 0; 00461 for (int c = 0; c < CIDNSetV.Len(); c++) { 00462 if (CIDNSetV[c].Len() == 0) { 00463 CIDNSetV[c] = CIDNSetV.Last(); 00464 CIDNSetV.DelLast(); 00465 LambdaV[c] = LambdaV.Last(); 00466 LambdaV.DelLast(); 00467 DelCnt++; 00468 c--; 00469 } 00470 } 00471 return DelCnt; 00472 } 00473 00474 // Compute the change in likelihood (Delta) if node UID leaves community CID. 00475 double TAGMFit::SeekLeave(const int& UID, const int& CID) { 00476 IAssert(CIDNSetV[CID].IsKey(UID)); 00477 IAssert(G->IsNode(UID)); 00478 double Delta = 0.0; 00479 TUNGraph::TNodeI NI = G->GetNI(UID); 00480 int NbhsInC = 0; 00481 for (int e = 0; e < NI.GetDeg(); e++) { 00482 const int VID = NI.GetNbrNId(e); 00483 if (! NIDComVH.GetDat(VID).IsKey(CID)) { continue; } 00484 TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID)); 00485 TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr); 00486 double CurPuv, NewPuv, LambdaSum = SelectLambdaSum(JointCom); 00487 CurPuv = 1 - exp(- LambdaSum); 00488 NewPuv = 1 - exp(- LambdaSum + LambdaV[CID]); 00489 IAssert(JointCom.Len() > 0); 00490 if (JointCom.Len() == 1) { 00491 NewPuv = PNoCom; 00492 } 00493 Delta += (log(NewPuv) - log(CurPuv)); 00494 IAssert(!_isnan(Delta)); 00495 NbhsInC++; 00496 } 00497 Delta += LambdaV[CID] * (CIDNSetV[CID].Len() - 1 - NbhsInC); 00498 return Delta; 00499 } 00500 00501 // Compute the change in likelihood (Delta) if node UID joins community CID. 00502 double TAGMFit::SeekJoin(const int& UID, const int& CID) { 00503 IAssert(! CIDNSetV[CID].IsKey(UID)); 00504 double Delta = 0.0; 00505 TUNGraph::TNodeI NI = G->GetNI(UID); 00506 int NbhsInC = 0; 00507 for (int e = 0; e < NI.GetDeg(); e++) { 00508 const int VID = NI.GetNbrNId(e); 00509 if (! NIDComVH.GetDat(VID).IsKey(CID)) { continue; } 00510 TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID)); 00511 TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr); 00512 double CurPuv, NewPuv, LambdaSum = SelectLambdaSum(JointCom); 00513 CurPuv = 1 - exp(- LambdaSum); 00514 if (JointCom.Len() == 0) { CurPuv = PNoCom; } 00515 NewPuv = 1 - exp(- LambdaSum - LambdaV[CID]); 00516 Delta += (log(NewPuv) - log(CurPuv)); 00517 IAssert(!_isnan(Delta)); 00518 NbhsInC++; 00519 } 00520 Delta -= LambdaV[CID] * (CIDNSetV[CID].Len() - NbhsInC); 00521 return Delta; 00522 } 00523 00524 // Compute the change in likelihood (Delta) if node UID switches from CurCID to NewCID. 00525 double TAGMFit::SeekSwitch(const int& UID, const int& CurCID, const int& NewCID) { 00526 IAssert(! CIDNSetV[NewCID].IsKey(UID)); 00527 IAssert(CIDNSetV[CurCID].IsKey(UID)); 00528 double Delta = SeekJoin(UID, NewCID) + SeekLeave(UID, CurCID); 00529 //correct only for intersection between new com and current com 00530 TUNGraph::TNodeI NI = G->GetNI(UID); 00531 for (int e = 0; e < NI.GetDeg(); e++) { 00532 const int VID = NI.GetNbrNId(e); 00533 if (! NIDComVH.GetDat(VID).IsKey(CurCID) || ! NIDComVH.GetDat(VID).IsKey(NewCID)) {continue;} 00534 TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID)); 00535 TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr); 00536 double CurPuv, NewPuvAfterJoin, NewPuvAfterLeave, NewPuvAfterSwitch, LambdaSum = SelectLambdaSum(JointCom); 00537 CurPuv = 1 - exp(- LambdaSum); 00538 NewPuvAfterLeave = 1 - exp(- LambdaSum + LambdaV[CurCID]); 00539 NewPuvAfterJoin = 1 - exp(- LambdaSum - LambdaV[NewCID]); 00540 NewPuvAfterSwitch = 1 - exp(- LambdaSum - LambdaV[NewCID] + LambdaV[CurCID]); 00541 if (JointCom.Len() == 1 || NewPuvAfterLeave == 0.0) { 00542 NewPuvAfterLeave = PNoCom; 00543 } 00544 Delta += (log(NewPuvAfterSwitch) + log(CurPuv) - log(NewPuvAfterLeave) - log(NewPuvAfterJoin)); 00545 if (_isnan(Delta)) { 00546 printf("NS:%f C:%f NL:%f NJ:%f PNoCom:%f", NewPuvAfterSwitch, CurPuv, NewPuvAfterLeave, NewPuvAfterJoin, PNoCom.Val); 00547 } 00548 IAssert(!_isnan(Delta)); 00549 } 00550 return Delta; 00551 } 00552 00553 // Get communities whose p_c is higher than 1 - QMax. 00554 void TAGMFit::GetCmtyVV(TVec<TIntV>& CmtyVV, const double QMax) { 00555 TFltV TmpQV; 00556 GetCmtyVV(CmtyVV, TmpQV, QMax); 00557 } 00558 00559 void TAGMFit::GetCmtyVV(TVec<TIntV>& CmtyVV, TFltV& QV, const double QMax) { 00560 CmtyVV.Gen(CIDNSetV.Len(), 0); 00561 QV.Gen(CIDNSetV.Len(), 0); 00562 TIntFltH CIDLambdaH(CIDNSetV.Len()); 00563 for (int c = 0; c < CIDNSetV.Len(); c++) { 00564 CIDLambdaH.AddDat(c, LambdaV[c]); 00565 } 00566 CIDLambdaH.SortByDat(false); 00567 for (int c = 0; c < CIDNSetV.Len(); c++) { 00568 int CID = CIDLambdaH.GetKey(c); 00569 IAssert(LambdaV[CID] >= MinLambda); 00570 double Q = exp( - (double) LambdaV[CID]); 00571 if (Q > QMax) { continue; } 00572 TIntV CmtyV; 00573 CIDNSetV[CID].GetKeyV(CmtyV); 00574 if (CmtyV.Len() == 0) { continue; } 00575 if (CID == BaseCID) { //if the community is the base community(epsilon community), discard 00576 IAssert(CmtyV.Len() == G->GetNodes()); 00577 } else { 00578 CmtyVV.Add(CmtyV); 00579 QV.Add(Q); 00580 } 00581 } 00582 } 00583 00584 void TAGMFit::SetCmtyVV(const TVec<TIntV>& CmtyVV) { 00585 CIDNSetV.Gen(CmtyVV.Len()); 00586 for (int c = 0; c < CIDNSetV.Len(); c++) { 00587 CIDNSetV[c].AddKeyV(CmtyVV[c]); 00588 for (int j = 0; j < CmtyVV[c].Len(); j++) { IAssert(G->IsNode(CmtyVV[c][j])); } // check whether the member nodes exist in the graph 00589 } 00590 InitNodeData(); 00591 SetDefaultPNoCom(); 00592 } 00593 00594 // Gradient of likelihood for P_c. 00595 void TAGMFit::GradLogLForLambda(TFltV& GradV) { 00596 GradV.Gen(LambdaV.Len()); 00597 TFltV SumEdgeProbsV(LambdaV.Len()); 00598 for (int e = 0; e < EdgeComVH.Len(); e++) { 00599 TIntSet& JointCom = EdgeComVH[e]; 00600 double LambdaSum = SelectLambdaSum(JointCom); 00601 double Puv = 1 - exp(- LambdaSum); 00602 if (JointCom.Len() == 0) { Puv = PNoCom; } 00603 for (TIntSet::TIter SI = JointCom.BegI(); SI < JointCom.EndI(); SI++) { 00604 SumEdgeProbsV[SI.GetKey()] += (1 - Puv) / Puv; 00605 } 00606 } 00607 for (int k = 0; k < LambdaV.Len(); k++) { 00608 int MaxEk = CIDNSetV[k].Len() * (CIDNSetV[k].Len() - 1) / 2; 00609 int NotEdgesInCom = MaxEk - ComEdgesV[k]; 00610 GradV[k] = SumEdgeProbsV[k] - (double) NotEdgesInCom; 00611 if (LambdaV[k] > 0.0 && RegCoef > 0.0) { //if regularization exists 00612 GradV[k] -= RegCoef; 00613 } 00614 } 00615 } 00616 00617 // Compute sum of lambda_c (which is log (1 - p_c)) over C_uv (ComK). It is used to compute edge probability P_uv. 00618 double TAGMFit::SelectLambdaSum(const TIntSet& ComK) { 00619 return SelectLambdaSum(LambdaV, ComK); 00620 } 00621 00622 double TAGMFit::SelectLambdaSum(const TFltV& NewLambdaV, const TIntSet& ComK) { 00623 double Result = 0.0; 00624 for (TIntSet::TIter SI = ComK.BegI(); SI < ComK.EndI(); SI++) { 00625 IAssert(NewLambdaV[SI.GetKey()] >= 0); 00626 Result += NewLambdaV[SI.GetKey()]; 00627 } 00628 return Result; 00629 } 00630 00631 // Compute the empirical edge probability between a pair of nodes who share no community (epsilon), based on current community affiliations. 00632 double TAGMFit::CalcPNoComByCmtyVV(const int& SamplePairs) { 00633 TIntV NIdV; 00634 G->GetNIdV(NIdV); 00635 uint64 PairNoCom = 0, EdgesNoCom = 0; 00636 for (int u = 0; u < NIdV.Len(); u++) { 00637 for (int v = u + 1; v < NIdV.Len(); v++) { 00638 int SrcNID = NIdV[u], DstNID = NIdV[v]; 00639 TIntSet JointCom; 00640 TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID),NIDComVH.GetDat(DstNID),JointCom); 00641 if(JointCom.Len() == 0) { 00642 PairNoCom++; 00643 if (G->IsEdge(SrcNID, DstNID)) { EdgesNoCom++; } 00644 if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; } 00645 } 00646 } 00647 if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; } 00648 } 00649 double DefaultVal = 1.0 / (double)G->GetNodes() / (double)G->GetNodes(); 00650 if (EdgesNoCom > 0) { 00651 PNoCom = (double) EdgesNoCom / (double) PairNoCom; 00652 } else { 00653 PNoCom = DefaultVal; 00654 } 00655 printf("%s / %s edges without joint com detected (PNoCom = %f)\n", TUInt64::GetStr(EdgesNoCom).CStr(), TUInt64::GetStr(PairNoCom).CStr(), PNoCom.Val); 00656 return PNoCom; 00657 } 00658 00659 void TAGMFit::PrintSummary() { 00660 TIntFltH CIDLambdaH(CIDNSetV.Len()); 00661 for (int c = 0; c < CIDNSetV.Len(); c++) { 00662 CIDLambdaH.AddDat(c, LambdaV[c]); 00663 } 00664 CIDLambdaH.SortByDat(false); 00665 int Coms = 0; 00666 for (int i = 0; i < LambdaV.Len(); i++) { 00667 int CID = CIDLambdaH.GetKey(i); 00668 if (LambdaV[CID] <= 0.0001) { continue; } 00669 printf("P_c : %.3f Com Sz: %d, Total Edges inside: %d \n", 1.0 - exp(- LambdaV[CID]), CIDNSetV[CID].Len(), (int) ComEdgesV[CID]); 00670 Coms++; 00671 } 00672 printf("%d Communities, Total Memberships = %d, Likelihood = %.2f, Epsilon = %f\n", Coms, NIDCIDPrS.Len(), Likelihood(), PNoCom.Val); 00673 }