// SNAP Library 2.0, Developer Reference
// 2013-05-13 16:33:57
// SNAP, a general purpose, high performance system for analysis and
// manipulation of large networks.
00001 #pragma once 00002 00003 #include "stdafx.h" 00004 00005 class TGraphAttributes { 00006 public: 00011 TGraphAttributes(PUNGraph G, const char* nodeFeaturePath, const char* groundtruthPath); 00012 ~TGraphAttributes() { 00013 } 00014 00015 PUNGraph G; 00016 TInt NFeatures; 00017 00018 THash<TInt, TIntIntH> NodeFeatures; 00019 THash<TIntPr, TIntIntH> EdgeFeatures; 00020 TVec<TInt> NodeIDs; 00021 TCRef CRef; 00022 00023 TVec<TIntSet> GroundTruth; // Groundtruth communities 00024 }; 00025 00026 typedef TPt<TGraphAttributes> PGraphAttributes; 00027 00028 class TCluster { 00029 public: 00035 TCluster(PGraphAttributes GraphAttributes, TInt K, TFlt Lambda) : 00036 GraphAttributes(GraphAttributes), K(K), Lambda(Lambda) { 00037 Theta = new TFlt[K * GraphAttributes->NFeatures]; 00038 Derivative = new TFlt[K * GraphAttributes->NFeatures]; 00039 for (int k = 0; k < K; k++) { 00040 for (int f = 0; f < GraphAttributes->NFeatures; f++) { 00041 Theta[k * GraphAttributes->NFeatures + f] = 0; 00042 Derivative[k * GraphAttributes->NFeatures + f] = 0; 00043 } 00044 00045 // Clusters are initially empty. 
00046 CHat.Add(TIntSet()); 00047 } 00048 } 00049 ~TCluster() { 00050 delete[] Theta; 00051 delete[] Derivative; 00052 } 00053 00059 void Train(TInt OuterReps, TInt GradientReps, TInt MCMCReps); 00060 00064 TVec<TIntSet> GetCircles(void) { 00065 return CHat; 00066 } 00067 TCRef CRef; 00068 00069 private: 00070 TFlt* Theta; // Community parameters 00071 TFlt* Derivative; // Partial derivatives 00072 00073 TVec<TIntSet> CHat; // Latent community memberships 00074 PGraphAttributes GraphAttributes; // Graph with attributes 00075 00076 TInt K; 00077 TFlt Lambda; 00078 00082 TFlt LogLikelihood(); 00083 00089 TIntSet MCMC(TInt k, TInt MCMCReps); 00090 void Gradient(); 00091 }; 00092 00093 typedef TPt<TCluster> PCluster; 00094 00095 enum lossType 00096 { 00097 zeroOne = 0, 00098 balancedError = 1, 00099 fScore = 2 00100 }; 00101 00103 TFlt Loss(TIntSet& l, TIntSet lHat, int N, int Which) 00104 { 00105 if (l.Len() == 0) { 00106 if (lHat.Len() == 0) { 00107 return 0; 00108 } 00109 return 1.0; 00110 } 00111 if (lHat.Len() == 0) { 00112 if (l.Len() == 0) { 00113 return 0; 00114 } 00115 return 1.0; 00116 } 00117 TInt TruePositives = 0; 00118 TInt FalsePositives = 0; 00119 TInt FalseNegatives = 0; 00120 00121 TFlt LabelLoss = 0; 00122 for (THashSetKeyI<TInt> it = l.BegI(); it != l.EndI(); it ++) { 00123 int c = it.GetKey(); 00124 if (not lHat.IsKey(c)) { 00125 // false negative 00126 FalseNegatives ++; 00127 if (Which == zeroOne) { 00128 LabelLoss += 1.0/N; 00129 } 00130 else if (Which == balancedError) { 00131 LabelLoss += 0.5/l.Len(); 00132 } 00133 } 00134 } 00135 00136 for (THashSetKeyI<TInt> it = lHat.BegI(); it != lHat.EndI(); it ++) { 00137 int c = it.GetKey(); 00138 if (not l.IsKey(c)) { 00139 // false positive 00140 FalsePositives ++; 00141 if (Which == zeroOne) { 00142 LabelLoss += 1.0/N; 00143 } 00144 else if (Which == balancedError) { 00145 LabelLoss += 0.5/(N - l.Len()); 00146 } 00147 } 00148 else { 00149 TruePositives ++; 00150 } 00151 } 00152 00153 if ((lHat.Len() 
== 0 or TruePositives == 0) and Which == fScore) { 00154 return 1.0; 00155 } 00156 TFlt precision = (1.0*TruePositives)/lHat.Len(); 00157 TFlt recall = (1.0*TruePositives)/l.Len(); 00158 if (Which == fScore) { 00159 return 1 - 2 * (precision*recall) / (precision + recall); 00160 } 00161 00162 return LabelLoss; 00163 } 00164 00165 /* 00167 // Requires code for the munkres algorithm, available at https://github.com/saebyn/munkres-cpp 00168 TFlt TotalLoss(TVec<TIntSet>& Clusters, TVec<TIntSet>& CHat, int N, int Which) 00169 { 00170 Matrix<double> matrix(Clusters.Len(), CHat.Len()); 00171 00172 for (int i = 0; i < (int) Clusters.Len(); i ++) { 00173 for (int j = 0; j < (int) CHat.Len(); j ++) { 00174 matrix(i,j) = Loss(Clusters[i], CHat[j], N, Which); 00175 } 00176 } 00177 00178 Munkres m; 00179 m.solve(matrix); 00180 00181 TFlt l = 0; 00182 for (int i = 0; i < (int) Clusters.Len(); i ++) { 00183 for (int j = 0; j < (int) CHat.Len(); j ++) { 00184 if (matrix(i,j) == 0) { 00185 l += Loss(Clusters[i], CHat[j], N, Which); 00186 } 00187 } 00188 } 00189 00190 return l / (Clusters.Len() < CHat.Len() ? 
Clusters.Len() : CHat.Len()); 00191 } 00192 */ 00193 00194 TGraphAttributes::TGraphAttributes(PUNGraph G, const char* NodeFeaturePath, 00195 const char* GroundTruthPath) : 00196 G(G) { 00197 // Read node Features for the graph 00198 FILE* f = fopen(NodeFeaturePath, "r"); 00199 int NNodes; 00200 int nF; 00201 fscanf(f, "%d %d", &NNodes, &nF); 00202 NFeatures = nF; 00203 00204 for (int i = 0; i < NNodes; i++) { 00205 int nid; 00206 fscanf(f, "%d", &nid); 00207 00208 if (not G->IsNode(nid)) { 00209 printf("Warning: %d is not a node in G.\n", nid); 00210 } 00211 TInt kv = NodeFeatures.AddKey(nid); 00212 for (int x = 0; x < nF; x++) { 00213 int z = 0; 00214 fscanf(f, "%d", &z); 00215 if (z) { 00216 NodeFeatures[kv].AddDat(x) = z; 00217 } 00218 } 00219 if (G->IsNode(nid)) { 00220 NodeIDs.Add(nid); 00221 } 00222 } 00223 fclose(f); 00224 00225 f = fopen(GroundTruthPath, "r"); 00226 if (f == NULL) { 00227 printf("Groundtruth file %s not found.\n", GroundTruthPath); 00228 } 00229 else { 00230 char* CircleName = new char [1000]; 00231 while (fscanf(f, "%s", CircleName) == 1) 00232 { 00233 TIntSet Circle; 00234 while (true) { 00235 int nid; 00236 fscanf(f, "%d", &nid); 00237 Circle.AddKey(nid); 00238 char c; 00239 while (true) { 00240 c = fgetc(f); 00241 if (c == '\n') break; 00242 if (c >= '0' and c <= '9') { 00243 fseek(f, -1, SEEK_CUR); 00244 break; 00245 } 00246 } 00247 if (c == '\n') break; 00248 } 00249 GroundTruth.Add(Circle); 00250 } 00251 delete [] CircleName; 00252 } 00253 fclose(f); 00254 00255 // Construct the Features encoding the difference between node attributes. 
00256 for (int i = 0; i < NodeIDs.Len(); i++) { 00257 TInt ni = NodeIDs[i]; 00258 for (int j = i + 1; j < NodeIDs.Len(); j++) { 00259 TInt nj = NodeIDs[j]; 00260 TInt kv = EdgeFeatures.AddKey(TIntPr(ni, nj)); 00261 for (THashKeyDatI<TInt, TInt> it = NodeFeatures.GetDat(ni).BegI(); 00262 not it.IsEnd(); it++) { 00263 TInt k = it.GetKey(); 00264 TInt diff = 0; 00265 if (NodeFeatures.GetDat(nj).IsKey(k)) { 00266 diff = abs(it.GetDat() - NodeFeatures.GetDat(nj).GetDat(k)); 00267 } else { 00268 diff = abs(it.GetDat()); 00269 } 00270 if (diff) { 00271 EdgeFeatures[kv].AddDat(k) = diff; 00272 } 00273 } 00274 for (THashKeyDatI<TInt, TInt> it = NodeFeatures.GetDat(nj).BegI(); 00275 not it.IsEnd(); it++) { 00276 TInt k = it.GetKey(); 00277 TInt diff = 0; 00278 if (NodeFeatures.GetDat(ni).IsKey(k)) { 00279 diff = abs(it.GetDat() - NodeFeatures.GetDat(ni).GetDat(k)); 00280 } else { 00281 diff = abs(it.GetDat()); 00282 } 00283 if (diff) { 00284 EdgeFeatures[kv].AddDat(k) = diff; 00285 } 00286 } 00287 } 00288 } 00289 } 00290 00292 void TCluster::Train(TInt OuterReps, TInt GradientReps, TInt MCMCReps) { 00293 // Learning rate 00294 TFlt Increment = 1.0 / (1.0 * GraphAttributes->NodeIDs.Len() * GraphAttributes->NodeIDs.Len()); 00295 TRnd t; 00296 00297 for (int OuterRep = 0; OuterRep < OuterReps; OuterRep++) { 00298 // If it's the first iteration or the solution is degenerate, 00299 // randomly initialize the weights and Clusters 00300 for (int k = 0; k < K; k++) { 00301 if (OuterRep == 0 or CHat[k].Empty() or CHat[k].Len() 00302 == GraphAttributes->NodeIDs.Len()) { 00303 CHat[k].Clr(); 00304 for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) { 00305 if (t.GetUniDevInt(2) == 0) { 00306 CHat[k].AddKey(GraphAttributes->NodeIDs[i]); 00307 } 00308 } 00309 for (int i = 0; i < GraphAttributes->NFeatures; i++) { 00310 Theta[k * GraphAttributes->NFeatures + i] = 0; 00311 } 00312 // Just set a single Feature to 1 as a random initialization. 
00313 Theta[k * GraphAttributes->NFeatures + t.GetUniDevInt(GraphAttributes->NFeatures)] = 1.0; 00314 Theta[k * GraphAttributes->NFeatures] = 1; 00315 } 00316 } 00317 00318 for (int k = 0; k < K; k++) { 00319 CHat[k] = MCMC(k, MCMCReps); 00320 } 00321 TFlt llPrevious = LogLikelihood(); 00322 00323 // Perform gradient ascent 00324 TFlt ll = 0; 00325 for (int gradientRep = 0; gradientRep < GradientReps; gradientRep++) { 00326 Gradient(); 00327 for (int i = 0; i < K * GraphAttributes->NFeatures; i++) { 00328 Theta[i] += Increment * Derivative[i]; 00329 } 00330 printf("."); 00331 fflush( stdout); 00332 ll = LogLikelihood(); 00333 00334 // If we reduced the objective, undo the update and stop. 00335 if (ll < llPrevious) { 00336 for (int i = 0; i < K * GraphAttributes->NFeatures; i++) { 00337 Theta[i] -= Increment * Derivative[i]; 00338 } 00339 ll = llPrevious; 00340 break; 00341 } 00342 llPrevious = ll; 00343 } 00344 printf("\nIteration %d, ll = %f\n", OuterRep + 1, (double) ll); 00345 } 00346 } 00347 00349 TFlt Inner(TIntIntH& Feature, TFlt* Parameter) { 00350 TFlt res = 0; 00351 for (THashKeyDatI<TInt, TInt> it = Feature.BegI(); not it.IsEnd(); it++) { 00352 res += it.GetDat() * Parameter[it.GetKey()]; 00353 } 00354 return res; 00355 } 00356 00358 TIntSet TCluster::MCMC(TInt k, TInt MCMCReps) { 00359 TRnd t; 00360 00361 THash<TInt, TFlt> CostNotIncludeHash; 00362 THash<TInt, TFlt> CostIncludeHash; 00363 00364 TVec<TInt> NewLabel; 00365 int csize = 0; 00366 for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) { 00367 if (CHat[k].IsKey(GraphAttributes->NodeIDs[i])) { 00368 NewLabel.Add(0); 00369 } else { 00370 NewLabel.Add(1); 00371 } 00372 if (CHat[k].IsKey(GraphAttributes->NodeIDs[i])) { 00373 csize++; 00374 } 00375 } 00376 // Compute edge log-probabilities. 
00377 for (THashKeyDatI<TIntPr, TIntIntH> it = GraphAttributes->EdgeFeatures.BegI(); 00378 not it.IsEnd(); it++) { 00379 TIntPr e = it.GetKey(); 00380 TInt kv = GraphAttributes->EdgeFeatures.GetKeyId(e); 00381 TInt Src = e.Val1; 00382 TInt Dst = e.Val2; 00383 00384 TBool Exists = GraphAttributes->G->IsEdge(Src, Dst); 00385 TFlt InnerProduct = Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures); 00386 TFlt Other = 0; 00387 for (int l = 0; l < K; l++) { 00388 if (l == k) { 00389 continue; 00390 } 00391 TFlt d = (CHat[l].IsKey(Src) and CHat[l].IsKey(Dst)) ? 1 : -1; 00392 Other += d * Inner(it.GetDat(), Theta + l * GraphAttributes->NFeatures); 00393 } 00394 00395 TFlt CostNotInclude; 00396 TFlt CostInclude; 00397 00398 if (Exists) { 00399 CostNotInclude = -Other + InnerProduct + log(1 + exp(Other - InnerProduct)); 00400 CostInclude = -Other - InnerProduct + log(1 + exp(Other + InnerProduct)); 00401 } else { 00402 CostNotInclude = log(1 + exp(Other - InnerProduct)); 00403 CostInclude = log(1 + exp(Other + InnerProduct)); 00404 } 00405 00406 CostNotIncludeHash.AddDat(kv) = -CostNotInclude; 00407 CostIncludeHash.AddDat(kv) = -CostInclude; 00408 } 00409 00410 // Run MCMC using precomputed probabilities 00411 TFlt InitialTemperature = 1.0; // Initial temperature 00412 for (int r = 2; r < MCMCReps + 2; r++) { 00413 TFlt Temperature = InitialTemperature / log(r); 00414 for (int n = 0; n < GraphAttributes->NodeIDs.Len(); n++) { 00415 TFlt l0 = 0; 00416 TFlt l1 = 0; 00417 for (int np = 0; np < GraphAttributes->NodeIDs.Len(); np++) { 00418 if (n == np) { 00419 continue; 00420 } 00421 TIntPr ed(GraphAttributes->NodeIDs[n], GraphAttributes->NodeIDs[np]); 00422 if (ed.Val1 > ed.Val2) { 00423 ed = TIntPr(ed.Val2, ed.Val1); 00424 } 00425 TInt kv = GraphAttributes->EdgeFeatures.GetKeyId(ed); 00426 TFlt m0 = CostNotIncludeHash.GetDat(kv); 00427 if (NewLabel[np] == 0) { 00428 l0 += m0; 00429 l1 += m0; 00430 } else { 00431 l0 += m0; 00432 l1 += CostIncludeHash.GetDat(kv); 00433 } 
00434 } 00435 TFlt LogLikelihoodDiff = exp(l1 - l0); 00436 TFlt AcceptProb = pow(LogLikelihoodDiff, 1.0 / Temperature); 00437 if (t.GetUniDev() < AcceptProb) { 00438 NewLabel[n] = 1; 00439 } else { 00440 NewLabel[n] = 0; 00441 } 00442 } 00443 } 00444 00445 TIntSet Result; 00446 for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) { 00447 if (NewLabel[i]) { 00448 Result.AddKey(GraphAttributes->NodeIDs[i]); 00449 } 00450 } 00451 00452 return Result; 00453 } 00454 00456 void TCluster::Gradient(void) { 00457 for (int i = 0; i < K * GraphAttributes->NFeatures; i++) { 00458 if (Theta[i] > 0) { 00459 Derivative[i] = -Lambda * Theta[i]; 00460 } else { 00461 Derivative[i] = Lambda * Theta[i]; 00462 } 00463 } 00464 00465 for (THashKeyDatI<TIntPr, TIntIntH> it = GraphAttributes->EdgeFeatures.BegI(); 00466 not it.IsEnd(); it++) { 00467 TFlt InnerProduct = 0; 00468 TIntPr Edge = it.GetKey(); 00469 TInt Src = Edge.Val1; 00470 TInt Dst = Edge.Val2; 00471 TBool Exists = GraphAttributes->G->IsEdge(Src, Dst); 00472 for (int k = 0; k < K; k++) { 00473 TFlt d = CHat[k].IsKey(Src) and CHat[k].IsKey(Dst) ? 1 : -1; 00474 InnerProduct += d * Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures); 00475 } 00476 TFlt expinp = exp(InnerProduct); 00477 TFlt q = expinp / (1 + expinp); 00478 if (q != q) { 00479 q = 1; // Test for nan in case of overflow. 00480 } 00481 00482 for (int k = 0; k < K; k++) { 00483 TBool d_ = CHat[k].IsKey(Src) and CHat[k].IsKey(Dst); 00484 TFlt d = d_ ? 
1 : -1; 00485 for (THashKeyDatI<TInt, TInt> itf = it.GetDat().BegI(); 00486 not itf.IsEnd(); itf++) { 00487 TInt i = itf.GetKey(); 00488 TInt f = itf.GetDat(); 00489 if (Exists) { 00490 Derivative[k * GraphAttributes->NFeatures + i] += d * f; 00491 } 00492 Derivative[k * GraphAttributes->NFeatures + i] += -d * f * q; 00493 } 00494 } 00495 } 00496 } 00497 00499 TFlt TCluster::LogLikelihood(void) { 00500 TFlt ll = 0; 00501 for (THashKeyDatI<TIntPr, TIntIntH> it = GraphAttributes->EdgeFeatures.BegI(); 00502 not it.IsEnd(); it++) { 00503 TFlt InnerProduct = 0; 00504 TIntPr Edge = it.GetKey(); 00505 TInt Src = Edge.Val1; 00506 TInt Dst = Edge.Val2; 00507 TBool Exists = GraphAttributes->G->IsEdge(Src, Dst); 00508 00509 for (int k = 0; k < K; k++) { 00510 TFlt d = CHat[k].IsKey(Src) and CHat[k].IsKey(Dst) ? 1 : -1; 00511 InnerProduct += d * Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures); 00512 } 00513 if (Exists) { 00514 ll += InnerProduct; 00515 } 00516 TFlt ll_ = log(1 + exp(InnerProduct)); 00517 ll += -ll_; 00518 } 00519 00520 if (ll != ll) { 00521 printf("ll isnan\n"); 00522 exit(1); 00523 } 00524 return ll; 00525 }