SNAP Library 2.0, Developer Reference  2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
circles.h
Go to the documentation of this file.
00001 #pragma once
00002 
00003 #include "stdafx.h"
00004 
00005 class TGraphAttributes {
00006 public:
00011   TGraphAttributes(PUNGraph G, const char* nodeFeaturePath, const char* groundtruthPath);
00012   ~TGraphAttributes() {
00013   }
00014 
00015   PUNGraph G;
00016   TInt NFeatures;
00017 
00018   THash<TInt, TIntIntH> NodeFeatures;
00019   THash<TIntPr, TIntIntH> EdgeFeatures;
00020   TVec<TInt> NodeIDs;
00021   TCRef CRef;
00022 
00023   TVec<TIntSet> GroundTruth; // Groundtruth communities
00024 };
00025 
00026 typedef TPt<TGraphAttributes> PGraphAttributes;
00027 
00028 class TCluster {
00029 public:
00035   TCluster(PGraphAttributes GraphAttributes, TInt K, TFlt Lambda) :
00036     GraphAttributes(GraphAttributes), K(K), Lambda(Lambda) {
00037     Theta = new TFlt[K * GraphAttributes->NFeatures];
00038     Derivative = new TFlt[K * GraphAttributes->NFeatures];
00039     for (int k = 0; k < K; k++) {
00040       for (int f = 0; f < GraphAttributes->NFeatures; f++) {
00041         Theta[k * GraphAttributes->NFeatures + f] = 0;
00042         Derivative[k * GraphAttributes->NFeatures + f] = 0;
00043       }
00044 
00045       // Clusters are initially empty.
00046       CHat.Add(TIntSet());
00047     }
00048   }
00049   ~TCluster() {
00050     delete[] Theta;
00051     delete[] Derivative;
00052   }
00053 
00059   void Train(TInt OuterReps, TInt GradientReps, TInt MCMCReps);
00060 
00064   TVec<TIntSet> GetCircles(void) {
00065     return CHat;
00066   }
00067   TCRef CRef;
00068 
00069 private:
00070   TFlt* Theta; // Community parameters
00071   TFlt* Derivative; // Partial derivatives
00072 
00073   TVec<TIntSet> CHat; // Latent community memberships
00074   PGraphAttributes GraphAttributes; // Graph with attributes
00075 
00076   TInt K;
00077   TFlt Lambda;
00078 
00082   TFlt LogLikelihood();
00083 
00089   TIntSet MCMC(TInt k, TInt MCMCReps);
00090   void Gradient();
00091 };
00092 
00093 typedef TPt<TCluster> PCluster;
00094 
00095 enum lossType
00096 {
00097   zeroOne = 0,
00098   balancedError = 1,
00099   fScore = 2
00100 };
00101 
00103 TFlt Loss(TIntSet& l, TIntSet lHat, int N, int Which)
00104 {
00105   if (l.Len() == 0) {
00106     if (lHat.Len() == 0) {
00107       return 0;
00108     }
00109     return 1.0;
00110   }
00111   if (lHat.Len() == 0) {
00112     if (l.Len() == 0) {
00113       return 0;
00114     }
00115     return 1.0;
00116   }
00117   TInt TruePositives = 0;
00118   TInt FalsePositives = 0;
00119   TInt FalseNegatives = 0;
00120 
00121   TFlt LabelLoss = 0;
00122   for (THashSetKeyI<TInt> it = l.BegI(); it != l.EndI(); it ++) {
00123     int c = it.GetKey();
00124     if (not lHat.IsKey(c)) {
00125       // false negative
00126       FalseNegatives ++;
00127       if (Which == zeroOne) {
00128         LabelLoss += 1.0/N;
00129       }
00130       else if (Which == balancedError) {
00131         LabelLoss += 0.5/l.Len();
00132       }
00133     }
00134   }
00135 
00136   for (THashSetKeyI<TInt> it = lHat.BegI(); it != lHat.EndI(); it ++) {
00137     int c = it.GetKey();
00138     if (not l.IsKey(c)) {
00139       // false positive
00140       FalsePositives ++;
00141       if (Which == zeroOne) {
00142         LabelLoss += 1.0/N;
00143       }
00144       else if (Which == balancedError) {
00145         LabelLoss += 0.5/(N - l.Len());
00146       }
00147     }
00148     else {
00149       TruePositives ++;
00150     }
00151   }
00152 
00153   if ((lHat.Len() == 0 or TruePositives == 0) and Which == fScore) {
00154     return 1.0;
00155   }
00156   TFlt precision = (1.0*TruePositives)/lHat.Len();
00157   TFlt recall = (1.0*TruePositives)/l.Len();
00158   if (Which == fScore) {
00159     return 1 - 2 * (precision*recall) / (precision + recall);
00160   }
00161 
00162   return LabelLoss;
00163 }
00164 
00165 /*
00167 // Requires code for the munkres algorithm, available at https://github.com/saebyn/munkres-cpp
00168 TFlt TotalLoss(TVec<TIntSet>& Clusters, TVec<TIntSet>& CHat, int N, int Which)
00169 {
00170   Matrix<double> matrix(Clusters.Len(), CHat.Len());
00171 
00172   for (int i = 0; i < (int) Clusters.Len(); i ++) {
00173     for (int j = 0; j < (int) CHat.Len(); j ++) {
00174       matrix(i,j) = Loss(Clusters[i], CHat[j], N, Which);
00175     }
00176   }
00177 
00178   Munkres m;
00179   m.solve(matrix);
00180 
00181   TFlt l = 0;
00182   for (int i = 0; i < (int) Clusters.Len(); i ++) {
00183     for (int j = 0; j < (int) CHat.Len(); j ++) {
00184       if (matrix(i,j) == 0) {
00185         l += Loss(Clusters[i], CHat[j], N, Which);
00186       }
00187     }
00188   }
00189 
00190   return l / (Clusters.Len() < CHat.Len() ? Clusters.Len() : CHat.Len());
00191 }
00192 */
00193 
00194 TGraphAttributes::TGraphAttributes(PUNGraph G, const char* NodeFeaturePath,
00195                                    const char* GroundTruthPath) :
00196   G(G) {
00197   // Read node Features for the graph
00198   FILE* f = fopen(NodeFeaturePath, "r");
00199   int NNodes;
00200   int nF;
00201   fscanf(f, "%d %d", &NNodes, &nF);
00202   NFeatures = nF;
00203 
00204   for (int i = 0; i < NNodes; i++) {
00205     int nid;
00206     fscanf(f, "%d", &nid);
00207 
00208     if (not G->IsNode(nid)) {
00209       printf("Warning: %d is not a node in G.\n", nid);
00210     }
00211     TInt kv = NodeFeatures.AddKey(nid);
00212     for (int x = 0; x < nF; x++) {
00213       int z = 0;
00214       fscanf(f, "%d", &z);
00215       if (z) {
00216         NodeFeatures[kv].AddDat(x) = z;
00217       }
00218     }
00219     if (G->IsNode(nid)) {
00220       NodeIDs.Add(nid);
00221     }
00222   }
00223   fclose(f);
00224   
00225   f = fopen(GroundTruthPath, "r");
00226   if (f == NULL) {
00227     printf("Groundtruth file %s not found.\n", GroundTruthPath);
00228   }
00229   else {
00230     char* CircleName = new char [1000];
00231     while (fscanf(f, "%s", CircleName) == 1)
00232     {
00233       TIntSet Circle;
00234       while (true) {
00235         int nid;
00236         fscanf(f, "%d", &nid);
00237         Circle.AddKey(nid);
00238         char c;
00239         while (true) {          
00240           c = fgetc(f);
00241           if (c == '\n') break;
00242           if (c >= '0' and c <= '9') {
00243             fseek(f, -1, SEEK_CUR);
00244             break;
00245           }
00246         }
00247         if (c == '\n') break;
00248       }
00249       GroundTruth.Add(Circle);
00250     }
00251     delete [] CircleName;
00252   }
00253   fclose(f);
00254 
00255   // Construct the Features encoding the difference between node attributes.
00256   for (int i = 0; i < NodeIDs.Len(); i++) {
00257     TInt ni = NodeIDs[i];
00258     for (int j = i + 1; j < NodeIDs.Len(); j++) {
00259       TInt nj = NodeIDs[j];
00260       TInt kv = EdgeFeatures.AddKey(TIntPr(ni, nj));
00261       for (THashKeyDatI<TInt, TInt> it = NodeFeatures.GetDat(ni).BegI();
00262            not it.IsEnd(); it++) {
00263         TInt k = it.GetKey();
00264         TInt diff = 0;
00265         if (NodeFeatures.GetDat(nj).IsKey(k)) {
00266           diff = abs(it.GetDat() - NodeFeatures.GetDat(nj).GetDat(k));
00267         } else {
00268           diff = abs(it.GetDat());
00269         }
00270         if (diff) {
00271           EdgeFeatures[kv].AddDat(k) = diff;
00272         }
00273       }
00274       for (THashKeyDatI<TInt, TInt> it = NodeFeatures.GetDat(nj).BegI();
00275            not it.IsEnd(); it++) {
00276         TInt k = it.GetKey();
00277         TInt diff = 0;
00278         if (NodeFeatures.GetDat(ni).IsKey(k)) {
00279           diff = abs(it.GetDat() - NodeFeatures.GetDat(ni).GetDat(k));
00280         } else {
00281           diff = abs(it.GetDat());
00282         }
00283         if (diff) {
00284           EdgeFeatures[kv].AddDat(k) = diff;
00285         }
00286       }
00287     }
00288   }
00289 }
00290 
00292 void TCluster::Train(TInt OuterReps, TInt GradientReps, TInt MCMCReps) {
00293   // Learning rate
00294   TFlt Increment = 1.0 / (1.0 * GraphAttributes->NodeIDs.Len() * GraphAttributes->NodeIDs.Len());
00295   TRnd t;
00296 
00297   for (int OuterRep = 0; OuterRep < OuterReps; OuterRep++) {
00298     // If it's the first iteration or the solution is degenerate,
00299     // randomly initialize the weights and Clusters
00300     for (int k = 0; k < K; k++) {
00301       if (OuterRep == 0 or CHat[k].Empty() or CHat[k].Len()
00302           == GraphAttributes->NodeIDs.Len()) {
00303         CHat[k].Clr();
00304         for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) {
00305           if (t.GetUniDevInt(2) == 0) {
00306             CHat[k].AddKey(GraphAttributes->NodeIDs[i]);
00307           }
00308         }
00309         for (int i = 0; i < GraphAttributes->NFeatures; i++) {
00310           Theta[k * GraphAttributes->NFeatures + i] = 0;
00311         }
00312         // Just set a single Feature to 1 as a random initialization.
00313         Theta[k * GraphAttributes->NFeatures + t.GetUniDevInt(GraphAttributes->NFeatures)] = 1.0;
00314         Theta[k * GraphAttributes->NFeatures] = 1;
00315       }
00316     }
00317     
00318     for (int k = 0; k < K; k++) {
00319       CHat[k] = MCMC(k, MCMCReps);
00320     }
00321     TFlt llPrevious = LogLikelihood();
00322 
00323     // Perform gradient ascent
00324     TFlt ll = 0;
00325     for (int gradientRep = 0; gradientRep < GradientReps; gradientRep++) {
00326       Gradient();
00327       for (int i = 0; i < K * GraphAttributes->NFeatures; i++) {
00328         Theta[i] += Increment * Derivative[i];
00329       }
00330       printf(".");
00331       fflush( stdout);
00332       ll = LogLikelihood();
00333 
00334       // If we reduced the objective, undo the update and stop.
00335       if (ll < llPrevious) {
00336         for (int i = 0; i < K * GraphAttributes->NFeatures; i++) {
00337           Theta[i] -= Increment * Derivative[i];
00338         }
00339         ll = llPrevious;
00340         break;
00341       }
00342       llPrevious = ll;
00343     }
00344     printf("\nIteration %d, ll = %f\n", OuterRep + 1, (double) ll);
00345   }
00346 }
00347 
00349 TFlt Inner(TIntIntH& Feature, TFlt* Parameter) {
00350   TFlt res = 0;
00351   for (THashKeyDatI<TInt, TInt> it = Feature.BegI(); not it.IsEnd(); it++) {
00352     res += it.GetDat() * Parameter[it.GetKey()];
00353   }
00354   return res;
00355 }
00356 
00358 TIntSet TCluster::MCMC(TInt k, TInt MCMCReps) {
00359   TRnd t;
00360 
00361   THash<TInt, TFlt> CostNotIncludeHash;
00362   THash<TInt, TFlt> CostIncludeHash;
00363 
00364   TVec<TInt> NewLabel;
00365   int csize = 0;
00366   for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) {
00367     if (CHat[k].IsKey(GraphAttributes->NodeIDs[i])) {
00368       NewLabel.Add(0);
00369     } else {
00370       NewLabel.Add(1);
00371     }
00372     if (CHat[k].IsKey(GraphAttributes->NodeIDs[i])) {
00373       csize++;
00374     }
00375   }
00376   // Compute edge log-probabilities.
00377   for (THashKeyDatI<TIntPr, TIntIntH> it = GraphAttributes->EdgeFeatures.BegI();
00378        not it.IsEnd(); it++) {
00379     TIntPr e = it.GetKey();
00380     TInt kv = GraphAttributes->EdgeFeatures.GetKeyId(e);
00381     TInt Src = e.Val1;
00382     TInt Dst = e.Val2;
00383 
00384     TBool Exists = GraphAttributes->G->IsEdge(Src, Dst);
00385     TFlt InnerProduct = Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures);
00386     TFlt Other = 0;
00387     for (int l = 0; l < K; l++) {
00388       if (l == k) {
00389         continue;
00390       }
00391       TFlt d = (CHat[l].IsKey(Src) and CHat[l].IsKey(Dst)) ? 1 : -1;
00392       Other += d * Inner(it.GetDat(), Theta + l * GraphAttributes->NFeatures);
00393     }
00394 
00395     TFlt CostNotInclude;
00396     TFlt CostInclude;
00397 
00398     if (Exists) {
00399       CostNotInclude = -Other + InnerProduct + log(1 + exp(Other - InnerProduct));
00400       CostInclude = -Other - InnerProduct + log(1 + exp(Other + InnerProduct));
00401     } else {
00402       CostNotInclude = log(1 + exp(Other - InnerProduct));
00403       CostInclude = log(1 + exp(Other + InnerProduct));
00404     }
00405 
00406     CostNotIncludeHash.AddDat(kv) = -CostNotInclude;
00407     CostIncludeHash.AddDat(kv) = -CostInclude;
00408   }
00409 
00410   // Run MCMC using precomputed probabilities
00411   TFlt InitialTemperature = 1.0; // Initial temperature
00412   for (int r = 2; r < MCMCReps + 2; r++) {
00413     TFlt Temperature = InitialTemperature / log(r);
00414     for (int n = 0; n < GraphAttributes->NodeIDs.Len(); n++) {
00415       TFlt l0 = 0;
00416       TFlt l1 = 0;
00417       for (int np = 0; np < GraphAttributes->NodeIDs.Len(); np++) {
00418         if (n == np) {
00419           continue;
00420         }
00421         TIntPr ed(GraphAttributes->NodeIDs[n], GraphAttributes->NodeIDs[np]);
00422         if (ed.Val1 > ed.Val2) {
00423           ed = TIntPr(ed.Val2, ed.Val1);
00424         }
00425         TInt kv = GraphAttributes->EdgeFeatures.GetKeyId(ed);
00426         TFlt m0 = CostNotIncludeHash.GetDat(kv);
00427         if (NewLabel[np] == 0) {
00428           l0 += m0;
00429           l1 += m0;
00430         } else {
00431           l0 += m0;
00432           l1 += CostIncludeHash.GetDat(kv);
00433         }
00434       }
00435       TFlt LogLikelihoodDiff = exp(l1 - l0);
00436       TFlt AcceptProb = pow(LogLikelihoodDiff, 1.0 / Temperature);
00437       if (t.GetUniDev() < AcceptProb) {
00438         NewLabel[n] = 1;
00439       } else {
00440         NewLabel[n] = 0;
00441       }
00442     }
00443   }
00444 
00445   TIntSet Result;
00446   for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) {
00447     if (NewLabel[i]) {
00448       Result.AddKey(GraphAttributes->NodeIDs[i]);
00449     }
00450   }
00451 
00452   return Result;
00453 }
00454 
00456 void TCluster::Gradient(void) {
00457   for (int i = 0; i < K * GraphAttributes->NFeatures; i++) {
00458     if (Theta[i] > 0) {
00459       Derivative[i] = -Lambda * Theta[i];
00460     } else {
00461       Derivative[i] = Lambda * Theta[i];
00462     }
00463   }
00464 
00465   for (THashKeyDatI<TIntPr, TIntIntH> it = GraphAttributes->EdgeFeatures.BegI();
00466        not it.IsEnd(); it++) {
00467     TFlt InnerProduct = 0;
00468     TIntPr Edge = it.GetKey();
00469     TInt Src = Edge.Val1;
00470     TInt Dst = Edge.Val2;
00471     TBool Exists = GraphAttributes->G->IsEdge(Src, Dst);
00472     for (int k = 0; k < K; k++) {
00473       TFlt d = CHat[k].IsKey(Src) and CHat[k].IsKey(Dst) ? 1 : -1;
00474       InnerProduct += d * Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures);
00475     }
00476     TFlt expinp = exp(InnerProduct);
00477     TFlt q = expinp / (1 + expinp);
00478     if (q != q) {
00479       q = 1; // Test for nan in case of overflow.
00480     }
00481 
00482     for (int k = 0; k < K; k++) {
00483       TBool d_ = CHat[k].IsKey(Src) and CHat[k].IsKey(Dst);
00484       TFlt d = d_ ? 1 : -1;
00485       for (THashKeyDatI<TInt, TInt> itf = it.GetDat().BegI();
00486            not itf.IsEnd(); itf++) {
00487         TInt i = itf.GetKey();
00488         TInt f = itf.GetDat();
00489         if (Exists) {
00490           Derivative[k * GraphAttributes->NFeatures + i] += d * f;
00491         }
00492         Derivative[k * GraphAttributes->NFeatures + i] += -d * f * q;
00493       }
00494     }
00495   }
00496 }
00497 
00499 TFlt TCluster::LogLikelihood(void) {
00500   TFlt ll = 0;
00501   for (THashKeyDatI<TIntPr, TIntIntH> it = GraphAttributes->EdgeFeatures.BegI();
00502        not it.IsEnd(); it++) {
00503     TFlt InnerProduct = 0;
00504     TIntPr Edge = it.GetKey();
00505     TInt Src = Edge.Val1;
00506     TInt Dst = Edge.Val2;
00507     TBool Exists = GraphAttributes->G->IsEdge(Src, Dst);
00508 
00509     for (int k = 0; k < K; k++) {
00510       TFlt d = CHat[k].IsKey(Src) and CHat[k].IsKey(Dst) ? 1 : -1;
00511       InnerProduct += d * Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures);
00512     }
00513     if (Exists) {
00514       ll += InnerProduct;
00515     }
00516     TFlt ll_ = log(1 + exp(InnerProduct));
00517     ll += -ll_;
00518   }
00519 
00520   if (ll != ll) {
00521     printf("ll isnan\n");
00522     exit(1);
00523   }
00524   return ll;
00525 }