SNAP Library 2.2, Developer Reference
2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 #ifndef yanglib_agmattr1_h 00002 #define yanglib_agmattr1_h 00003 #include "Snap.h" 00004 00005 class TCesnaUtil { 00006 public: 00007 //static double GetConductance(const PUNGraph& Graph, const TIntSet& CmtyS, const int Edges); 00008 //static double GetConductance(const PNGraph& Graph, const TIntSet& CmtyS, const int Edges); 00009 template<class PGraph> 00010 static double GetConductance(const PGraph& Graph, const TIntSet& CmtyS, const int Edges) { 00011 const bool GraphType = HasGraphFlag(typename PGraph::TObj, gfDirected); 00012 int Edges2; 00013 if (GraphType) { Edges2 = Edges >= 0 ? Edges : Graph->GetEdges(); } 00014 else { Edges2 = Edges >= 0 ? 2 * Edges : Graph->GetEdges(); } 00015 int Vol = 0, Cut = 0; 00016 double Phi = 0.0; 00017 for (int i = 0; i < CmtyS.Len(); i++) { 00018 if (! Graph->IsNode(CmtyS[i])) { continue; } 00019 typename PGraph::TObj::TNodeI NI = Graph->GetNI(CmtyS[i]); 00020 for (int e = 0; e < NI.GetOutDeg(); e++) { 00021 if (! CmtyS.IsKey(NI.GetOutNId(e))) { Cut += 1; } 00022 } 00023 Vol += NI.GetOutDeg(); 00024 } 00025 // get conductance 00026 if (Vol != Edges2) { 00027 if (2 * Vol > Edges2) { Phi = Cut / double (Edges2 - Vol); } 00028 else if (Vol == 0) { Phi = 0.0; } 00029 else { Phi = Cut / double(Vol); } 00030 } else { 00031 if (Vol == Edges2) { Phi = 1.0; } 00032 } 00033 return Phi; 00034 } 00035 00036 00037 template<class PGraph> 00038 static void GenHoldOutPairs(const PGraph& G, TVec<TIntSet>& HoldOutSet, double HOFrac, TRnd& Rnd) { 00039 TIntPrV EdgeV(G->GetEdges(), 0); 00040 for (typename PGraph::TObj::TEdgeI EI = G->BegEI(); EI < G->EndEI(); EI++) { 00041 EdgeV.Add(TIntPr(EI.GetSrcNId(), EI.GetDstNId())); 00042 } 00043 EdgeV.Shuffle(Rnd); 00044 00045 const bool GraphType = HasGraphFlag(typename PGraph::TObj, gfDirected); 00046 HoldOutSet.Gen(G->GetNodes()); 00047 int HOTotal = int(HOFrac * G->GetNodes() * (G->GetNodes() - 1) / 2.0); 00048 if (GraphType) { HOTotal *= 2;} 00049 int HOCnt = 0; 00050 int HOEdges = (int) TMath::Round(HOFrac * G->GetEdges()); 00051 printf("holding out %d edges...\n", HOEdges); 00052 for (int he = 0; he < (int) HOEdges; he++) { 00053 HoldOutSet[EdgeV[he].Val1].AddKey(EdgeV[he].Val2); 00054 if (! GraphType) { HoldOutSet[EdgeV[he].Val2].AddKey(EdgeV[he].Val1); } 00055 HOCnt++; 00056 } 00057 printf("%d Edges hold out\n", HOCnt); 00058 while(HOCnt++ < HOTotal) { 00059 int SrcNID = Rnd.GetUniDevInt(G->GetNodes()); 00060 int DstNID = Rnd.GetUniDevInt(G->GetNodes()); 00061 if (SrcNID == DstNID) { continue; } 00062 HoldOutSet[SrcNID].AddKey(DstNID); 00063 if (! GraphType) { HoldOutSet[DstNID].AddKey(SrcNID); } 00064 } 00065 } 00066 00067 template<class PGraph> 00068 static void GetNbhCom(const PGraph& Graph, const int NID, TIntSet& NBCmtyS) { 00069 typename PGraph::TObj::TNodeI NI = Graph->GetNI(NID); 00070 NBCmtyS.Gen(NI.GetDeg()); 00071 NBCmtyS.AddKey(NID); 00072 for (int e = 0; e < NI.GetDeg(); e++) { 00073 NBCmtyS.AddKey(NI.GetNbrNId(e)); 00074 } 00075 } 00076 template<class PGraph> 00077 static void GetNIdPhiV(const PGraph& G, TFltIntPrV& NIdPhiV) { 00078 NIdPhiV.Gen(G->GetNodes(), 0); 00079 const int Edges = G->GetEdges(); 00080 TExeTm RunTm; 00081 //compute conductance of neighborhood community 00082 for (typename PGraph::TObj::TNodeI NI = G->BegNI(); NI < G->EndNI(); NI++) { 00083 TIntSet NBCmty(NI.GetDeg() + 1); 00084 double Phi; 00085 if (NI.GetDeg() < 5) { //do not include nodes with too few degree 00086 Phi = 1.0; 00087 } else { 00088 TCesnaUtil::GetNbhCom<PGraph>(G, NI.GetId(), NBCmty); 00089 //if (NBCmty.Len() != NI.GetDeg() + 1) { printf("NbCom:%d, Deg:%d\n", NBCmty.Len(), NI.GetDeg()); } 00090 //IAssert(NBCmty.Len() == NI.GetDeg() + 1); 00091 Phi = TCesnaUtil::GetConductance(G, NBCmty, Edges); 00092 } 00093 //NCPhiH.AddDat(u, Phi); 00094 NIdPhiV.Add(TFltIntPr(Phi, NI.GetId())); 00095 } 00096 printf("conductance computation completed [%s]\n", RunTm.GetTmStr()); 00097 fflush(stdout); 00098 } 00099 00100 static void LoadNIDAttrHFromNIDKH(const TIntV& NIDV, const TStr& InFNm, THash<TInt, TIntV>& NIDAttrH, const TStrHash<TInt>& NodeNameH, const TSsFmt Sep = ssfTabSep) { 00101 NIDAttrH.Clr(); 00102 NIDAttrH.Gen(NIDV.Len()); 00103 for (int u = 0; u < NIDV.Len(); u++) { NIDAttrH.AddDat(NIDV[u]).Gen(0, 0); } 00104 TSsParser Ss(InFNm, ssfTabSep); 00105 while (Ss.Next()) { 00106 TStr NodeName = Ss.GetFld(0); 00107 TInt NID = NodeName.GetInt(); 00108 if (NodeNameH.Len() > 0 && ! NodeNameH.IsKey(NodeName)) { continue; } 00109 if (NodeNameH.Len() > 0) { 00110 IAssertR(NodeNameH.IsKey(NodeName), TStr::Fmt("NodeName:%s", NodeName.CStr())); 00111 NID = NodeNameH.GetKeyId(NodeName); 00112 } 00113 if (! NIDAttrH.IsKey(NID)) { continue; } //ignore nodes who are not in the graph 00114 IAssertR(! NIDAttrH.GetDat(NID).IsIn(Ss.GetInt(1)), TStr::Fmt("NIdx:%d NID:%s, K:%d", NID.Val, NodeName.CStr(), Ss.GetInt(1))); 00115 NIDAttrH.GetDat(NID).Add(Ss.GetInt(1)); 00116 } 00117 printf("%s nodes, %s lines read \n", TUInt64::GetStr(NIDAttrH.Len()).CStr(), TUInt64::GetStr(Ss.GetLineNo()).CStr()); 00118 //printf("%d nodes, %d lines read \n", NIDAttrH.Len(), Ss.GetLineNo()); 00119 } 00120 static void LoadNIDAttrHFromNIDKH(const TIntV& NIDV, const TStr& InFNm, THash<TInt, TIntV>& NIDAttrH) { 00121 TStrHash<TInt> TmpH; 00122 LoadNIDAttrHFromNIDKH(NIDV, InFNm, NIDAttrH, TmpH); 00123 } 00124 static void DumpNIDAttrHToNIDK(const TStr& FNm, const THash<TInt, TIntSet>& NIDAttrH, const TStrHash<TInt>& FeatNameH, const TStrHash<TInt>& NodeNameH) { 00125 FILE* F = fopen(FNm.CStr(), "wt"); 00126 for (int u = 0; u < NIDAttrH.Len(); u++) { 00127 int NID = NIDAttrH.GetKey(u); 00128 TStr NodeName = NodeNameH.IsKeyId(NID)? NodeNameH.GetKey(NID): TStr::Fmt("%d", NID); 00129 for (int k = 0; k < NIDAttrH[u].Len(); k++) { 00130 int KID = NIDAttrH[u][k]; 00131 TStr FeatName = FeatNameH.IsKeyId(KID)? FeatNameH.GetKey(KID): TStr::Fmt("%d", KID); 00132 fprintf(F,"%s\t%s\n", NodeName.CStr(), FeatName.CStr()); 00133 } 00134 } 00135 fclose(F); 00136 } 00137 static void DumpNIDAttrHToNIDK(const TStr& FNm, const THash<TInt, TIntSet>& NIDAttrH, const TStrHash<TInt>& FeatNameH) { 00138 TStrHash<TInt> TmpH; 00139 DumpNIDAttrHToNIDK(FNm, NIDAttrH, FeatNameH, TmpH); 00140 } 00141 static void DumpNIDAttrHToNIDK(const TStr& FNm, const THash<TInt, TIntSet>& NIDAttrH) { 00142 TStrHash<TInt> TmpH1, TmpH2; 00143 DumpNIDAttrHToNIDK(FNm, NIDAttrH, TmpH1, TmpH2); 00144 } 00145 static void DumpNIDAttrHToNIDK(const TStr& FNm, const THash<TInt, TIntV>& NIDAttrH, const TStrHash<TInt>& FeatNameH, const TStrHash<TInt>& NodeNameH) { 00146 FILE* F = fopen(FNm.CStr(), "wt"); 00147 for (int u = 0; u < NIDAttrH.Len(); u++) { 00148 int NID = NIDAttrH.GetKey(u); 00149 TStr NodeName = NodeNameH.IsKeyId(NID)? NodeNameH.GetKey(NID): TStr::Fmt("%d", NID); 00150 for (int k = 0; k < NIDAttrH[u].Len(); k++) { 00151 int KID = NIDAttrH[u][k]; 00152 TStr FeatName = FeatNameH.IsKeyId(KID)? FeatNameH.GetKey(KID): TStr::Fmt("%d", KID); 00153 fprintf(F,"%s\t%s\n", NodeName.CStr(), FeatName.CStr()); 00154 } 00155 } 00156 fclose(F); 00157 } 00158 static void DumpNIDAttrHToNIDK(const TStr& FNm, const THash<TInt, TIntV>& NIDAttrH, const TStrHash<TInt>& FeatNameH) { 00159 TStrHash<TInt> TmpH; 00160 DumpNIDAttrHToNIDK(FNm, NIDAttrH, FeatNameH, TmpH); 00161 } 00162 static void DumpNIDAttrHToNIDK(const TStr& FNm, const THash<TInt, TIntV>& NIDAttrH) { 00163 TStrHash<TInt> TmpH1, TmpH2; 00164 DumpNIDAttrHToNIDK(FNm, NIDAttrH, TmpH1, TmpH2); 00165 } 00166 static int GetAttrs(const THash<TInt, TIntV>& NIDAttrH) { 00167 int Attrs = 0; 00168 for (int u = 0; u < NIDAttrH.Len(); u++) { 00169 for (int k = 0; k < NIDAttrH[u].Len(); k++) { 00170 if (NIDAttrH[u][k] >= Attrs) { Attrs = NIDAttrH[u][k] + 1; } 00171 } 00172 } 00173 return Attrs; 00174 } 00175 //Metis format (N + 1) line describes the attributes of N. ID start from 1 00176 static void DumpNIDAttrHToMetis(const TStr& FNm, const THash<TInt, TIntV>& NIDAttrH, const TIntV& NIDV) { 00177 int AttrCnt = 0; 00178 for (int u = 1; u < NIDV.Len(); u++) { 00179 if (! NIDAttrH.IsKey(NIDV[u])) { continue; } 00180 AttrCnt += NIDAttrH.GetDat(NIDV[u]).Len(); 00181 } 00182 IAssert (NIDV[0] == -1); 00183 FILE* F = fopen(FNm.CStr(), "wt"); 00184 fprintf(F, "%d %d\n", NIDV.Len() - 1, AttrCnt); 00185 int TmpCnt = 0; 00186 for (int u = 1; u < NIDV.Len(); u++) { 00187 if (NIDAttrH.IsKey(NIDV[u])) { 00188 for (int k = 0; k < NIDAttrH.GetDat(NIDV[u]).Len(); k++) { 00189 if (k > 0) { fprintf(F, " "); } 00190 fprintf(F, "%d", NIDAttrH.GetDat(NIDV[u])[k].Val + 1); 00191 TmpCnt++; 00192 } 00193 } 00194 fprintf(F, "\n"); 00195 } 00196 fclose(F); 00197 IAssert(AttrCnt == TmpCnt); 00198 00199 } 00200 static void FilterLowEntropy(const THash<TInt, TIntV>& OldNIDAttrH, THash<TInt, TIntV>& NewNIDAttrH, const TIntStrH& OldNameH, TIntStrH& NewNameH, const double MinFrac = 0.00001, const double MaxFrac = 0.95, const int MinCnt = 3) { 00201 TIntH KIDCntH; 00202 for (int u = 0; u < OldNIDAttrH.Len(); u++) { 00203 for (int k = 0; k < OldNIDAttrH[u].Len(); k++) { 00204 KIDCntH.AddDat(OldNIDAttrH[u][k])++; 00205 } 00206 } 00207 KIDCntH.SortByDat(false); 00208 00209 TIntSet SelectedK(KIDCntH.Len()); 00210 for (int c = 0; c < KIDCntH.Len(); c++) { 00211 double Frac = (double) KIDCntH[c].Val / (double) OldNIDAttrH.Len(); 00212 if (KIDCntH[c].Val < MinCnt) { continue; } 00213 if (Frac > MaxFrac || Frac < MinFrac) { continue; } 00214 SelectedK.AddKey(KIDCntH.GetKey(c)); 00215 } 00216 printf("%d attributes selected from %d\n", SelectedK.Len(), KIDCntH.Len()); 00217 NewNIDAttrH.Gen(OldNIDAttrH.Len()); 00218 for (int u = 0; u < OldNIDAttrH.Len(); u++) { 00219 int NID = OldNIDAttrH.GetKey(u); 00220 TIntV& AttrV = NewNIDAttrH.AddDat(NID); 00221 for (int k = 0; k < OldNIDAttrH[u].Len(); k++) { 00222 if (! SelectedK.IsKey(OldNIDAttrH[u][k])) { continue; } 00223 AttrV.Add(SelectedK.GetKeyId(OldNIDAttrH[u][k])); 00224 } 00225 } 00226 00227 if (! OldNameH.Empty()) { 00228 NewNameH.Gen(SelectedK.Len()); 00229 for (int k = 0; k < SelectedK.Len(); k++) { 00230 int OldKID = SelectedK.GetKey(k); 00231 if (OldNameH.IsKey(OldKID)) { 00232 NewNameH.AddDat(k, OldNameH.GetDat(OldKID)); 00233 } 00234 } 00235 printf("%d attributes names copied\n", NewNameH.Len()); 00236 } 00237 } 00238 static void FilterLowEntropy(const THash<TInt, TIntV>& OldNIDAttrH, THash<TInt, TIntV>& NewNIDAttrH, const double MinFrac = 0.00001, const double MaxFrac = 0.95, const int MinCnt = 3) { 00239 TIntStrH TmpH1, TmpH2; 00240 FilterLowEntropy(OldNIDAttrH, NewNIDAttrH, TmpH1, TmpH2, MinFrac, MaxFrac, MinCnt); 00241 } 00242 }; 00243 class TCesna { //CESNA: community detection in networks with node attributes 00244 private: 00245 PUNGraph G; //graph to fit 00246 TVec<TIntSet> X; // X[u] = {k| X_uk = 1} 00247 TVec<TIntFltH> F; // membership for each user (Size: Nodes * Coms) 00248 TVec<TFltV> W; // weight vector for logistic regression. w_ck = W[k][c] (Column vector) 00249 TInt Attrs; // number of attributes 00250 TRnd Rnd; // random number generator 00251 TIntSet NIDToIdx; // original node ID vector NIDToIdx[i] = Node ID for index i, NIDToIdx.GetKey(NID) = index for NID 00252 TFlt RegCoef; //Regularization coefficient when we fit for P_c +: L1, -: L2 00253 TFltV SumFV; // sum_u F_uc for each community c. Needed for efficient calculation 00254 TInt NumComs; // number of communities 00255 TVec<TIntSet> HOVIDSV; //NID pairs to hold out for cross validation 00256 TVec<TIntSet> HOKIDSV; //set of attribute index (k) to hold out 00257 public: 00258 TFlt MinVal; // minimum value of F (0) 00259 TFlt MaxVal; // maximum value of F (for numerical reason) 00260 TFlt MinValW; // minimum value of W (for numerical reason) 00261 TFlt MaxValW; // maximum value of W (for numerical reason) 00262 TFlt NegWgt; // weight of negative example (a pair of nodes without an edge) 00263 TFlt LassoCoef; // L1 regularization coefficient for W (MLE = argmax P(X|F, W) - LassoCoef * |W|) 00264 TFlt WeightAttr; // likelihood = log P(G|F) + WeightAttr * log P(X|F, W) 00265 TFlt PNoCom; // base probability \varepsilon (edge probability between a pair of nodes sharing no community 00266 TBool DoParallel; // whether to use parallelism for computation 00267 00268 TCesna() { G = TUNGraph::New(10, -1); } 00269 TCesna(const PUNGraph& GraphPt, const THash<TInt, TIntV>& NIDAttrH, const int& InitComs, const int RndSeed = 0): Rnd(RndSeed), RegCoef(0), 00270 MinVal(0.0), MaxVal(10.0), MinValW(-10.0), MaxValW(10.0), NegWgt(1.0), LassoCoef(1.0), WeightAttr(1.0) { SetGraph(GraphPt, NIDAttrH); NeighborComInit(InitComs); } 00271 void Save(TSOut& SOut) { 00272 G->Save(SOut); 00273 X.Save(SOut); 00274 F.Save(SOut); 00275 W.Save(SOut); 00276 Attrs.Save(SOut); 00277 NIDToIdx.Save(SOut); 00278 RegCoef.Save(SOut); 00279 LassoCoef.Save(SOut); 00280 SumFV.Save(SOut); 00281 NumComs.Save(SOut); 00282 HOVIDSV.Save(SOut); 00283 HOKIDSV.Save(SOut); 00284 MinVal.Save(SOut); 00285 MaxVal.Save(SOut); 00286 MinValW.Save(SOut); 00287 MaxValW.Save(SOut); 00288 NegWgt.Save(SOut); 00289 PNoCom.Save(SOut); 00290 } 00291 void Load(TSIn& SIn, const int& RndSeed = 0) { 00292 G->Load(SIn); 00293 X.Load(SIn); 00294 F.Load(SIn); 00295 W.Load(SIn); 00296 Attrs.Load(SIn); 00297 NIDToIdx.Load(SIn); 00298 RegCoef.Load(SIn); 00299 LassoCoef.Load(SIn); 00300 SumFV.Load(SIn); 00301 NumComs.Load(SIn); 00302 HOVIDSV.Load(SIn); 00303 HOKIDSV.Load(SIn); 00304 MinVal.Load(SIn); 00305 MaxVal.Load(SIn); 00306 MinValW.Load(SIn); 00307 MaxValW.Load(SIn); 00308 NegWgt.Load(SIn); 00309 PNoCom.Load(SIn); 00310 } 00311 00312 void SetGraph(const PUNGraph& GraphPt, const THash<TInt, TIntV>& NIDAttrH); 00313 void SetRegCoef(const double _RegCoef) { RegCoef = _RegCoef; } 00314 double GetRegCoef() { return RegCoef; } 00315 void SetWeightAttr(const double _WeightAttr) { IAssert (_WeightAttr <= 1.0 && _WeightAttr >= 0.0); WeightAttr = _WeightAttr; } 00316 double GetWeightAttr() { return WeightAttr; } 00317 void SetLassoCoef(const double _LassoCoef) { LassoCoef = _LassoCoef; } 00318 int GetAttrs() { return Attrs; } 00319 double GetComFromNID(const int& NID, const int& CID) { 00320 int NIdx = NIDToIdx.GetKeyId(NID); 00321 if (F[NIdx].IsKey(CID)) { 00322 return F[NIdx].GetDat(CID); 00323 } else { 00324 return 0.0; 00325 } 00326 } 00327 double GetLassoCoef() { return LassoCoef; } 00328 void InitW() { // initialize W 00329 W.Gen(Attrs); 00330 for (int k = 0; k < Attrs; k++) { 00331 W[k].Gen(NumComs + 1); 00332 } 00333 } 00334 void SetAttrHoldOut(const int NID, const int KID) { 00335 int NIdx = NIDToIdx.GetKeyId(NID); 00336 HOKIDSV[NIdx].AddKey(KID); 00337 } 00338 void SetAttrHoldOutForOneNode(const int NID) { 00339 for (int k = 0; k < Attrs; k++) { 00340 SetAttrHoldOut(NID, k); 00341 } 00342 } 00343 void GetW(TVec<TFltV>& _W) { _W = W; } 00344 void SetW(TVec<TFltV>& _W) { W = _W; } 00345 void RandomInit(const int InitComs); 00346 void NeighborComInit(const int InitComs); 00347 void NeighborComInit(TFltIntPrV& NIdPhiV, const int InitComs); 00348 int GetNumComs() { return NumComs; } 00349 void SetCmtyVV(const TVec<TIntV>& CmtyVV); 00350 double Likelihood(const bool DoParallel = false); 00351 double LikelihoodForRow(const int UID); 00352 double LikelihoodForRow(const int UID, const TIntFltH& FU); 00353 double LikelihoodAttrKForRow(const int UID, const int K) { return LikelihoodAttrKForRow(UID, K, F[UID]); } 00354 double LikelihoodAttrKForRow(const int UID, const int K, const TIntFltH& FU) { return LikelihoodAttrKForRow(UID, K, FU, W[K]); } 00355 double LikelihoodAttrKForRow(const int UID, const int K, const TIntFltH& FU, const TFltV& WK); 00356 double LikelihoodForWK(const int K, const TFltV& WK) { 00357 double L = 0.0; 00358 for (int u = 0; u < F.Len(); u++) { 00359 if (HOKIDSV[u].IsKey(K)) { continue; } 00360 L += LikelihoodAttrKForRow(u, K, F[u], WK); 00361 } 00362 for (int c = 0; c < WK.Len() - 1; c++) { 00363 L -= LassoCoef * fabs(WK[c]); 00364 } 00365 return L; 00366 } 00367 double LikelihoodForWK(const int K) { return LikelihoodForWK(K, W[K]); } 00368 double LikelihoodAttr() { 00369 double L = 0.0; 00370 for (int k = 0; k < Attrs; k++) { 00371 for (int u = 0; u < F.Len(); u++) { 00372 if (HOKIDSV[u].IsKey(k)) { continue; } 00373 L += LikelihoodAttrKForRow(u, k, F[u], W[k]); 00374 } 00375 } 00376 return L; 00377 } 00378 double LikelihoodGraph() { 00379 double L = Likelihood(); 00380 //add regularization 00381 if (RegCoef > 0.0) { //L1 00382 for (int u = 0; u < F.Len(); u++) { 00383 L += RegCoef * Sum(F[u]); 00384 } 00385 } 00386 if (RegCoef < 0.0) { //L2 00387 for (int u = 0; u < F.Len(); u++) { 00388 L -= RegCoef * Norm2(F[u]); 00389 } 00390 } 00391 00392 return L - WeightAttr * LikelihoodAttr(); 00393 } 00394 void GenHoldOutAttr(const double HOFrac, TVec<TIntSet>& HOSetV) { 00395 HOSetV.Gen(F.Len()); 00396 int HoldOutCnt = (int) ceil(HOFrac * G->GetNodes() * Attrs); 00397 TIntPrSet NIDKIDSet(HoldOutCnt); 00398 int Cnt = 0; 00399 for (int h = 0; h < 10 * HoldOutCnt; h++) { 00400 int UID = Rnd.GetUniDevInt(F.Len()); 00401 int KID = Rnd.GetUniDevInt(Attrs); 00402 if (! NIDKIDSet.IsKey(TIntPr(UID, KID))) { 00403 NIDKIDSet.AddKey(TIntPr(UID, KID)); 00404 HOSetV[UID].AddKey(KID); 00405 Cnt++; 00406 } 00407 if (Cnt >= HoldOutCnt) { break; } 00408 } 00409 printf("%d hold out pairs generated for attributes\n", Cnt); 00410 } 00411 void SetHoldOut(const double HOFrac) { 00412 TVec<TIntSet> HoldOut; 00413 TCesnaUtil::GenHoldOutPairs(G, HoldOut, HOFrac, Rnd); 00414 GenHoldOutAttr(HOFrac, HOKIDSV); 00415 HOVIDSV = HoldOut; 00416 } 00417 void GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet); 00418 void GradientForWK(TFltV& GradV, const int K) { 00419 GradV.Gen(NumComs + 1); 00420 for (int u = 0; u < F.Len(); u++) { 00421 if (HOKIDSV[u].IsKey(K)) { continue; } 00422 double Pred = PredictAttrK(u, K); 00423 for (TIntFltH::TIter CI = F[u].BegI(); CI < F[u].EndI(); CI++) { 00424 GradV[CI.GetKey()] += (GetAttr(u, K) - Pred) * GetCom(u, CI.GetKey()); 00425 } 00426 GradV[NumComs] += (GetAttr(u, K) - Pred); 00427 } 00428 00429 for (int c = 0; c < GradV.Len() - 1; c++) { 00430 GradV[c] -= LassoCoef * TMath::Sign(GetW(c, K)); 00431 } 00432 } 00433 void GetCmtyVV(TVec<TIntV>& CmtyVV); 00434 void GetCmtyVV(TVec<TIntV>& CmtyVV, TVec<TFltV>& Wck, const double Thres, const int MinSz = 3); 00435 void GetCmtyVV(TVec<TIntV>& CmtyVV, const double Thres, const int MinSz = 3) { 00436 TVec<TFltV> TmpV; 00437 GetCmtyVV(CmtyVV, TmpV, Thres, MinSz); 00438 } 00439 void GetCmtyVV(TVec<TIntV>& CmtyVV, TVec<TFltV>& Wck) { 00440 GetCmtyVV(CmtyVV, Wck, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3); 00441 } 00442 00443 void GetCmtyVVUnSorted(TVec<TIntV>& CmtyVV); 00444 void GetCmtyVVUnSorted(TVec<TIntV>& CmtyVV, const double Thres, const int MinSz = 3); 00445 /* GetCmtyVVRelative: NOT working well (low accuracy) 00446 void GetCmtyVVRelative(TVec<TIntV>& CmtyVV, const int MinSz = 3) { 00447 CmtyVV.Clr(); 00448 for (int c = 0; c < NumComs; c++) { 00449 TIntV CmtyV; 00450 double MaxVal = 0.0; 00451 for (int u = 0; u < G->GetNodes(); u++) { 00452 if (GetCom(u, c) > MaxVal) { MaxVal = GetCom(u, c); } 00453 } 00454 if (MaxVal == 0.0) { continue; } 00455 for (int u = 0; u < G->GetNodes(); u++) { 00456 if (GetCom(u, c) > 0.5 * MaxVal) { CmtyV.Add(NIDToIdx[u]); } 00457 } 00458 if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); } 00459 } 00460 if ( NumComs != CmtyVV.Len()) { 00461 printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len()); 00462 } 00463 } 00464 */ 00465 int FindComs(TIntV& ComsV, const bool UseBIC = false, const double HOFrac = 0.2, const int NumThreads = 20, const TStr PlotLFNm = TStr(), const double StepAlpha = 0.3, const double StepBeta = 0.1); 00466 int FindComs(const int NumThreads, const int MaxComs, const int MinComs, const int DivComs, const TStr OutFNm, const bool UseBIC = false, const double HOFrac = 0.1, const double StepAlpha = 0.3, const double StepBeta = 0.3); 00467 void DisplayAttrs(const int TopK, const TStrHash<TInt>& NodeNameH) { 00468 for (int u = 0; u < X.Len(); u++) { 00469 if (NodeNameH.Len() > 0) { 00470 printf("NID: %s\t Attrs: ", NodeNameH.GetKey(NIDToIdx[u])); 00471 } else { 00472 printf("NID: %d\t Attrs: ", NIDToIdx[u].Val); 00473 } 00474 for (int k = 0; k < X[u].Len(); k++) { 00475 printf("%d, ", X[u][k].Val); 00476 } 00477 printf("\n"); 00478 if (u >= TopK) { break; } 00479 } 00480 } 00481 double LikelihoodHoldOut(); 00482 double GetStepSizeByLineSearch(const int UID, const TIntFltH& DeltaV, const TIntFltH& GradV, const double& Alpha, const double& Beta, const int MaxIter = 10); 00483 double GetStepSizeByLineSearchForWK(const int K, const TFltV& DeltaV, const TFltV& GradV, const double& Alpha, const double& Beta, const int MaxIter = 10) { 00484 double StepSize = 1.0; 00485 double InitLikelihood = LikelihoodForWK(K); 00486 TFltV NewVarV(DeltaV.Len()); 00487 IAssert(DeltaV.Len() == NumComs + 1); 00488 for(int iter = 0; iter < MaxIter; iter++) { 00489 for (int c = 0; c < DeltaV.Len(); c++){ 00490 double NewVal = W[K][c] + StepSize * DeltaV[c]; 00491 if (NewVal < MinValW) { NewVal = MinValW; } 00492 if (NewVal > MaxValW) { NewVal = MaxValW; } 00493 NewVarV[c] = NewVal; 00494 } 00495 if (LikelihoodForWK(K, NewVarV) < InitLikelihood + Alpha * StepSize * TLinAlg::DotProduct(GradV, DeltaV)) { 00496 StepSize *= Beta; 00497 } else { 00498 break; 00499 } 00500 if (iter == MaxIter - 1) { 00501 StepSize = 0.0; 00502 break; 00503 } 00504 } 00505 return StepSize; 00506 } 00507 int GetPositiveW() { 00508 int PosCnt = 0; 00509 for (int c = 0; c < NumComs; c++) { 00510 for (int k = 0; k < Attrs; k++) { 00511 if (GetW(c, k) > 0.0) { PosCnt++; } 00512 } 00513 } 00514 return PosCnt; 00515 } 00516 int MLEGradAscent(const double& Thres, const int& MaxIter, const TStr PlotNm, const double StepAlpha = 0.3, const double StepBeta = 0.1); 00517 int MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha = 0.3, const double StepBeta = 0.1); 00518 int MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const TStr PlotNm = TStr(), const double StepAlpha = 0.3, const double StepBeta = 0.1) { 00519 int ChunkSize = G->GetNodes() / 10 / ChunkNum; 00520 if (ChunkSize == 0) { ChunkSize = 1; } 00521 return MLEGradAscentParallel(Thres, MaxIter, ChunkNum, ChunkSize, PlotNm, StepAlpha, StepBeta); 00522 } 00523 //double FindOptimalThres(const TVec<TIntV>& TrueCmtyVV, TVec<TIntV>& CmtyVV); 00524 double inline GetCom(const int& NID, const int& CID) { 00525 if (F[NID].IsKey(CID)) { 00526 return F[NID].GetDat(CID); 00527 } else { 00528 return 0.0; 00529 } 00530 } 00531 double inline GetAttr(const int& NID, const int& K) { 00532 if (X[NID].IsKey(K)) { 00533 return 1.0; 00534 } else { 00535 return 0.0; 00536 } 00537 } 00538 void inline AddCom(const int& NID, const int& CID, const double& Val) { 00539 if (F[NID].IsKey(CID)) { 00540 SumFV[CID] -= F[NID].GetDat(CID); 00541 } 00542 F[NID].AddDat(CID) = Val; 00543 SumFV[CID] += Val; 00544 } 00545 00546 void inline DelCom(const int& NID, const int& CID) { 00547 if (F[NID].IsKey(CID)) { 00548 SumFV[CID] -= F[NID].GetDat(CID); 00549 F[NID].DelKey(CID); 00550 } 00551 } 00552 /* 00553 double inline DotProduct(const TIntFltH& UV, const TFltV& VV) { 00554 double DP = 0; 00555 for (TIntFltH::TIter HI = UV.BegI(); HI < UV.EndI(); HI++) { 00556 DP += VV[HI.GetKey()] * HI.GetDat(); 00557 } 00558 return DP; 00559 } 00560 */ 00561 double inline DotProduct(const TIntFltH& UV, const TIntFltH& VV) { 00562 double DP = 0; 00563 if (UV.Len() > VV.Len()) { 00564 for (TIntFltH::TIter HI = UV.BegI(); HI < UV.EndI(); HI++) { 00565 if (VV.IsKey(HI.GetKey())) { 00566 DP += VV.GetDat(HI.GetKey()) * HI.GetDat(); 00567 } 00568 } 00569 } else { 00570 for (TIntFltH::TIter HI = VV.BegI(); HI < VV.EndI(); HI++) { 00571 if (UV.IsKey(HI.GetKey())) { 00572 DP += UV.GetDat(HI.GetKey()) * HI.GetDat(); 00573 } 00574 } 00575 } 00576 return DP; 00577 } 00578 double inline DotProduct(const int& UID, const int& VID) { 00579 return DotProduct(F[UID], F[VID]); 00580 } 00581 double inline Prediction(const TIntFltH& FU, const TIntFltH& FV) { 00582 double DP = log (1.0 / (1.0 - PNoCom)) + DotProduct(FU, FV); 00583 IAssertR(DP > 0.0, TStr::Fmt("DP: %f", DP)); 00584 return exp(- DP); 00585 } 00586 double inline PredictAttrK(const TIntFltH& FU, const TFltV& WK) { 00587 double DP = 0.0; 00588 for (TIntFltH::TIter FI = FU.BegI(); FI < FU.EndI(); FI++) { 00589 DP += FI.GetDat() * WK[FI.GetKey()]; 00590 } 00591 DP += WK.Last(); 00592 return Sigmoid(DP); 00593 } 00594 double inline PredictAttrK(const TIntFltH& FU, const int K) { 00595 return PredictAttrK(FU, W[K]); 00596 } 00597 double inline PredictAttrK(const int UID, const int K) { 00598 return PredictAttrK(F[UID], W[K]); 00599 } 00600 double inline GetW(const int CID, const int K) { 00601 return W[K][CID]; 00602 } 00603 double inline Prediction(const int& UID, const int& VID) { 00604 return Prediction(F[UID], F[VID]); 00605 } 00606 double inline Sum(const TIntFltH& UV) { 00607 double N = 0.0; 00608 for (TIntFltH::TIter HI = UV.BegI(); HI < UV.EndI(); HI++) { 00609 N += HI.GetDat(); 00610 } 00611 return N; 00612 } 00613 double inline Norm2(const TIntFltH& UV) { 00614 double N = 0.0; 00615 for (TIntFltH::TIter HI = UV.BegI(); HI < UV.EndI(); HI++) { 00616 N += HI.GetDat() * HI.GetDat(); 00617 } 00618 return N; 00619 } 00620 /* 00621 double inline Norm1(const TFltV& UV) { 00622 double N = 0.0; 00623 for (int i = 0; i < UV.Len(); i++) { 00624 N += fabs(UV[i]); 00625 } 00626 return N; 00627 } 00628 */ 00629 double inline Sigmoid(const double X) { 00630 return 1.0 / ( 1.0 + exp(-X)); 00631 } 00632 }; 00633 00634 00635 #endif