SNAP Library 4.1, Developer Reference  2018-07-26 16:30:42
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
word2vec.cpp File Reference
#include "stdafx.h"
#include "Snap.h"
#include "word2vec.h"
Include dependency graph for word2vec.cpp:

Go to the source code of this file.

Functions

void LearnVocab (TVVec< TInt, int64 > &WalksVV, TIntV &Vocab)
 
void InitUnigramTable (TIntV &Vocab, TIntV &KTable, TFltV &UTable)
 
int64 RndUnigramInt (TIntV &KTable, TFltV &UTable, TRnd &Rnd)
 
void InitNegEmb (TIntV &Vocab, const int &Dimensions, TVVec< TFlt, int64 > &SynNeg)
 
void InitPosEmb (TIntV &Vocab, const int &Dimensions, TRnd &Rnd, TVVec< TFlt, int64 > &SynPos)
 
void TrainModel (TVVec< TInt, int64 > &WalksVV, const int &Dimensions, const int &WinSize, const int &Iter, const bool &Verbose, TIntV &KTable, TFltV &UTable, int64 &WordCntAll, TFltV &ExpTable, double &Alpha, int64 CurrWalk, TRnd &Rnd, TVVec< TFlt, int64 > &SynNeg, TVVec< TFlt, int64 > &SynPos)
 
void LearnEmbeddings (TVVec< TInt, int64 > &WalksVV, const int &Dimensions, const int &WinSize, const int &Iter, const bool &Verbose, TIntFltVH &EmbeddingsHV)
 Learns embeddings using SGD, Skip-gram with negative sampling. More...
 

Function Documentation

void InitNegEmb ( TIntV &  Vocab,
const int &  Dimensions,
TVVec< TFlt, int64 > &  SynNeg 
)

Definition at line 73 of file word2vec.cpp.

References TVVec< TVal, TSizeTy >::GetXDim(), TVVec< TVal, TSizeTy >::GetYDim(), and TVec< TVal, TSizeTy >::Len().

Referenced by LearnEmbeddings().

73  {
74  SynNeg = TVVec<TFlt, int64>(Vocab.Len(),Dimensions);
75  for (int64 i = 0; i < SynNeg.GetXDim(); i++) {
76  for (int j = 0; j < SynNeg.GetYDim(); j++) {
77  SynNeg(i,j) = 0;
78  }
79  }
80 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
Definition: ds.h:2223
TSizeTy GetYDim() const
Definition: ds.h:2251
long long int64
Definition: bd.h:27
TSizeTy GetXDim() const
Definition: ds.h:2250

Here is the call graph for this function:

Here is the caller graph for this function:

void InitPosEmb ( TIntV &  Vocab,
const int &  Dimensions,
TRnd &  Rnd,
TVVec< TFlt, int64 > &  SynPos 
)

Definition at line 83 of file word2vec.cpp.

References TRnd::GetUniDev(), TVVec< TVal, TSizeTy >::GetXDim(), TVVec< TVal, TSizeTy >::GetYDim(), and TVec< TVal, TSizeTy >::Len().

Referenced by LearnEmbeddings().

83  {
84  SynPos = TVVec<TFlt, int64>(Vocab.Len(),Dimensions);
85  for (int64 i = 0; i < SynPos.GetXDim(); i++) {
86  for (int j = 0; j < SynPos.GetYDim(); j++) {
87  SynPos(i,j) =(Rnd.GetUniDev()-0.5)/Dimensions;
88  }
89  }
90 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
Definition: ds.h:2223
TSizeTy GetYDim() const
Definition: ds.h:2251
long long int64
Definition: bd.h:27
TSizeTy GetXDim() const
Definition: ds.h:2250
double GetUniDev()
Definition: dt.h:30

Here is the call graph for this function:

Here is the caller graph for this function:

void InitUnigramTable ( TIntV &  Vocab,
TIntV &  KTable,
TFltV &  UTable 
)

Definition at line 18 of file word2vec.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::DelLast(), TVec< TVal, TSizeTy >::Last(), TVec< TVal, TSizeTy >::Len(), and TMath::Power().

Referenced by LearnEmbeddings().

18  {
19  double TrainWordsPow = 0;
20  double Pwr = 0.75;
21  TFltV ProbV(Vocab.Len());
22  for (int64 i = 0; i < Vocab.Len(); i++) {
23  ProbV[i]=TMath::Power(Vocab[i],Pwr);
24  TrainWordsPow += ProbV[i];
25  KTable[i]=0;
26  UTable[i]=0;
27  }
28  for (int64 i = 0; i < ProbV.Len(); i++) {
29  ProbV[i] /= TrainWordsPow;
30  }
31  TIntV UnderV;
32  TIntV OverV;
33  for (int64 i = 0; i < ProbV.Len(); i++) {
34  UTable[i] = ProbV[i] * ProbV.Len();
35  if ( UTable[i] < 1 ) {
36  UnderV.Add(i);
37  } else {
38  OverV.Add(i);
39  }
40  }
41  while(UnderV.Len() > 0 && OverV.Len() > 0) {
42  int64 Small = UnderV.Last();
43  int64 Large = OverV.Last();
44  UnderV.DelLast();
45  OverV.DelLast();
46  KTable[Small] = Large;
47  UTable[Large] = (UTable[Large] + UTable[Small]) - 1;
48  if (UTable[Large] < 1) {
49  UnderV.Add(Large);
50  } else {
51  OverV.Add(Large);
52  }
53  }
54  while(UnderV.Len() > 0){
55  int64 curr = UnderV.Last();
56  UnderV.DelLast();
57  UTable[curr]=1;
58  }
59  while(OverV.Len() > 0){
60  int64 curr = OverV.Last();
61  OverV.DelLast();
62  UTable[curr]=1;
63  }
64 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
static double Power(const double &Base, const double &Exponent)
Definition: xmath.h:25
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
long long int64
Definition: bd.h:27
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
void DelLast()
Removes the last element of the vector.
Definition: ds.h:665

Here is the call graph for this function:

Here is the caller graph for this function:

void LearnEmbeddings ( TVVec< TInt, int64 > &  WalksVV,
const int &  Dimensions,
const int &  WinSize,
const int &  Iter,
const bool &  Verbose,
TIntFltVH &  EmbeddingsHV 
)

Learns embeddings using SGD, Skip-gram with negative sampling.

Definition at line 160 of file word2vec.cpp.

References THash< TKey, TDat, THashFunc >::AddDat(), TMath::E, ExpTablePrecision, THash< TKey, TDat, THashFunc >::GetDat(), TVVec< TVal, TSizeTy >::GetXDim(), TVVec< TVal, TSizeTy >::GetYDim(), InitNegEmb(), InitPosEmb(), InitUnigramTable(), THash< TKey, TDat, THashFunc >::IsKey(), LearnVocab(), MaxExp, TMath::Power(), StartAlpha, TableSize, and TrainModel().

Referenced by node2vec().

162  {
163  TIntIntH RnmH;
164  TIntIntH RnmBackH;
165  int64 NNodes = 0;
166  //renaming nodes into consecutive numbers
167  for (int i = 0; i < WalksVV.GetXDim(); i++) {
168  for (int64 j = 0; j < WalksVV.GetYDim(); j++) {
169  if ( RnmH.IsKey(WalksVV(i, j)) ) {
170  WalksVV(i, j) = RnmH.GetDat(WalksVV(i, j));
171  } else {
172  RnmH.AddDat(WalksVV(i,j),NNodes);
173  RnmBackH.AddDat(NNodes,WalksVV(i, j));
174  WalksVV(i, j) = NNodes++;
175  }
176  }
177  }
178  TIntV Vocab(NNodes);
179  LearnVocab(WalksVV, Vocab);
180  TIntV KTable(NNodes);
181  TFltV UTable(NNodes);
182  TVVec<TFlt, int64> SynNeg;
183  TVVec<TFlt, int64> SynPos;
184  TRnd Rnd(time(NULL));
185  InitPosEmb(Vocab, Dimensions, Rnd, SynPos);
186  InitNegEmb(Vocab, Dimensions, SynNeg);
187  InitUnigramTable(Vocab, KTable, UTable);
188  TFltV ExpTable(TableSize);
189  double Alpha = StartAlpha; //learning rate
190 #pragma omp parallel for schedule(dynamic)
191  for (int i = 0; i < TableSize; i++ ) {
192  double Value = -MaxExp + static_cast<double>(i) / static_cast<double>(ExpTablePrecision);
193  ExpTable[i] = TMath::Power(TMath::E, Value);
194  }
195  int64 WordCntAll = 0;
196 // op RS 2016/09/26, collapse does not compile on Mac OS X
197 //#pragma omp parallel for schedule(dynamic) collapse(2)
198  for (int j = 0; j < Iter; j++) {
199 #pragma omp parallel for schedule(dynamic)
200  for (int64 i = 0; i < WalksVV.GetXDim(); i++) {
201  TrainModel(WalksVV, Dimensions, WinSize, Iter, Verbose, KTable, UTable,
202  WordCntAll, ExpTable, Alpha, i, Rnd, SynNeg, SynPos);
203  }
204  }
205  if (Verbose) { printf("\n"); fflush(stdout); }
206  for (int64 i = 0; i < SynPos.GetXDim(); i++) {
207  TFltV CurrV(SynPos.GetYDim());
208  for (int j = 0; j < SynPos.GetYDim(); j++) { CurrV[j] = SynPos(i, j); }
209  EmbeddingsHV.AddDat(RnmBackH.GetDat(i), CurrV);
210  }
211 }
Definition: dt.h:11
void InitUnigramTable(TIntV &Vocab, TIntV &KTable, TFltV &UTable)
Definition: word2vec.cpp:18
void InitNegEmb(TIntV &Vocab, const int &Dimensions, TVVec< TFlt, int64 > &SynNeg)
Definition: word2vec.cpp:73
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:262
const int MaxExp
Definition: word2vec.h:10
Definition: ds.h:2223
TSizeTy GetYDim() const
Definition: ds.h:2251
static double Power(const double &Base, const double &Exponent)
Definition: xmath.h:25
void TrainModel(TVVec< TInt, int64 > &WalksVV, const int &Dimensions, const int &WinSize, const int &Iter, const bool &Verbose, TIntV &KTable, TFltV &UTable, int64 &WordCntAll, TFltV &ExpTable, double &Alpha, int64 CurrWalk, TRnd &Rnd, TVVec< TFlt, int64 > &SynNeg, TVVec< TFlt, int64 > &SynPos)
Definition: word2vec.cpp:92
long long int64
Definition: bd.h:27
const double StartAlpha
Definition: word2vec.h:20
TSizeTy GetXDim() const
Definition: ds.h:2250
Definition: hash.h:97
const int ExpTablePrecision
Definition: word2vec.h:13
void LearnVocab(TVVec< TInt, int64 > &WalksVV, TIntV &Vocab)
Definition: word2vec.cpp:8
const int TableSize
Definition: word2vec.h:14
bool IsKey(const TKey &Key) const
Definition: hash.h:258
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
static double E
Definition: xmath.h:7
void InitPosEmb(TIntV &Vocab, const int &Dimensions, TRnd &Rnd, TVVec< TFlt, int64 > &SynPos)
Definition: word2vec.cpp:83

Here is the call graph for this function:

Here is the caller graph for this function:

void LearnVocab ( TVVec< TInt, int64 > &  WalksVV,
TIntV &  Vocab 
)

Definition at line 8 of file word2vec.cpp.

References TVVec< TVal, TSizeTy >::GetXDim(), TVVec< TVal, TSizeTy >::GetYDim(), and TVec< TVal, TSizeTy >::Len().

Referenced by LearnEmbeddings().

8  {
9  for( int64 i = 0; i < Vocab.Len(); i++) { Vocab[i] = 0; }
10  for( int64 i = 0; i < WalksVV.GetXDim(); i++) {
11  for( int j = 0; j < WalksVV.GetYDim(); j++) {
12  Vocab[WalksVV(i,j)]++;
13  }
14  }
15 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
TSizeTy GetYDim() const
Definition: ds.h:2251
long long int64
Definition: bd.h:27
TSizeTy GetXDim() const
Definition: ds.h:2250

Here is the call graph for this function:

Here is the caller graph for this function:

int64 RndUnigramInt ( TIntV &  KTable,
TFltV &  UTable,
TRnd &  Rnd 
)

Definition at line 66 of file word2vec.cpp.

References TRnd::GetUniDev(), and TVec< TVal, TSizeTy >::Len().

Referenced by TrainModel().

66  {
67  TInt X = KTable[static_cast<int64>(Rnd.GetUniDev()*KTable.Len())];
68  double Y = Rnd.GetUniDev();
69  return Y < UTable[X] ? X : KTable[X];
70 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
Definition: dt.h:1134
long long int64
Definition: bd.h:27
double GetUniDev()
Definition: dt.h:30

Here is the call graph for this function:

Here is the caller graph for this function:

void TrainModel ( TVVec< TInt, int64 > &  WalksVV,
const int &  Dimensions,
const int &  WinSize,
const int &  Iter,
const bool &  Verbose,
TIntV &  KTable,
TFltV &  UTable,
int64 &  WordCntAll,
TFltV &  ExpTable,
double &  Alpha,
int64  CurrWalk,
TRnd &  Rnd,
TVVec< TFlt, int64 > &  SynNeg,
TVVec< TFlt, int64 > &  SynPos 
)

Definition at line 92 of file word2vec.cpp.

References ExpTablePrecision, TRnd::GetUniDevInt(), TVVec< TVal, TSizeTy >::GetXDim(), TVVec< TVal, TSizeTy >::GetYDim(), MaxExp, NegSamN, RndUnigramInt(), StartAlpha, and TableSize.

Referenced by LearnEmbeddings().

96  {
97  TFltV Neu1V(Dimensions);
98  TFltV Neu1eV(Dimensions);
99  int64 AllWords = WalksVV.GetXDim()*WalksVV.GetYDim();
100  TIntV WalkV(WalksVV.GetYDim());
101  for (int j = 0; j < WalksVV.GetYDim(); j++) { WalkV[j] = WalksVV(CurrWalk,j); }
102  for (int64 WordI=0; WordI<WalkV.Len(); WordI++) {
103  if ( WordCntAll%10000 == 0 ) {
104  if ( Verbose ) {
105  printf("\rLearning Progress: %.2lf%% ",(double)WordCntAll*100/(double)(Iter*AllWords));
106  fflush(stdout);
107  }
108  Alpha = StartAlpha * (1 - WordCntAll / static_cast<double>(Iter * AllWords + 1));
109  if ( Alpha < StartAlpha * 0.0001 ) { Alpha = StartAlpha * 0.0001; }
110  }
111  int64 Word = WalkV[WordI];
112  for (int i = 0; i < Dimensions; i++) {
113  Neu1V[i] = 0;
114  Neu1eV[i] = 0;
115  }
116  int Offset = Rnd.GetUniDevInt() % WinSize;
117  for (int a = Offset; a < WinSize * 2 + 1 - Offset; a++) {
118  if (a == WinSize) { continue; }
119  int64 CurrWordI = WordI - WinSize + a;
120  if (CurrWordI < 0){ continue; }
121  if (CurrWordI >= WalkV.Len()){ continue; }
122  int64 CurrWord = WalkV[CurrWordI];
123  for (int i = 0; i < Dimensions; i++) { Neu1eV[i] = 0; }
124  //negative sampling
125  for (int j = 0; j < NegSamN+1; j++) {
126  int64 Target, Label;
127  if (j == 0) {
128  Target = Word;
129  Label = 1;
130  } else {
131  Target = RndUnigramInt(KTable, UTable, Rnd);
132  if (Target == Word) { continue; }
133  Label = 0;
134  }
135  double Product = 0;
136  for (int i = 0; i < Dimensions; i++) {
137  Product += SynPos(CurrWord,i) * SynNeg(Target,i);
138  }
139  double Grad; //Gradient multiplied by learning rate
140  if (Product > MaxExp) { Grad = (Label - 1) * Alpha; }
141  else if (Product < -MaxExp) { Grad = Label * Alpha; }
142  else {
143  double Exp = ExpTable[static_cast<int>(Product*ExpTablePrecision)+TableSize/2];
144  Grad = (Label - 1 + 1 / (1 + Exp)) * Alpha;
145  }
146  for (int i = 0; i < Dimensions; i++) {
147  Neu1eV[i] += Grad * SynNeg(Target,i);
148  SynNeg(Target,i) += Grad * SynPos(CurrWord,i);
149  }
150  }
151  for (int i = 0; i < Dimensions; i++) {
152  SynPos(CurrWord,i) += Neu1eV[i];
153  }
154  }
155  WordCntAll++;
156  }
157 }
const int MaxExp
Definition: word2vec.h:10
TSizeTy GetYDim() const
Definition: ds.h:2251
long long int64
Definition: bd.h:27
const double StartAlpha
Definition: word2vec.h:20
TSizeTy GetXDim() const
Definition: ds.h:2250
const int ExpTablePrecision
Definition: word2vec.h:13
const int TableSize
Definition: word2vec.h:14
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
int64 RndUnigramInt(TIntV &KTable, TFltV &UTable, TRnd &Rnd)
Definition: word2vec.cpp:66
const int NegSamN
Definition: word2vec.h:17

Here is the call graph for this function:

Here is the caller graph for this function: