SNAP Library 4.0, User Reference  2017-07-27 13:18:06
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
word2vec.cpp File Reference
#include "stdafx.h"
#include "Snap.h"
#include "word2vec.h"

Go to the source code of this file.

Functions

void LearnVocab (TVVec< TInt, int64 > &WalksVV, TIntV &Vocab)
 
void InitUnigramTable (TIntV &Vocab, TIntV &KTable, TFltV &UTable)
 
int64 RndUnigramInt (TIntV &KTable, TFltV &UTable, TRnd &Rnd)
 
void InitNegEmb (TIntV &Vocab, int &Dimensions, TVVec< TFlt, int64 > &SynNeg)
 
void InitPosEmb (TIntV &Vocab, int &Dimensions, TRnd &Rnd, TVVec< TFlt, int64 > &SynPos)
 
void TrainModel (TVVec< TInt, int64 > &WalksVV, int &Dimensions, int &WinSize, int &Iter, bool &Verbose, TIntV &KTable, TFltV &UTable, int64 &WordCntAll, TFltV &ExpTable, double &Alpha, int64 CurrWalk, TRnd &Rnd, TVVec< TFlt, int64 > &SynNeg, TVVec< TFlt, int64 > &SynPos)
 
void LearnEmbeddings (TVVec< TInt, int64 > &WalksVV, int &Dimensions, int &WinSize, int &Iter, bool &Verbose, TIntFltVH &EmbeddingsHV)
 Learns embeddings using SGD, Skip-gram with negative sampling. More...
 

Function Documentation

void InitNegEmb ( TIntV Vocab,
int &  Dimensions,
TVVec< TFlt, int64 > &  SynNeg 
)

Definition at line 63 of file word2vec.cpp.

63  {
64  SynNeg = TVVec<TFlt, int64>(Vocab.Len(),Dimensions);
65  for (int64 i = 0; i < SynNeg.GetXDim(); i++) {
66  for (int j = 0; j < SynNeg.GetYDim(); j++) {
67  SynNeg(i,j) = 0;
68  }
69  }
70 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
Definition: ds.h:2222
TSizeTy GetYDim() const
Definition: ds.h:2250
long long int64
Definition: bd.h:27
TSizeTy GetXDim() const
Definition: ds.h:2249
void InitPosEmb ( TIntV Vocab,
int &  Dimensions,
TRnd Rnd,
TVVec< TFlt, int64 > &  SynPos 
)

Definition at line 73 of file word2vec.cpp.

73  {
74  SynPos = TVVec<TFlt, int64>(Vocab.Len(),Dimensions);
75  for (int64 i = 0; i < SynPos.GetXDim(); i++) {
76  for (int j = 0; j < SynPos.GetYDim(); j++) {
77  SynPos(i,j) =(Rnd.GetUniDev()-0.5)/Dimensions;
78  }
79  }
80 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
Definition: ds.h:2222
TSizeTy GetYDim() const
Definition: ds.h:2250
long long int64
Definition: bd.h:27
TSizeTy GetXDim() const
Definition: ds.h:2249
double GetUniDev()
Definition: dt.h:30
void InitUnigramTable ( TIntV Vocab,
TIntV KTable,
TFltV UTable 
)

Definition at line 18 of file word2vec.cpp.

18  {
19  double TrainWordsPow = 0;
20  double Pwr = 0.75;
21  TFltV ProbV(Vocab.Len());
22  for (int64 i = 0; i < Vocab.Len(); i++) {
23  ProbV[i]=TMath::Power(Vocab[i],Pwr);
24  TrainWordsPow += ProbV[i];
25  KTable[i]=0;
26  UTable[i]=0;
27  }
28  for (int64 i = 0; i < ProbV.Len(); i++) {
29  ProbV[i] /= TrainWordsPow;
30  }
31  TIntV UnderV;
32  TIntV OverV;
33  for (int64 i = 0; i < ProbV.Len(); i++) {
34  UTable[i] = ProbV[i] * ProbV.Len();
35  if ( UTable[i] < 1 ) {
36  UnderV.Add(i);
37  } else {
38  OverV.Add(i);
39  }
40  }
41  while(UnderV.Len() > 0 && OverV.Len() > 0) {
42  int64 Small = UnderV.Last();
43  int64 Large = OverV.Last();
44  UnderV.DelLast();
45  OverV.DelLast();
46  KTable[Small] = Large;
47  UTable[Large] = UTable[Large] + UTable[Small] - 1;
48  if (UTable[Large] < 1) {
49  UnderV.Add(Large);
50  } else {
51  OverV.Add(Large);
52  }
53  }
54 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
static double Power(const double &Base, const double &Exponent)
Definition: xmath.h:25
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
long long int64
Definition: bd.h:27
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
void DelLast()
Removes the last element of the vector.
Definition: ds.h:665
void LearnEmbeddings ( TVVec< TInt, int64 > &  WalksVV,
int &  Dimensions,
int &  WinSize,
int &  Iter,
bool &  Verbose,
TIntFltVH EmbeddingsHV 
)

Learns embeddings using SGD, Skip-gram with negative sampling.

Definition at line 148 of file word2vec.cpp.

149  {
150  TIntIntH RnmH;
151  TIntIntH RnmBackH;
152  int64 NNodes = 0;
153  //renaming nodes into consecutive numbers
154  for (int i = 0; i < WalksVV.GetXDim(); i++) {
155  for (int64 j = 0; j < WalksVV.GetYDim(); j++) {
156  if ( RnmH.IsKey(WalksVV(i, j)) ) {
157  WalksVV(i, j) = RnmH.GetDat(WalksVV(i, j));
158  } else {
159  RnmH.AddDat(WalksVV(i,j),NNodes);
160  RnmBackH.AddDat(NNodes,WalksVV(i, j));
161  WalksVV(i, j) = NNodes++;
162  }
163  }
164  }
165  TIntV Vocab(NNodes);
166  LearnVocab(WalksVV, Vocab);
167  TIntV KTable(NNodes);
168  TFltV UTable(NNodes);
169  TVVec<TFlt, int64> SynNeg;
170  TVVec<TFlt, int64> SynPos;
171  TRnd Rnd(time(NULL));
172  InitPosEmb(Vocab, Dimensions, Rnd, SynPos);
173  InitNegEmb(Vocab, Dimensions, SynNeg);
174  InitUnigramTable(Vocab, KTable, UTable);
175  TFltV ExpTable(TableSize);
176  double Alpha = StartAlpha; //learning rate
177 #pragma omp parallel for schedule(dynamic)
178  for (int i = 0; i < TableSize; i++ ) {
179  double Value = -MaxExp + static_cast<double>(i) / static_cast<double>(ExpTablePrecision);
180  ExpTable[i] = TMath::Power(TMath::E, Value);
181  }
182  int64 WordCntAll = 0;
183 // op RS 2016/09/26, collapse does not compile on Mac OS X
184 //#pragma omp parallel for schedule(dynamic) collapse(2)
185  for (int j = 0; j < Iter; j++) {
186 #pragma omp parallel for schedule(dynamic)
187  for (int64 i = 0; i < WalksVV.GetXDim(); i++) {
188  TrainModel(WalksVV, Dimensions, WinSize, Iter, Verbose, KTable, UTable,
189  WordCntAll, ExpTable, Alpha, i, Rnd, SynNeg, SynPos);
190  }
191  }
192  if (Verbose) { printf("\n"); fflush(stdout); }
193  for (int64 i = 0; i < SynPos.GetXDim(); i++) {
194  TFltV CurrV(SynPos.GetYDim());
195  for (int j = 0; j < SynPos.GetYDim(); j++) { CurrV[j] = SynPos(i, j); }
196  EmbeddingsHV.AddDat(RnmBackH.GetDat(i), CurrV);
197  }
198 }
Definition: dt.h:11
void InitPosEmb(TIntV &Vocab, int &Dimensions, TRnd &Rnd, TVVec< TFlt, int64 > &SynPos)
Definition: word2vec.cpp:73
void InitUnigramTable(TIntV &Vocab, TIntV &KTable, TFltV &UTable)
Definition: word2vec.cpp:18
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:262
const int MaxExp
Definition: word2vec.h:9
Definition: ds.h:2222
TSizeTy GetYDim() const
Definition: ds.h:2250
static double Power(const double &Base, const double &Exponent)
Definition: xmath.h:25
void TrainModel(TVVec< TInt, int64 > &WalksVV, int &Dimensions, int &WinSize, int &Iter, bool &Verbose, TIntV &KTable, TFltV &UTable, int64 &WordCntAll, TFltV &ExpTable, double &Alpha, int64 CurrWalk, TRnd &Rnd, TVVec< TFlt, int64 > &SynNeg, TVVec< TFlt, int64 > &SynPos)
Definition: word2vec.cpp:82
void InitNegEmb(TIntV &Vocab, int &Dimensions, TVVec< TFlt, int64 > &SynNeg)
Definition: word2vec.cpp:63
long long int64
Definition: bd.h:27
const double StartAlpha
Definition: word2vec.h:19
TSizeTy GetXDim() const
Definition: ds.h:2249
Definition: hash.h:97
const int ExpTablePrecision
Definition: word2vec.h:12
void LearnVocab(TVVec< TInt, int64 > &WalksVV, TIntV &Vocab)
Definition: word2vec.cpp:8
const int TableSize
Definition: word2vec.h:13
bool IsKey(const TKey &Key) const
Definition: hash.h:258
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
static double E
Definition: xmath.h:7
void LearnVocab ( TVVec< TInt, int64 > &  WalksVV,
TIntV Vocab 
)

Definition at line 8 of file word2vec.cpp.

8  {
9  for( int64 i = 0; i < Vocab.Len(); i++) { Vocab[i] = 0; }
10  for( int64 i = 0; i < WalksVV.GetXDim(); i++) {
11  for( int j = 0; j < WalksVV.GetYDim(); j++) {
12  Vocab[WalksVV(i,j)]++;
13  }
14  }
15 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
TSizeTy GetYDim() const
Definition: ds.h:2250
long long int64
Definition: bd.h:27
TSizeTy GetXDim() const
Definition: ds.h:2249
int64 RndUnigramInt ( TIntV KTable,
TFltV UTable,
TRnd Rnd 
)

Definition at line 56 of file word2vec.cpp.

56  {
57  TInt X = KTable[static_cast<int64>(Rnd.GetUniDev()*KTable.Len())];
58  double Y = Rnd.GetUniDev();
59  return Y < UTable[X] ? X : KTable[X];
60 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
Definition: dt.h:1134
long long int64
Definition: bd.h:27
double GetUniDev()
Definition: dt.h:30
void TrainModel ( TVVec< TInt, int64 > &  WalksVV,
int &  Dimensions,
int &  WinSize,
int &  Iter,
bool &  Verbose,
TIntV KTable,
TFltV UTable,
int64 WordCntAll,
TFltV ExpTable,
double &  Alpha,
int64  CurrWalk,
TRnd Rnd,
TVVec< TFlt, int64 > &  SynNeg,
TVVec< TFlt, int64 > &  SynPos 
)

Definition at line 82 of file word2vec.cpp.

84  {
85  TFltV Neu1V(Dimensions);
86  TFltV Neu1eV(Dimensions);
87  int64 AllWords = WalksVV.GetXDim()*WalksVV.GetYDim();
88  TIntV WalkV(WalksVV.GetYDim());
89  for (int j = 0; j < WalksVV.GetYDim(); j++) { WalkV[j] = WalksVV(CurrWalk,j); }
90  for (int64 WordI=0; WordI<WalkV.Len(); WordI++) {
91  if ( WordCntAll%10000 == 0 ) {
92  if ( Verbose ) {
93  printf("\rLearning Progress: %.2lf%% ",(double)WordCntAll*100/(double)(Iter*AllWords));
94  fflush(stdout);
95  }
96  Alpha = StartAlpha * (1 - WordCntAll / static_cast<double>(Iter * AllWords + 1));
97  if ( Alpha < StartAlpha * 0.0001 ) { Alpha = StartAlpha * 0.0001; }
98  }
99  int64 Word = WalkV[WordI];
100  for (int i = 0; i < Dimensions; i++) {
101  Neu1V[i] = 0;
102  Neu1eV[i] = 0;
103  }
104  int Offset = Rnd.GetUniDevInt() % WinSize;
105  for (int a = Offset; a < WinSize * 2 + 1 - Offset; a++) {
106  if (a == WinSize) { continue; }
107  int64 CurrWordI = WordI - WinSize + a;
108  if (CurrWordI < 0){ continue; }
109  if (CurrWordI >= WalkV.Len()){ continue; }
110  int64 CurrWord = WalkV[CurrWordI];
111  for (int i = 0; i < Dimensions; i++) { Neu1eV[i] = 0; }
112  //negative sampling
113  for (int j = 0; j < NegSamN+1; j++) {
114  int64 Target, Label;
115  if (j == 0) {
116  Target = Word;
117  Label = 1;
118  } else {
119  Target = RndUnigramInt(KTable, UTable, Rnd);
120  if (Target == Word) { continue; }
121  Label = 0;
122  }
123  double Product = 0;
124  for (int i = 0; i < Dimensions; i++) {
125  Product += SynPos(CurrWord,i) * SynNeg(Target,i);
126  }
127  double Grad; //Gradient multiplied by learning rate
128  if (Product > MaxExp) { Grad = (Label - 1) * Alpha; }
129  else if (Product < -MaxExp) { Grad = Label * Alpha; }
130  else {
131  double Exp = ExpTable[static_cast<int>(Product*ExpTablePrecision)+TableSize/2];
132  Grad = (Label - 1 + 1 / (1 + Exp)) * Alpha;
133  }
134  for (int i = 0; i < Dimensions; i++) {
135  Neu1eV[i] += Grad * SynNeg(Target,i);
136  SynNeg(Target,i) += Grad * SynPos(CurrWord,i);
137  }
138  }
139  for (int i = 0; i < Dimensions; i++) {
140  SynPos(CurrWord,i) += Neu1eV[i];
141  }
142  }
143  WordCntAll++;
144  }
145 }
const int MaxExp
Definition: word2vec.h:9
TSizeTy GetYDim() const
Definition: ds.h:2250
long long int64
Definition: bd.h:27
const double StartAlpha
Definition: word2vec.h:19
TSizeTy GetXDim() const
Definition: ds.h:2249
const int ExpTablePrecision
Definition: word2vec.h:12
const int TableSize
Definition: word2vec.h:13
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
int64 RndUnigramInt(TIntV &KTable, TFltV &UTable, TRnd &Rnd)
Definition: word2vec.cpp:56
const int NegSamN
Definition: word2vec.h:16