5   for (
int i = 1; i < CdfV.
Len(); i++) {
 
    6     CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
 
   11   for (
int i = 1; i < CdfV.
Len(); i++) {
 
   12     CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
 
   17   for (
int i = 1; i < CdfV.
Len(); i++) {
 
   18     CdfV[i].Dat = CdfV[i-1].Dat + CdfV[i].Dat; }
 
   35   for (
int i = CCdfV.
Len()-2; i >= 0; i--) {
 
   36     CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
 
   41   for (
int i = CCdfV.
Len()-2; i >= 0; i--) {
 
   42     CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
 
   47   for (
int i = CCdfV.
Len()-2; i >= 0; i--) {
 
   48     CCdfV[i].Dat = CCdfV[i+1].Dat + CCdfV[i].Dat; }
 
   65   for (
int i = PdfV.
Len()-1; i > 0; i--) {
 
   66     PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
 
   71   for (
int i = PdfV.
Len()-1; i > 0; i--) {
 
   72     PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
 
   77   for (
int i = PdfV.
Len()-1; i > 0; i--) {
 
   78     PdfV[i].Dat = PdfV[i].Dat - PdfV[i-1].Dat; }
 
   83   for (
int i = 0; i < PdfV.
Len(); i++) {
 
   84     Sum += PdfV[i].Val2; }
 
   85   if (Sum <= 0.0) { 
return; }
 
   86   for (
int i = 0; i < PdfV.
Len(); i++) {
 
   87     PdfV[i].Val2 /= Sum; }
 
   92   for (
int i = 0; i < PdfV.
Len(); i++) {
 
   94   if (Sum <= 0.0) { 
return; }
 
   95   for (
int i = 0; i < PdfV.
Len(); i++) {
 
  110   for (
int i = 0; i < YValV.
Len(); ) {
 
  111     ExpYValV.
Add(YValV[i]);
 
  112     i = int(i*BinFactor);
 
  113     if (i==prevI) { i++; }
 
  121   for (
int i = 0; i < YValV.
Len(); ) {
 
  122     ExpYValV.
Add(YValV[i]);
 
  123     i = int(i*BinFactor);
 
  124     if (i==prevI) { i++; }
 
  188   int EndSlash = UrlChA.
SearchCh(
'/', 7)-1; 
 
  190     const int BegSlash = UrlChA.
SearchChBack(
'/', EndSlash);
 
  191     if (BegSlash > 0) { 
return UrlChA.
GetSubStr(BegSlash+1, EndSlash).
ToLc(); }
 
  196     if (EndSlash > 0) { 
return UrlChA.
GetSubStr(0, EndSlash-1).
ToLc(); }
 
  197     else { 
return TChA(UrlChA).
ToLc(); }
 
  208   const char *c = Url.
CStr();
 
  210   while (*c && cnt != Count) {
 
  211     if (*c == Ch) { cnt++; }
 
  214   return int(c-Url.
CStr()-1);
 
  221   if (DomNm == 
"blog.myspace.com") {
 
  248   if (DomNm==
"blogs.msdn.com" || DomNm==
"ameblo.jp" || DomNm==
"xfruits.com" || DomNm==
"scienceblogs.com" || DomNm==
"blogs.sun.com" 
  249     || DomNm==
"blog.wired.com" || DomNm==
"weblogs.asp.net" || DomNm==
"blogs.technet.com" || DomNm==
"blogs.guardian.co" 
  250     || DomNm==
"blogs.clarin.com" || DomNm==
"blogs.sun.com" || DomNm==
"blog.wired.com" || DomNm==
"weblogs.asp.net" 
  251     || DomNm==
"blogs.technet.com" || DomNm==
"blogs.guardian.com" || DomNm==
"blogs.clarin.com" || DomNm==
"blogs.zdnet.com" 
  252     || DomNm==
"blogs.citypages.com" || DomNm==
"voices.washingtonpost.com" || DomNm==
"blog.tv2.dk" 
  253     || DomNm==
"blogs.menomoneefallsnow.com" || DomNm==
"weblogs.baltimoresun.com" || DomNm==
"eonline.com") {
 
  258   if (DomNm == 
"digg.com") {
 
  259     if (PostUrlStr.
IsPrefix(
"http://digg.com/submit?")) {
 
  260       const int Url = PostUrlStr.
SearchStr(
";url=");
 
  271   if (PostUrlStr.
IsPrefix(
"http://nydailynews.com/blogs/") || PostUrlStr.
IsPrefix(
"http://bbc.co.uk/blogs/")
 
  272     || PostUrlStr.
IsPrefix(
"http://nydailynews.com/blogs/") || PostUrlStr.
IsPrefix(
"http://newsbusters.org/blogs/")) {
 
  276   if (DomNm==
"feeds.feedburner.com") {
 
  280   if (DomNm==
"groups.google.com") {
 
  284   if (DomNm==
"news.google.com") { 
 
  285     const int UrlPos = PostUrlStr.
SearchStr(
"&url=");
 
  290   if (DomNm == 
"bloggrevyen.no") { 
 
  291     const int Http2 = PostUrlStr.
SearchStr(
"/http://");
 
  297   if (DomNm.
IsSuffix(
".rd.yahoo.com")) {
 
  298     const int Http2 = PostUrlStr.
SearchStr(
"/*");
 
  308   if (
StripEnd(UrlIn, 
"/", UrlOut)) {}
 
  309   else if (
StripEnd(UrlIn, 
"/index.html", UrlOut)) {}
 
  310   else if (
StripEnd(UrlIn, 
"/index.htm", UrlOut)) {}
 
  311   else if (
StripEnd(UrlIn, 
"/index.php", UrlOut)) {}
 
  319     if (UrlIn[0] != 
'/') { Out.
AddCh(
'/'); }
 
  324   if (UrlOut.
IsPrefix(
"http://www.")) {
 
  332   const int StrLen = Str.
Len();
 
  333   const int SearchStrLen = SearchStr.
Len();
 
  334   if (StrLen < SearchStrLen) { 
return false; }
 
  335   for (
int i = 0; i < SearchStrLen; i++) {
 
  336     if (Str[StrLen-i-1] != SearchStr[SearchStrLen-i-1]) { 
return false; }
 
  338   NewStr = Str.
GetSubStr(0, StrLen-SearchStrLen-1);
 
  343   if (LongStr.
Len() < MaxLen) { 
return LongStr; }
 
  351   char *b = (
char *) ChA.
CStr();
 
  353   if (*b == 0) { 
return TChA(); }
 
  361       OutChA += b;  OutChA.
AddCh(
' ');
 
  367   OutChA.DelLastCh();  OutChA.ToLc();
 
  373   char *b = (
char *) ChA.
CStr();
 
  375   if (*b == 0) { 
return TChA(); }
 
  380     while (*e && 
TCh::IsWs(*e)) { e++; ws=
true; }
 
  382     if (ws) { OutChA.
AddCh(
' '); ws=
false; }
 
  395   for (
const char *c = CStr; *c; c++) {
 
  406   for (
int w = 0; w < WrdV.
Len(); w++) {
 
  407     if (StopWordH.
IsKey(WrdV[w])) { SWordCnt++; }
 
  409   return WrdV.
Len() - SWordCnt;
 
  415   for (
char *c = (
char *) ChA.
CStr(); *c; c++) {
 
  416     if ((SplitOnWs && *c == 
' ') || (! SplitOnWs && ! 
TCh::IsAlNum(*c))) {
 
  428   for (
char *c = (
char *) ChA.
CStr(); *c; c++) {
 
  431       if (SkipEmpty && ! WrdV.
Empty() && strlen(WrdV.
Last()) == 0) { WrdV.
DelLast(); }
 
  435   if (SkipEmpty && ! WrdV.
Empty() && strlen(WrdV.
Last()) == 0) { WrdV.
DelLast(); }
 
  443   for (
char *c = (
char *) ChA.
CStr(); *c; c++) {
 
  445       if (c > ChA.
CStr() && *(c-1)==
'\r') { *(c-1)=0; } 
 
  448         if (IsChs) { LineV.
Add(c+1); }
 
  462   const char *B = ChA.
CStr();
 
  463   const char *E = B+ChA.
Len();
 
  464   char *c = (
char *) B;
 
  466   if (*c) { SentenceV.
Add(c); } 
else { 
return 0; }
 
  468     if (c<E && (*c == 
'.' || *c == 
'!' || *c == 
'?') && ! 
TCh::IsAlNum(*(c+1))) { 
 
  469       if (c<E && *(c+1)==
'"') { *c=
'"';  c++; } 
 
  470       if (c>=E) { 
continue; }
 
  473       while (e>B && *e!=
'"' && ! 
TCh::IsAlNum(*e)) { *e=0; e--; } 
 
  475       if (c<E) { SentenceV.
Add(c); }
 
  478   return SentenceV.
Len();
 
  499   StrB = (
char *) HtmlStr.
CStr();
 
  500   StrE = (
char *) StrB+HtmlStr.
Len(); 
 
  501   for (
char *e = StrB; e < StrE; ) {
 
  503     while (e<StrE && *e != 
'<') { e++; }
 
  506     TextStr+= b; TextStr.
AddCh(
' ');  *e = tmp;
 
  507     if (e >= StrE) { 
return; }
 
  509     if (e[1]==
'!' && e[2]==
'-' && e[3]==
'-') { 
 
  511       while(e<StrE && !(*(e-2)==
'-' && *(e-1)==
'-' && *e==
'>')) { e++; }
 
  515     if (e[1]==
's' && e[2]==
'c' && e[3]==
'r' && e[4]==
'i' && e[5]==
'p' && e[6]==
't') {
 
  517       while(e<StrE && !(*(e-6)==
's' && *(e-5)==
'c' && *(e-4)==
'r' && *(e-3)==
'i' && *(e-2)==
'p' && *(e-1)==
't' && *e==
'>')) { e++; }
 
  521     while (e < StrE && *e != '>
') { e++; } 
  522     if (e>=StrE) { return; } 
  527 bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) { 
  528   int AlNumCnt=0, ChCnt=0; 
  529   for (const char *c = Str.CStr(); *c; c++) { 
  530     if (TCh::IsWs(*c)) { continue; } 
  531     if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; } 
  534   if (double(AlNumCnt)/double(ChCnt) > MinAlFrac) { return true; } 
  538 void TStrUtil::GetWIdV(const TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) { 
  539   const int NotWId = -1; 
  543   TStrUtil::SplitWords(ChA, WrdV); 
  545   for (int w = 0; w < WrdV.Len(); w++) { 
  546     if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); } 
  547     else { WIdV.Add(NotWId); } 
  551 // add words to StrH and get a vector of word ids 
  552 void TStrUtil::GetAddWIdV(TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) { 
  556   TStrUtil::SplitWords(ChA, WrdV); 
  558   for (int w = 0; w < WrdV.Len(); w++) { 
  559     WIdV.Add(StrH.AddDatId(WrdV[w])); 
  563 // Parse time in various formats: 
  564 //   10:16, 16 Sep 2004 
  565 //   10:20, 2004 Sep 16 
  566 //   2005-07-07 20:30:35 
  567 //   23:24:07, 2005-07-10 
  569 //   21:16, July 9, 2005 
  570 //   06:02, 10 July 2005 
  571 bool TStrUtil::GetTmFromStr(const char* TmStr, TSecTm& Tm) { 
  572   static TStrV MonthV1, MonthV2; 
  573   if (MonthV1.Empty()) { 
  574     TStr("january|february|march|april|may|june|july|august|september|october|november|december").SplitOnAllCh('|
', MonthV1); 
  575     TStr("jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec").SplitOnAllCh('|
', MonthV2); 
  580   const char* End = Tmp.CStr()+Tmp.Len(); 
  581   int Col = -1, Cols=0; 
  582   for (char *b = Tmp.CStr(); b <End; ) { 
  584     while (*b && ! (*b==' ' || *b=='-
' || *b==':
' || *b==',
')) { b++; } 
  585     if (*b==':
') { if(Col==-1) { Col=WrdV.Len(); } Cols++;  } 
  587     while (*b && (*b==' ' || *b=='-
' || *b==':
' || *b==',
')) { b++; } 
  590     if (Col+1 >= WrdV.Len()) { return false; } 
  593   if (Col<1) { return false; } 
  594   const int Hr = atoi(WrdV[Col-1]); 
  595   const int Min = atoi(WrdV[Col]); 
  596   WrdV.Del(Col);  WrdV.Del(Col-1); 
  597   if (WrdV.Len() != 3) { return false; } 
  598   int y=0,m=1,d=2, Mon=-1; 
  599   if (TCh::IsAlpha(WrdV[0][0])) { 
  601   } else if (TCh::IsAlpha(WrdV[1][0])) { 
  603   } else if (TCh::IsAlpha(WrdV[2][0])) { 
  609   int Day = atoi(WrdV[d]); 
  610   if (Mon <= 0) { Mon = MonthV1.SearchForw(WrdV[m])+1; } 
  611   if (Mon <= 0) { Mon = MonthV2.SearchForw(WrdV[m])+1; } 
  612   if (Mon == 0) { return false; } 
  613   int Year = atoi(WrdV[y]); 
  614   if (Day > Year) { ::Swap(Day, Year); } 
  615   //printf("%d-%02d-%02d  %02d:%02d\n", Year, Mon, Day, Hr, Min); 
  616   Tm = TSecTm(Year, Mon, Day, Hr, Min, 0); 
  620 // Standardize first and last names into <last_name>_<first_name_initial> 
  621 TStr TStrUtil::GetStdName(TStr AuthorName) { 
  624   AuthorName.ChangeChAll('\n
', ' '); 
  625   AuthorName.ChangeChAll('.
', ' '); 
  626   // if there is a number in the name, remove it and everything after it 
  628   while (pos<AuthorName.Len() && (AuthorName[pos]!='#
' && !TCh::IsNum(AuthorName[pos]))) { 
  630   if (pos < AuthorName.Len()) { 
  631     AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); } 
  632   if (AuthorName.Empty()) { return TStr::GetNullStr(); } 
  634   // drop everything from '(' onward 
  635   int b = AuthorName.SearchCh('(
'); 
  637     AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); } 
  638   // skip if contains ')' 
  639   if (AuthorName .SearchCh(')
')!=-1) { return TStr::GetNullStr(); } 
  640   // skip if it is not a name 
  641   if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1 
  642    || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) { 
  643     return TStr::GetNullStr(); 
  645   // remove all non-letters (latex tags, ...) 
  647   for (i = 0; i < AuthorName.Len(); i++) { 
  648     const char Ch = AuthorName[i]; 
  649     if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-
') { NewName += Ch; } 
  651   StdName = NewName;  StdName.ToTrunc(); 
  652   TStrV AuthNmV; StdName.SplitOnWs(AuthNmV); 
  653   // too short -- not a name 
  654   if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast(); 
  655   if (AuthNmV.Len() < 2) return TStr::GetNullStr(); 
  657   const TStr LastNm = AuthNmV.Last(); 
  658   if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr(); 
  660   IAssert(isalpha(AuthNmV[0][0])); 
  661   return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]); 
  664 void TStrUtil::GetStdNameV(TStr AuthorNames, TStrV& StdNameV) { 
  665   AuthorNames.ChangeChAll('\n
', ' '); 
  667   // split into author names 
  668   TStrV AuthV, TmpV, Tmp2V; 
  670   AuthorNames.SplitOnStr(" and ", TmpV); 
  672   for (i = 0; i < TmpV.Len(); i++) { 
  673     TmpV[i].SplitOnAllCh(',
', Tmp2V);  AuthV.AddV(Tmp2V); } 
  675   TmpV = AuthV;  AuthV.Clr(); 
  676   for (i = 0; i < TmpV.Len(); i++) { 
  677     TmpV[i].SplitOnAllCh('&
', Tmp2V);  AuthV.AddV(Tmp2V); } 
  679   TmpV = AuthV;  AuthV.Clr(); 
  680   for (i = 0; i < TmpV.Len(); i++) { 
  681     TmpV[i].SplitOnAllCh(',
', Tmp2V);  AuthV.AddV(Tmp2V); } 
  683   TmpV = AuthV;  AuthV.Clr(); 
  684   for (i = 0; i < TmpV.Len(); i++) { 
  685     TmpV[i].SplitOnAllCh(';
', Tmp2V);  AuthV.AddV(Tmp2V); } 
  688   //printf("\n*** %s\n", AuthorNames.CStr()); 
  689   for (i = 0; i < AuthV.Len(); i++) { 
  690     TStr StdName = GetStdName(AuthV[i]); 
  691     if (! StdName.Empty()) { 
  692       //printf("\t%s  ==>  %s\n", AuthV[i].CStr(), StdName.CStr()); 
  693       StdNameV.Add(StdName); 
  701 double TStopwatch::Tick() { 
  703   //return clock() / ((double)CLOCKS_PER_SEC); 
  707   return omp_get_wtime(); 
  713   return GetTickCount() / 1000.0; 
  717   struct rusage rusage; 
  719   getrusage(RUSAGE_SELF, &rusage); 
  725   ((float) (rusage.ru_utime.tv_usec + rusage.ru_stime.tv_usec) / 1000000) + 
  727   ((float) (rusage.ru_utime.tv_sec + rusage.ru_stime.tv_sec)); 
  733 void TStopwatch::Start(const TExperiment Exp) { 
  734   Starts[Exp] = Tick(); 
  737 void TStopwatch::Stop(const TExperiment Exp) { 
  738   double Duration = Tick() - Starts[Exp]; 
  739   Sums[Exp] += Duration; 
  740   Maxs[Exp] = Maxs[Exp] >= Duration ? Maxs[Exp] : Duration; 
  741   Mins[Exp] = Mins[Exp] <= Duration ? Mins[Exp] : Duration; 
  745 int TStopwatch::Cnt(const TExperiment Exp) const { 
  749 double TStopwatch::Sum(const TExperiment Exp) const { 
  753 double TStopwatch::Avg(const TExperiment Exp) const { 
  754   return Sums[Exp] / Cnts[Exp]; 
  757 double TStopwatch::Max(const TExperiment Exp) const { 
  761 double TStopwatch::Min(const TExperiment Exp) const { 
  768 #if defined(SW_WRITEN) 
  769 int WriteN(int fd, char *ptr, int nbytes) { 
  775     nwritten = (int) write(fd, ptr, nleft); 
  782   return (nbytes-nleft); 
static TChA GetDomNm(const TChA &UrlChA)
 
static bool GetNormalizedUrl(const TChA &UrlIn, const TChA &BaseUrl, TChA &UrlOut)
Quick URL normalization: remove a trailing /, /index.html, etc. and strip a leading www. 
 
static void MakeExpBins(const TFltPrV &XYValV, TFltPrV &ExpXYValV, const double &BinFactor=2, const double &MinYVal=1)
 
static TChA GetWebsiteNm(const TChA &UrlChA)
 
static TChA GetDomNm2(const TChA &UrlChA)
 
void AddCh(const char &Ch, const int &MxLen=-1)
 
TSizeTy Len() const 
Returns the number of elements in the vector. 
 
static int SplitSentences(TChA &ChA, TVec< char * > &SentenceV)
 
static void GetXmlTagNmVal(TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal)
 
static void MakeExpBins(const TFltPrV &XYValV, TFltPrV &ExpXYValV, const double &BinFactor=2, const double &MinYVal=1)
 
int SearchStr(const TChA &Str, const int &BChN=0) const 
 
static void RemoveHtmlTags(const TChA &HtmlStr, TChA &TextStr)
 
static bool GetXmlTagNmVal2(TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal, const bool &TakeTagNms)
 
bool Empty() const 
Tests whether the vector is empty. 
 
static void GetPdf(const TIntPrV &CdfV, TIntPrV &PdfV)
 
static TChA GetShorStr(const TChA &LongStr, const int MaxLen=50)
 
int SearchChBack(const char &Ch, int BChN=-1) const 
 
static bool IsWs(const char &Ch)
 
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector. 
 
static int SplitLines(TChA &ChA, TVec< char * > &LineV, const bool &SkipEmpty=false)
 
bool IsKey(const char *Key) const 
 
bool IsPrefix(const char *CStr, const int &BChN=0) const 
 
static int CountWords(const char *CStr)
 
static int SplitOnCh(TChA &ChA, TVec< char * > &WrdV, const char &Ch, const bool &SkipEmpty=false)
 
static int SplitWords(TChA &ChA, TVec< char * > &WrdV, const bool &SplitOnWs=true)
 
static TChA GetCleanWrdStr(const TChA &ChA)
 
TChA GetSubStr(const int &BChN, const int &EChN) const 
 
const TVal & Last() const 
Returns a reference to the last element of the vector. 
 
static TChA & GetXmlTagVal(TXmlLx &XmlLx, const TChA &TagNm)
 
static void GetCdf(const TIntPrV &PdfV, TIntPrV &CdfV)
 
static void GetCCdf(const TIntPrV &PdfV, TIntPrV &CCdfV)
 
static bool IsAlNum(const char &Ch)
 
int SearchCh(const char &Ch, const int &BChN=0) const 
 
bool IsSuffix(const char *CStr) const 
 
static void Normalize(TFltPrV &PdfV)
 
#define EAssertR(Cond, MsgStr)
 
int GetNthOccurence(const TChA &Url, const int &Count, const char Ch='/')
 
static TChA GetCleanStr(const TChA &ChA)
 
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element. 
 
void DelLast()
Removes the last element of the vector. 
 
static bool StripEnd(const TChA &Str, const TChA &SearchStr, TChA &NewStr)