SNAP Library 2.2, Developer Reference
2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 #include "bd.h" 00002 00004 // Forward 00005 ClassHdTP(THtmlTok, PHtmlTok) 00006 ClassHdTP(THtmlDoc, PHtmlDoc) 00007 00009 // Html-Lexical-Chars 00010 typedef enum { 00011 hlctSpace, hlctAlpha, hlctNum, hlctSym, 00012 hlctLTag, hlctRTag, hlctEof} THtmlLxChTy; 00013 00014 ClassTP(THtmlLxChDef, PHtmlLxChDef)//{ 00015 private: 00016 TIntV ChTyV; 00017 TChV UcChV; 00018 TChV LcChV; 00019 TStrStrH EscStrH; 00020 void SetUcCh(const char& UcCh, const char& LcCh); 00021 void SetUcCh(const TStr& Str); 00022 void SetChTy(const THtmlLxChTy& ChTy, const TStr& Str); 00023 void SetEscStr(const TStr& SrcStr, const TStr& DstStr); 00024 public: 00025 THtmlLxChDef(); 00026 THtmlLxChDef(TSIn& SIn): ChTyV(SIn), UcChV(SIn), LcChV(SIn), EscStrH(SIn){} 00027 static PHtmlLxChDef Load(TSIn& SIn){return new THtmlLxChDef(SIn);} 00028 void Save(TSOut& SOut){ 00029 ChTyV.Save(SOut); UcChV.Save(SOut); LcChV.Save(SOut); EscStrH.Save(SOut);} 00030 00031 THtmlLxChDef& operator=(const THtmlLxChDef&){Fail; return *this;} 00032 00033 // character type operations 00034 int GetChTy(const char& Ch) const {return ChTyV[Ch-TCh::Mn];} 00035 bool IsEoln(const char& Ch) const {return (Ch==TCh::CrCh)||(Ch==TCh::LfCh);} 00036 bool IsWs(const char& Ch) const { 00037 return (Ch==' ')||(Ch==TCh::TabCh)||(Ch==TCh::CrCh)||(Ch==TCh::LfCh);} 00038 bool IsSpace(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctSpace;} 00039 bool IsAlpha(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctAlpha;} 00040 bool IsNum(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctNum;} 00041 bool IsAlNum(const char& Ch) const { 00042 return (int(ChTyV[Ch-TCh::Mn])==hlctAlpha)||(int(ChTyV[Ch-TCh::Mn])==hlctNum);} 00043 bool IsSym(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctSym;} 00044 bool IsUrl(const char& Ch) const { 00045 int ChTy=ChTyV[Ch-TCh::Mn]; 00046 return (ChTy==hlctAlpha)||(ChTy==hlctNum)|| 00047 (Ch=='.')||(Ch=='-')||(Ch==':')||(Ch=='/')||(Ch=='~');} 00048 00049 // upper/lower-case & escape-string operations 00050 bool IsUc(const char& Ch) const {return Ch==UcChV[Ch-TCh::Mn];} 00051 bool IsLc(const char& Ch) const {return Ch==LcChV[Ch-TCh::Mn];} 00052 char GetUc(const char& Ch) const {return UcChV[Ch-TCh::Mn];} 00053 char GetLc(const char& Ch) const {return LcChV[Ch-TCh::Mn];} 00054 void GetUcChA(TChA& ChA) const { 00055 for (int ChN=0; ChN<ChA.Len(); ChN++){ChA.PutCh(ChN, GetUc(ChA[ChN]));}} 00056 void GetLcChA(TChA& ChA) const { 00057 for (int ChN=0; ChN<ChA.Len(); ChN++){ChA.PutCh(ChN, GetLc(ChA[ChN]));}} 00058 TStr GetUcStr(const TStr& Str) const { 00059 TChA ChA(Str); GetUcChA(ChA); return ChA;} 00060 TStr GetLcStr(const TStr& Str) const { 00061 TChA ChA(Str); GetLcChA(ChA); return ChA;} 00062 TStr GetEscStr(const TStr& Str) const; 00063 00064 // standard entry points 00065 static PHtmlLxChDef ChDef; 00066 static PHtmlLxChDef GetChDef(){IAssert(!ChDef.Empty()); return ChDef;} 00067 static THtmlLxChDef& GetChDefRef(){IAssert(!ChDef.Empty()); return *ChDef;} 00068 00069 // character-set transformations 00070 static TStr GetCSZFromYuascii(const TChA& ChA); 00071 static TStr GetCSZFromWin1250(const TChA& ChA); 00072 static TStr GetWin1250FromYuascii(const TChA& ChA); 00073 static TStr GetIsoCeFromYuascii(const TChA& ChA); 00074 }; 00075 00077 // Html-Lexical 00078 typedef enum { 00079 hsyUndef, hsyStr, hsyNum, hsySSym, hsyUrl, 00080 hsyBTag, hsyETag, hsyMTag, hsyEof} THtmlLxSym; 00081 00082 class THtmlLx{ 00083 private: 00084 static THtmlLxChDef ChDef; 00085 PSIn SIn; 00086 TSIn& RSIn; 00087 bool DoParseArg; 00088 TChA ChStack; 00089 char Ch; 00090 int ChX; 00091 bool EscCh; 00092 TChA EscChA; 00093 TChA ArgNm; 00094 TChA ArgVal; 00095 void GetCh(){ 00096 if (ChStack.Empty()){ 00097 if (RSIn.Eof()){Ch=TCh::EofCh;} else {Ch=RSIn.GetCh(); ChX++;} 00098 } else { 00099 Ch=ChStack.Pop(); ChX++; 00100 } 00101 SymChA+=Ch; 00102 } 00103 void GetEscCh(); 00104 void GetMetaTag(); 00105 void GetTag(); 00106 public: 00107 THtmlLxSym Sym; 00108 int SymBChX, SymEChX; 00109 TChA ChA; 00110 TChA UcChA; 00111 TChA SymChA; 00112 int PreSpaces; 00113 TChA PreSpaceChA; 00114 typedef TStrKdV TArgNmValV; 00115 TArgNmValV ArgNmValV; 00116 public: 00117 THtmlLx(const PSIn& _SIn, const bool& _DoParseArg=true): 00118 SIn(_SIn), RSIn(*SIn), DoParseArg(_DoParseArg), 00119 ChStack(), Ch(' '), ChX(0), EscCh(false), 00120 EscChA(), ArgNm(), ArgVal(), 00121 Sym(hsyUndef), SymBChX(0), SymEChX(0), ChA(), UcChA(), 00122 PreSpaces(0), PreSpaceChA(), ArgNmValV(){} 00123 00124 THtmlLx& operator=(const THtmlLx&){Fail; return *this;} 00125 00126 void PutCh(const char& _Ch){ 00127 ChStack.Push(Ch); if (!SymChA.Empty()){SymChA.Pop();} Ch=_Ch; ChX--;} 00128 void PutStr(const TStr& Str){ 00129 for (int ChN=Str.Len()-1; ChN>=0; ChN--){PutCh(Str[ChN]);}} 00130 THtmlLxSym GetSym(); 00131 PHtmlTok GetTok(const bool& DoUc=true); 00132 TStr GetPreSpaceStr() const { 00133 return TStr::GetSpaceStr(PreSpaces);} 00134 00135 int GetArgs() const {return ArgNmValV.Len();} 00136 TStr GetArgNm(const int& ArgN) const {return ArgNmValV[ArgN].Key;} 00137 TStr GetArgVal(const int& ArgN) const {return ArgNmValV[ArgN].Dat;} 00138 bool IsArg(const TStr& ArgNm) const {return ArgNmValV.IsIn(TStrKd(ArgNm));} 00139 TStr GetArg(const TStr& ArgNm, const TStr& DfArgVal=TStr()) const { 00140 int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm)); 00141 if (ArgN==-1){return DfArgVal;} else {return ArgNmValV[ArgN].Dat;}} 00142 void PutArg(const TStr& ArgNm, const TStr& ArgVal){ 00143 int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm)); 00144 if (ArgN==-1){ArgNmValV.Add(TStrKd(ArgNm, ArgVal));} 00145 else {ArgNmValV[ArgN]=TStrKd(ArgNm, ArgVal);}} 00146 TStr GetFullBTagStr() const; 00147 00148 void MoveToStrOrEof(const TStr& Str); 00149 void MoveToBTagOrEof(const TStr& TagNm); 00150 void MoveToBTag2OrEof(const TStr& TagNm1, const TStr& TagNm2); 00151 void MoveToBTag3OrEof(const TStr& TagNm1, const TStr& TagNm2, const TStr& TagNm3); 00152 void MoveToBTagOrETagOrEof(const TStr& BTagNm, const TStr& ETagNm); 00153 void MoveToBTagArgOrEof( 00154 const TStr& TagNm, const TStr& ArgNm, const TStr& ArgVal); 00155 void MoveToBTagArg2OrEof(const TStr& TagNm, 00156 const TStr& ArgNm1, const TStr& ArgVal1, 00157 const TStr& ArgNm2, const TStr& ArgVal2, const bool& AndOpP=true); 00158 void MoveToBTagOrEof( 00159 const TStr& TagNm1, const TStr& ArgNm1, const TStr& ArgVal1, 00160 const TStr& TagNm2, const TStr& ArgNm2, const TStr& ArgVal2); 00161 void MoveToETagOrEof(const TStr& TagNm); 00162 TStr GetTextOnlyStrToEof(); 00163 TStr GetStrToBTag(const TStr& TagNm, const bool& TxtOnlyP=false); 00164 TStr GetStrToBTag(const TStr& TagNm, const TStr& ArgNm, 00165 const TStr& ArgVal, const bool& TxtOnlyP=false); 00166 TStr GetStrToETag(const TStr& TagNm, const bool& TxtOnlyP=false); 00167 TStr GetStrToETag2(const TStr& TagNm1, const TStr& TagNm2, const bool& TxtOnlyP=false); 00168 TStr GetStrInTag(const TStr& TagNm, const bool& TxtOnlyP=false); 00169 TStr GetHRefBeforeStr(const TStr& Str); 00170 bool IsGetBTag(const TStr& TagNm); 00171 bool IsGetETag(const TStr& TagNm); 00172 00173 static TStr GetSymStr(const THtmlLxSym& Sym); 00174 static TStr GetEscapedStr(const TChA& ChA); 00175 static TStr GetAsciiStr(const TChA& ChA, const char& GenericCh='_'); 00176 static void GetTokStrV(const TStr& Str, TStrV& TokStrV); 00177 static TStr GetNoTag(const TStr& Str); 00178 }; 00179 00181 // Html-Token 00182 ClassTPV(THtmlTok, PHtmlTok, THtmlTokV)//{ 00183 private: 00184 THtmlLxSym Sym; 00185 TStr Str; 00186 THtmlLx::TArgNmValV ArgNmValV; 00187 public: 00188 THtmlTok(): Sym(hsyUndef), Str(), ArgNmValV(){} 00189 THtmlTok(const THtmlLxSym& _Sym): 00190 Sym(_Sym), Str(), ArgNmValV(){} 00191 THtmlTok(const THtmlLxSym& _Sym, const TStr& _Str): 00192 Sym(_Sym), Str(_Str), ArgNmValV(){} 00193 THtmlTok(const THtmlLxSym& _Sym, const TStr& _Str, 00194 const THtmlLx::TArgNmValV& _ArgNmValV): 00195 Sym(_Sym), Str(_Str), ArgNmValV(_ArgNmValV){} 00196 THtmlTok(TSIn&){Fail;} 00197 static PHtmlTok Load(TSIn&){Fail; return NULL;} 00198 void Save(TSOut&){Fail;} 00199 00200 THtmlTok& operator=(const THtmlTok&){Fail; return *this;} 00201 00202 THtmlLxSym GetSym() const {return Sym;} 00203 TStr GetStr() const {return Str;} 00204 TStr GetFullStr() const; 00205 bool IsArg(const TStr& ArgNm) const { 00206 return ArgNmValV.SearchForw(TStrKd(ArgNm))!=-1;} 00207 TStr GetArg(const TStr& ArgNm) const { 00208 return ArgNmValV[ArgNmValV.SearchForw(TStrKd(ArgNm))].Dat;} 00209 TStr GetArg(const TStr& ArgNm, const TStr& DfArgVal) const { 00210 int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm)); 00211 if (ArgN==-1){return DfArgVal;} else {return ArgNmValV[ArgN].Dat;}} 00212 bool IsUrlTok(TStr& RelUrlStr) const; 00213 bool IsRedirUrlTok() const; 00214 00215 void SaveTxt(const PSOut& SOut, const bool& TxtMode=true); 00216 00217 static const TStr ATagNm; 00218 static const TStr AreaTagNm; 00219 static const TStr BrTagNm; 00220 static const TStr CardTagNm; 00221 static const TStr CenterTagNm; 00222 static const TStr FrameTagNm; 00223 static const TStr H1TagNm; 00224 static const TStr H2TagNm; 00225 static const TStr H3TagNm; 00226 static const TStr H4TagNm; 00227 static const TStr H5TagNm; 00228 static const TStr H6TagNm; 00229 static const TStr ImgTagNm; 00230 static const TStr LiTagNm; 00231 static const TStr MetaTagNm; 00232 static const TStr PTagNm; 00233 static const TStr UlTagNm; 00234 static const TStr TitleTagNm; 00235 static const TStr TitleETagNm; 00236 00237 static const TStr AltArgNm; 00238 static const TStr HRefArgNm; 00239 static const TStr SrcArgNm; 00240 static const TStr TitleArgNm; 00241 static const TStr HttpEquivArgNm; 00242 00243 static bool IsBreakTag(const TStr& TagNm); 00244 static bool IsBreakTok(const PHtmlTok& Tok); 00245 static bool IsHTag(const TStr& TagNm, int& HTagN); 00246 static PHtmlTok GetHTok(const bool& IsBTag, const int& HTagN); 00247 }; 00248 00250 // Html-Document 00251 typedef enum { 00252 hdtAll, hdtStr, hdtStrNum, hdtTag, hdtA, hdtHRef, hdtUL} THtmlDocType; 00253 00254 ClassTPV(THtmlDoc, PHtmlDoc, THtmlDocV)//{ 00255 private: 00256 THtmlTokV TokV; 00257 public: 00258 THtmlDoc(): TokV(){} 00259 THtmlDoc( 00260 const PSIn& SIn, const THtmlDocType& Type=hdtAll, const bool& DoUc=true); 00261 static PHtmlDoc New( 00262 const PSIn& SIn, const THtmlDocType& Type=hdtAll, const bool& DoUc=true){ 00263 return PHtmlDoc(new THtmlDoc(SIn, Type, DoUc));} 00264 THtmlDoc(TSIn&){Fail;} 00265 static PHtmlDoc Load(TSIn&){Fail; return NULL;} 00266 void Save(TSOut&){Fail;} 00267 00268 THtmlDoc& operator=(const THtmlDoc&){Fail; return *this;} 00269 00270 int GetToks() const {return TokV.Len();} 00271 PHtmlTok GetTok(const int& TokN) const {return TokV[TokN];} 00272 PHtmlTok GetTok(const int& TokN, THtmlLxSym& Sym, TStr& Str) const { 00273 Sym=TokV[TokN]->GetSym(); Str=TokV[TokN]->GetStr(); return TokV[TokN];} 00274 void AddTokV(const THtmlTokV& _TokV){TokV.AddV(_TokV);} 00275 00276 static TStr GetTxtLnDoc(const TStr& HtmlStr); 00277 static TStr GetTxtLnDoc(const TStr& HtmlStr, const TStr& BaseUrlStr, 00278 const bool& OutUrlP, const bool& OutTagsP); 00279 00280 static PHtmlDoc LoadTxt( 00281 const TStr& FNm, const THtmlDocType& Type=hdtAll, const bool& DoUc=true){ 00282 PSIn SIn=TFIn::New(FNm); return PHtmlDoc(new THtmlDoc(SIn, Type, DoUc));} 00283 void SaveTxt(const PSOut& SOut, const bool& TxtMode=true) const; 00284 00285 static void SaveHtmlToTxt( 00286 const TStr& HtmlStr, const PSOut& TxtSOut, const TStr& BaseUrlStr, 00287 const bool& OutUrlP, const bool& OutToksP); 00288 static void SaveHtmlToTxt( 00289 const TStr& HtmlStr, const TStr& TxtFNm, const TStr& BaseUrlStr, 00290 const bool& OutUrlP, const bool& OutToksP); 00291 static void SaveHtmlToXml( 00292 const TStr& HtmlStr, const PSOut& XmlSOut, const TStr& BaseUrlStr, 00293 const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP, 00294 const bool& OutTagsP, const bool& OutArgsP); 00295 static void SaveHtmlToXml( 00296 const TStr& HtmlStr, const TStr& XmlFNm, const TStr& BaseUrlStr, 00297 const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP, 00298 const bool& OutTagsP, const bool& OutArgsP); 00299 00300 static TLxSym GetLxSym(const THtmlLxSym& HtmlLxSym, const TChA& ChA); 00301 00302 static bool _IsTagRedir( 00303 const TStr& TagStr, const TStr& ArgNm, THtmlLx& Lx, 00304 const TStr& BaseUrlStr, const TStr& RedirUrlStr); 00305 static TStr GetRedirHtmlDocStr(const TStr& HtmlStr, 00306 const TStr& BaseUrlStr, const TStr& RedirUrlStr); 00307 }; 00308 00310 // Html-Hyper-Link-Document-Vector 00311 ClassTP(THtmlHldV, PHtmlHldV)//{ 00312 private: 00313 PHtmlDoc RefHtmlDoc; 00314 THtmlDocV HldV; 00315 public: 00316 THtmlHldV(const PHtmlDoc& _RefHtmlDoc, const int& HldWnLen=10); 00317 THtmlHldV(TSIn&){Fail;} 00318 static PHtmlHldV Load(TSIn&){Fail; return NULL;} 00319 void Save(TSOut&){Fail;} 00320 00321 THtmlHldV& operator=(const THtmlHldV&){Fail; return *this;} 00322 00323 PHtmlDoc GetRefHtmlDoc(){return RefHtmlDoc;} 00324 int GetHlds(){return HldV.Len();} 00325 PHtmlDoc GetHld(const int& HldN){return HldV[HldN];} 00326 }; 00327 00329 // Web-Page 00330 ClassTPV(TWebPg, PWebPg, TWebPgV)//{ 00331 private: 00332 TStrV UrlStrV; 00333 TStrV IpNumV; 00334 PHttpResp HttpResp; 00335 uint64 FetchMSecs; 00336 public: 00337 TWebPg(): UrlStrV(), IpNumV(), HttpResp(){} 00338 TWebPg(const TStrV& _UrlStrV, const TStrV& _IpNumV, const PHttpResp& _HttpResp): 00339 UrlStrV(_UrlStrV), IpNumV(_IpNumV), HttpResp(_HttpResp){} 00340 static PWebPg New(const TStrV& UrlStrV, const TStrV& IpNumV, const PHttpResp& HttpResp){ 00341 return new TWebPg(UrlStrV, IpNumV, HttpResp);} 00342 static PWebPg New(const TStrV& UrlStrV, const PHttpResp& HttpResp){ 00343 return new TWebPg(UrlStrV, TStrV(), HttpResp);} 00344 static PWebPg New(const TStr& UrlStr, const PHttpResp& HttpResp){ 00345 TStrV UrlStrV; UrlStrV.Add(UrlStr); 00346 return new TWebPg(UrlStrV, TStrV(), HttpResp);} 00347 ~TWebPg(){} 00348 TWebPg(TSIn&){Fail;} 00349 static PWebPg Load(TSIn&){Fail; return NULL;} 00350 void Save(TSOut&){Fail;} 00351 00352 TWebPg& operator=(const TWebPg&){Fail; return *this;} 00353 00354 int GetUrls() const {return UrlStrV.Len();} 00355 TStr GetUrlStr(const int& UrlN=-1) const { 00356 if (UrlN==-1){return UrlStrV.Last();} else {return UrlStrV[UrlN];}} 00357 PUrl GetUrl(const int& UrlN=-1) const { 00358 TStr UrlStr; 00359 if (UrlN==-1){UrlStr=UrlStrV.Last();} else {UrlStr=UrlStrV[UrlN];} 00360 return TUrl::New(UrlStr);} 00361 00362 int GetIps() const {return IpNumV.Len();} 00363 TStr GetIpNum(const int& IpN=-1) const { 00364 if (IpN==-1){return IpNumV.Last();} else {return IpNumV[IpN];}} 00365 00366 PHttpResp GetHttpResp() const {return HttpResp;} 00367 TStr GetHttpHdStr() const {return GetHttpResp()->GetHdStr();} 00368 TStr GetHttpBodyAsStr() const {return GetHttpResp()->GetBodyAsStr();} 00369 //void GetOutUrlStrV(TStrV& OutUrlStrV) const; 00370 void GetOutUrlV(TUrlV& OutUrlV, TUrlV& OutRedirUrlV) const; 00371 void GetOutUrlV(TUrlV& OutUrlV) const { 00372 TUrlV OutRedirUrlV; GetOutUrlV(OutUrlV, OutRedirUrlV);} 00373 void GetOutDescUrlStrKdV(TStrKdV& OutDescUrlStrKdV) const; 00374 00375 // fetch time 00376 void PutFetchMSecs(const uint64& _FetchMSecs){FetchMSecs=_FetchMSecs;} 00377 uint64 GetFetchMSecs() const {return FetchMSecs;} 00378 00379 void SaveAsHttpBody(const TStr& FNm) const; 00380 void SaveAsHttp(const TStr& FNm) const; 00381 00382 bool IsTxt() const; 00383 };