SNAP Library 2.2, Developer Reference
2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 00002 // Html-Lexical-Chars 00003 void THtmlLxChDef::SetUcCh(const char& UcCh, const char& LcCh){ 00004 // update upper-case (more lower cases may have one upper case) 00005 IAssert( 00006 (UcChV[LcCh-TCh::Mn]==TCh(0))|| 00007 (UcChV[LcCh-TCh::Mn]==TCh(LcCh))); 00008 UcChV[LcCh-TCh::Mn]=TCh(UcCh); 00009 // update lower-case (one upper case may have only one lower case) 00010 if ((LcChV[UcCh-TCh::Mn]==TCh(0))||(LcChV[UcCh-TCh::Mn]==TCh(UcCh))){ 00011 LcChV[UcCh-TCh::Mn]=TCh(LcCh); 00012 } 00013 } 00014 00015 void THtmlLxChDef::SetUcCh(const TStr& Str){ 00016 // set type of characters as letters 00017 SetChTy(hlctAlpha, Str); 00018 // first char in string is upper-case, rest are lower-case 00019 for (int ChN=1; ChN<Str.Len(); ChN++){ 00020 SetUcCh(Str[0], Str[ChN]); 00021 } 00022 } 00023 00024 void THtmlLxChDef::SetChTy(const THtmlLxChTy& ChTy, const TStr& Str){ 00025 for (int ChN=0; ChN<Str.Len(); ChN++){ 00026 ChTyV[Str[ChN]-TCh::Mn]=TInt(ChTy);} 00027 } 00028 00029 void THtmlLxChDef::SetEscStr(const TStr& SrcStr, const TStr& DstStr){ 00030 EscStrH.AddDat(SrcStr, DstStr); 00031 } 00032 00033 TStr THtmlLxChDef::GetEscStr(const TStr& Str) const { 00034 int EscStrId; 00035 if ((EscStrId=EscStrH.GetKeyId(Str))!=-1){ 00036 return EscStrH[EscStrId]; 00037 } else 00038 if ((Str.Len()>=2)&&(Str[0]=='&')&&(Str[1]=='#')){ 00039 int ChCd=0; 00040 for (int ChN=2; ChN<Str.Len(); ChN++){ 00041 if (ChCd<=0xFFFF){ChCd=ChCd*10+Str[ChN]-'0';}} 00042 return TStr((char)ChCd); 00043 } else { 00044 return TStr(' '); 00045 } 00046 } 00047 00048 THtmlLxChDef::THtmlLxChDef(): 00049 ChTyV(TCh::Vals), UcChV(TCh::Vals), LcChV(TCh::Vals), EscStrH(100){ 00050 00051 // Character-Types 00052 ChTyV.PutAll(TInt(hlctSpace)); 00053 SetChTy(hlctAlpha, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"); 00054 SetChTy(hlctAlpha, "abcdefghijklmnopqrstuvwxyz"); 00055 SetChTy(hlctAlpha, "@_"); 00056 SetChTy(hlctNum, "0123456789"); 00057 SetChTy(hlctSym, "`~!#$%^&*()-=+[{]}\\|;:'\",<.>/?"); 00058 SetChTy(hlctLTag, "<"); SetChTy(hlctRTag, ">"); 00059 SetChTy(hlctEof, TStr(TCh::EofCh)); 00060 for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){ 00061 if ((Ch<0)||(127<Ch)){SetChTy(hlctAlpha, TStr(TCh(char(Ch))));}} 00062 //SetChTy(hlctSpace, TStr(TCh(char(160)))); 00063 00064 // Upper-Case 00065 {for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){ 00066 SetUcCh(char(Ch), char(Ch));}} 00067 SetUcCh("Aa"); SetUcCh("\xc0\xe0"); SetUcCh("\xc1\xe1"); SetUcCh("\xc2\xe2"); 00068 SetUcCh("\xc3\xe3"); SetUcCh("\xc4\xe4"); SetUcCh("\xc5\xe5"); SetUcCh("\xc6\xe6"); 00069 SetUcCh("Bb"); SetUcCh("Cc"); SetUcCh("\xc7\xe7"); SetUcCh("Dd"); 00070 SetUcCh("\xd0\xf0"); SetUcCh("Ee"); SetUcCh("\xc8\xe8"); SetUcCh("\xc9\xe9"); 00071 SetUcCh("\xca\xea"); SetUcCh("\xcb\xeb"); SetUcCh("Ff"); SetUcCh("Gg"); 00072 SetUcCh("Hh"); SetUcCh("Ii"); SetUcCh("\xcc\xec"); SetUcCh("\xcd\xed"); 00073 SetUcCh("\xce\xee"); SetUcCh("\xcf\xef"); SetUcCh("Jj"); SetUcCh("Kk"); 00074 SetUcCh("Ll"); SetUcCh("Mm"); SetUcCh("Nn"); SetUcCh("\xd1\xf1"); 00075 SetUcCh("Oo"); SetUcCh("\xd2\xf2"); SetUcCh("\xd3\xf3"); SetUcCh("\xd4\xf4"); 00076 SetUcCh("\xd5\xf5"); SetUcCh("\xd6\xf6"); SetUcCh("\xd8\xf8"); SetUcCh("Pp"); 00077 SetUcCh("Qq"); SetUcCh("Rr"); SetUcCh("Ss"); SetUcCh("\x8a\x9a"); 00078 SetUcCh("Tt"); SetUcCh("Uu"); SetUcCh("\xd9\xf9"); SetUcCh("\xda\xfa"); 00079 SetUcCh("\xdb\xfb"); SetUcCh("\xdc\xfc"); SetUcCh("Vv"); SetUcCh("Ww"); 00080 SetUcCh("Xx"); SetUcCh("Yy\xff"); SetUcCh("\xdd\xfd"); SetUcCh("Zz"); 00081 SetUcCh("\x8e\x9e"); 00082 // ISO-CE 00083 //SetUcCh(uchar(169), uchar(185)); /*Sh - \xa9\xb9*/ 00084 //SetUcCh(uchar(174), uchar(190)); /*Zh - \xae\xbe*/ 00085 //SetUcCh(uchar(200), uchar(232)); /*Ch - \xc8\xe8*/ 00086 //SetUcCh(uchar(198), uchar(230)); /*Cs - \xc6\xe6*/ 00087 //SetUcCh(uchar(208), uchar(240)); /*Dz - \xd0\xf0*/ 00088 00089 // Annoying Unicode-characters 00090 //SetChTy(hlctSpace, "\xc2\xef"); 00091 00092 // Escape-Sequences 00093 SetEscStr(""", "\""); SetEscStr("&", "&"); 00094 SetEscStr("<", "<"); SetEscStr(">", ">"); 00095 SetEscStr(" ", " "); 00096 00097 SetEscStr("ä", "\xe4"); SetEscStr("Ä", "\xc4"); 00098 SetEscStr("ö", "\xf6"); SetEscStr("Ö", "\xd6"); 00099 SetEscStr("ü", "\xfc"); SetEscStr("Ü", "\xdc"); 00100 SetEscStr("å", "\xe5"); SetEscStr("Å", "\xc5"); 00101 SetEscStr("ø", "\xf8"); SetEscStr("Ø", "\xd8"); 00102 SetEscStr("&Aelig", "\xc6"); SetEscStr("æ", "\xe6"); 00103 00104 SetEscStr("é", "e"); SetEscStr("É", "E"); 00105 SetEscStr("è", "e"); SetEscStr("È", "E"); 00106 SetEscStr("à", "a"); SetEscStr("À", "A"); 00107 } 00108 00109 PHtmlLxChDef THtmlLxChDef::ChDef=PHtmlLxChDef(new THtmlLxChDef()); 00110 00111 TStr THtmlLxChDef::GetCSZFromYuascii(const TChA& ChA){ 00112 TChA DstChA; 00113 for (int ChN=0; ChN<ChA.Len(); ChN++){ 00114 char Ch=ChA[ChN]; 00115 switch (Ch){ 00116 case '~': DstChA+='c'; break; 00117 case '^': DstChA+='C'; break; 00118 case '}': DstChA+='c'; break; 00119 case ']': DstChA+='C'; break; 00120 case '|': DstChA+='d'; break; 00121 case '\\': DstChA+='D'; break; 00122 case '{': DstChA+='s'; break; 00123 case '[': DstChA+='S'; break; 00124 case '`': DstChA+='z'; break; 00125 case '@': DstChA+='Z'; break; 00126 default: DstChA+=Ch; 00127 } 00128 } 00129 return DstChA; 00130 } 00131 00132 TStr THtmlLxChDef::GetCSZFromWin1250(const TChA& ChA){ 00133 TChA DstChA; 00134 for (int ChN=0; ChN<ChA.Len(); ChN++){ 00135 const uchar Ch=ChA[ChN]; 00136 switch (Ch){ 00137 case 232: DstChA+='c'; break; 00138 case 200: DstChA+='C'; break; 00139 case 154: DstChA+='s'; break; 00140 case 138: DstChA+='S'; break; 00141 case 158: DstChA+='z'; break; 00142 case 142: DstChA+='Z'; break; 00143 default: DstChA+=Ch; 00144 } 00145 } 00146 return DstChA; 00147 } 00148 00149 TStr THtmlLxChDef::GetWin1250FromYuascii(const TChA& ChA){ 00150 TChA DstChA; 00151 for (int ChN=0; ChN<ChA.Len(); ChN++){ 00152 char Ch=ChA[ChN]; 00153 switch (Ch){ 00154 case '~': DstChA+=uchar(232); break; 00155 case '^': DstChA+=uchar(200); break; 00156 case '}': DstChA+='c'; break; 00157 case ']': DstChA+='C'; break; 00158 case '|': DstChA+='d'; break; 00159 case '\\': DstChA+='D'; break; 00160 case '{': DstChA+=uchar(154); break; 00161 case '[': DstChA+=uchar(138); break; 00162 case '`': DstChA+=uchar(158); break; 00163 case '@': DstChA+=uchar(142); break; 00164 default: DstChA+=Ch; 00165 } 00166 } 00167 return DstChA; 00168 } 00169 00170 TStr THtmlLxChDef::GetIsoCeFromYuascii(const TChA& ChA){ 00171 TChA DstChA; 00172 for (int ChN=0; ChN<ChA.Len(); ChN++){ 00173 char Ch=ChA[ChN]; 00174 switch (Ch){ 00175 case '~': DstChA+=uchar(232); break; 00176 case '^': DstChA+=uchar(200); break; 00177 case '}': DstChA+=uchar(230); break; 00178 case ']': DstChA+=uchar(198); break; 00179 case '|': DstChA+=uchar(240); break; 00180 case '\\': DstChA+=uchar(208); break; 00181 case '{': DstChA+=uchar(185); break; 00182 case '[': DstChA+=uchar(169); break; 00183 case '`': DstChA+=uchar(190); break; 00184 case '@': DstChA+=uchar(174); break; 00185 default: DstChA+=Ch; 00186 } 00187 } 00188 return DstChA; 00189 } 00190 00192 // Html-Lexical 00193 THtmlLxChDef THtmlLx::ChDef; 00194 00195 void THtmlLx::GetEscCh(){ 00196 GetCh(); 00197 EscCh=(Ch=='&'); 00198 if (EscCh){ 00199 EscChA.Clr(); EscChA.AddCh(Ch); GetCh(); 00200 if (Ch=='#'){ 00201 EscChA.AddCh(Ch); GetCh(); 00202 if (('0'<=Ch)&&(Ch<='9')){ 00203 do {EscChA.AddCh(Ch); GetCh();} while (('0'<=Ch)&&(Ch<='9')); 00204 if (Ch==';'){GetCh();} 00205 PutStr(ChDef.GetEscStr(EscChA)); 00206 } else { 00207 PutCh('#'); PutCh('&'); 00208 } 00209 } else 00210 if ((('a'<=Ch)&&(Ch<='z'))||(('A'<=Ch)&&(Ch<='Z'))){ 00211 do { 00212 EscChA.AddCh(Ch); GetCh(); 00213 } while ((('A'<=Ch)&&(Ch<='Z'))||(('a'<=Ch)&&(Ch<='z'))||(('0'<=Ch)&&(Ch<='9'))); 00214 if (Ch==';'){ 00215 GetCh(); PutStr(ChDef.GetEscStr(EscChA)); 00216 } else { 00217 PutStr(EscChA); 00218 } 00219 } else { 00220 PutCh('&'); 00221 } 00222 } 00223 } 00224 00225 void THtmlLx::GetMetaTag(){ 00226 Sym=hsyMTag; 00227 if (Ch=='-'){ 00228 char PCh=' '; 00229 while ((Ch!=TCh::EofCh) && ((PCh!='-')||(Ch!='>'))){PCh=Ch; GetCh();} 00230 } else { 00231 while ((Ch!=TCh::EofCh) && (Ch!='>')){GetCh();} 00232 } 00233 if (Ch!=TCh::EofCh){GetEscCh();} 00234 } 00235 00236 void THtmlLx::GetTag(){ 00237 if (Ch=='/'){Sym=hsyETag; GetCh();} else {Sym=hsyBTag;} 00238 UcChA.AddCh('<'); 00239 while (ChDef.IsAlNum(Ch)||(Ch==':')){ 00240 UcChA.AddCh(ChDef.GetUc(Ch)); GetCh();} 00241 UcChA.AddCh('>'); 00242 ChA=UcChA; 00243 00244 if (DoParseArg){ 00245 while ((Ch!='>')&&(Ch!=TCh::EofCh)){ 00246 while ((!ChDef.IsAlpha(Ch))&&(Ch!='>')&&(Ch!=TCh::EofCh)){GetCh();} 00247 if (ChDef.IsAlpha(Ch)){ 00248 ArgNm.Clr(); ArgVal.Clr(); 00249 while (ChDef.IsAlNum(Ch)||(Ch=='-')){ArgNm.AddCh(ChDef.GetUc(Ch)); GetCh();} 00250 while (ChDef.IsWs(Ch)){GetCh();} 00251 if (Ch=='='){ 00252 GetCh(); while (ChDef.IsWs(Ch)){GetCh();} 00253 if (Ch=='"'){ 00254 GetCh(); 00255 while ((Ch!=TCh::EofCh)&&(Ch!='"')&&(Ch!='>')){ 00256 if (!ChDef.IsEoln(Ch)){ArgVal.AddCh(Ch);} GetCh();} 00257 if (Ch=='"'){GetCh();} 00258 } else if (Ch=='\''){ 00259 GetCh(); 00260 while ((Ch!=TCh::EofCh)&&(Ch!='\'')&&(Ch!='>')){ 00261 if (!ChDef.IsEoln(Ch)){ArgVal.AddCh(Ch);} GetCh();} 00262 if (Ch=='\''){GetCh();} 00263 } else { 00264 while ((!ChDef.IsWs(Ch))&&(Ch!='>')&&(Ch!=TCh::EofCh)){ 00265 ArgVal.AddCh(Ch); GetCh();} 00266 } 00267 ArgNmValV.Add(TStrKd(ArgNm, ArgVal)); 00268 } 00269 } 00270 } 00271 } else { 00272 while ((Ch!='>')&&(Ch!=TCh::EofCh)){GetCh();} 00273 } 00274 if (Ch!=TCh::EofCh){GetEscCh();} 00275 } 00276 00277 THtmlLxSym THtmlLx::GetSym(){ 00278 // prepare symbol descriptions 00279 ChA.Clr(); UcChA.Clr(); 00280 PreSpaces=0; PreSpaceChA.Clr(); 00281 ArgNmValV.Clr(); 00282 // skip white-space 00283 while (ChDef.IsSpace(Ch)){ 00284 if (ChX>0){PreSpaceChA+=Ch; PreSpaces++;} GetEscCh();} 00285 // parse symbol 00286 SymChA.Clr(); SymChA+=Ch; SymBChX=ChX; 00287 switch (ChDef.GetChTy(Ch)){ 00288 case hlctAlpha: 00289 Sym=hsyStr; 00290 forever{ 00291 do { 00292 ChA.AddCh(Ch); UcChA.AddCh(ChDef.GetUc(Ch)); GetEscCh(); 00293 } while (ChDef.IsAlNum(Ch)); 00294 if (Ch=='.'){ 00295 GetCh(); 00296 if (ChDef.IsAlNum(Ch)){ChA.AddCh('.'); UcChA.AddCh('.');} 00297 else {PutCh(Ch); Ch='.'; break;} 00298 } else {break;} 00299 } 00300 break; 00301 case hlctNum: 00302 Sym=hsyNum; 00303 forever{ 00304 do { 00305 ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh(); 00306 } while (ChDef.IsNum(Ch)); 00307 if (Ch=='.'){ 00308 GetCh(); 00309 if (ChDef.IsAlNum(Ch)){ChA.AddCh('.'); UcChA.AddCh('.');} 00310 else {PutCh(Ch); Ch='.'; break;} 00311 } else if (ChDef.IsAlpha(Ch)){ 00312 Sym=hsyStr; 00313 } else { 00314 break; 00315 } 00316 } 00317 break; 00318 case hlctSym: 00319 Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh(); 00320 if ((ChA.LastCh()=='.')&&(ChDef.IsAlNum(Ch))){ 00321 Sym=hsyStr; 00322 do { 00323 ChA.AddCh(Ch); UcChA.AddCh(ChDef.GetUc(Ch)); GetEscCh(); 00324 } while (ChDef.IsAlNum(Ch)); 00325 } 00326 break; 00327 case hlctLTag: 00328 if (EscCh){ 00329 Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh(); 00330 } else { 00331 GetCh(); 00332 if (Ch=='!'){GetCh(); GetMetaTag();} else {GetTag();} 00333 } 00334 break; 00335 case hlctRTag: 00336 if (EscCh){ 00337 Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh(); 00338 } else { 00339 Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh(); 00340 } 00341 break; 00342 case hlctEof: Sym=hsyEof; break; 00343 default: Sym=hsyUndef; GetEscCh(); 00344 } 00345 // set symbol last-character-position 00346 SymEChX=ChX-1; 00347 // delete last character 00348 if (!SymChA.Empty()){SymChA.Pop();} 00349 // return symbol 00350 return Sym; 00351 } 00352 00353 PHtmlTok THtmlLx::GetTok(const bool& DoUc){ 00354 if (DoUc){return PHtmlTok(new THtmlTok(Sym, UcChA, ArgNmValV));} 00355 else {return PHtmlTok(new THtmlTok(Sym, ChA, ArgNmValV));} 00356 } 00357 00358 TStr THtmlLx::GetFullBTagStr() const { 00359 IAssert(Sym==hsyBTag); 00360 TChA BTagChA; 00361 BTagChA+=ChA; BTagChA.Pop(); 00362 for (int ArgN=0; ArgN<GetArgs(); ArgN++){ 00363 BTagChA+=' '; BTagChA+=GetArgNm(ArgN); 00364 BTagChA+='='; BTagChA+='"'; BTagChA+=GetArgVal(ArgN); BTagChA+='"'; 00365 } 00366 BTagChA+='>'; 00367 return BTagChA; 00368 } 00369 00370 void THtmlLx::MoveToStrOrEof(const TStr& Str){ 00371 do { 00372 GetSym(); 00373 } while ((Sym!=hsyEof)&&((Sym!=hsyStr)||(ChA!=Str))); 00374 } 00375 00376 void THtmlLx::MoveToBTagOrEof(const TStr& TagNm){ 00377 do { 00378 GetSym(); 00379 } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||(UcChA!=TagNm))); 00380 } 00381 00382 void THtmlLx::MoveToBTag2OrEof(const TStr& TagNm1, const TStr& TagNm2){ 00383 do { 00384 GetSym(); 00385 } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||((UcChA!=TagNm1)&&(UcChA!=TagNm2)))); 00386 } 00387 00388 void THtmlLx::MoveToBTag3OrEof(const TStr& TagNm1, const TStr& TagNm2, const TStr& TagNm3){ 00389 do { 00390 GetSym(); 00391 } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||((UcChA!=TagNm1)&&(UcChA!=TagNm2)&&(UcChA!=TagNm3)))); 00392 } 00393 00394 void THtmlLx::MoveToBTagOrETagOrEof(const TStr& BTagNm, const TStr& ETagNm){ 00395 do { 00396 GetSym(); 00397 } while ((Sym!=hsyEof) && ((Sym!=hsyBTag)||(UcChA!=BTagNm)) && ((Sym!=hsyETag) || (UcChA!=ETagNm))); 00398 } 00399 00400 void THtmlLx::MoveToBTagArgOrEof( 00401 const TStr& TagNm, const TStr& ArgNm, const TStr& ArgVal){ 00402 forever { 00403 GetSym(); 00404 if (Sym==hsyEof){break;} 00405 if ((Sym==hsyBTag)&&(UcChA==TagNm)&& 00406 (IsArg(ArgNm))&&(GetArg(ArgNm)==ArgVal)){break;} 00407 } 00408 } 00409 00410 void THtmlLx::MoveToBTagArg2OrEof(const TStr& TagNm, 00411 const TStr& ArgNm1, const TStr& ArgVal1, 00412 const TStr& ArgNm2, const TStr& ArgVal2, const bool& AndOpP){ 00413 forever { 00414 GetSym(); 00415 if (Sym==hsyEof){break;} 00416 if (AndOpP){ 00417 if ((Sym==hsyBTag)&&(UcChA==TagNm)&& 00418 (IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1)&& 00419 (IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)){break;} 00420 } else { 00421 if ((Sym==hsyBTag)&&(UcChA==TagNm)&& 00422 (((IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1))|| 00423 ((IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)))){break;} 00424 } 00425 } 00426 } 00427 00428 void THtmlLx::MoveToBTagOrEof( 00429 const TStr& TagNm1, const TStr& ArgNm1, const TStr& ArgVal1, 00430 const TStr& TagNm2, const TStr& ArgNm2, const TStr& ArgVal2){ 00431 forever { 00432 GetSym(); 00433 if (Sym==hsyEof){break;} 00434 if ((Sym==hsyBTag)&&(UcChA==TagNm1)&& 00435 (IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1)){break;} 00436 if ((Sym==hsyBTag)&&(UcChA==TagNm2)&& 00437 (IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)){break;} 00438 } 00439 } 00440 00441 void THtmlLx::MoveToETagOrEof(const TStr& TagNm){ 00442 do { 00443 GetSym(); 00444 } while ((Sym!=hsyEof)&&((Sym!=hsyETag)||(UcChA!=TagNm))); 00445 } 00446 00447 TStr THtmlLx::GetTextOnlyStrToEof(){ 00448 TChA OutChA; 00449 forever { 00450 GetSym(); 00451 if (Sym==hsyEof){ 00452 break; 00453 } else { 00454 if (PreSpaces>0){OutChA+=' ';} 00455 if ((Sym!=hsyBTag)&&(Sym!=hsyETag)){ 00456 OutChA+=ChA;} 00457 } 00458 } 00459 return OutChA; 00460 } 00461 00462 TStr THtmlLx::GetStrToBTag(const TStr& TagNm, const bool& TxtOnlyP){ 00463 TChA OutChA; 00464 forever { 00465 GetSym(); 00466 if ((Sym==hsyEof)||((Sym==hsyBTag)&&(UcChA==TagNm))){ 00467 break; 00468 } else { 00469 if (PreSpaces>0){OutChA+=' ';} 00470 if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){ 00471 OutChA+=ChA;} 00472 } 00473 } 00474 return OutChA; 00475 } 00476 00477 TStr THtmlLx::GetStrToBTag(const TStr& TagNm, const TStr& ArgNm, 00478 const TStr& ArgVal, const bool& TxtOnlyP){ 00479 TChA OutChA; 00480 forever { 00481 GetSym(); 00482 if ((Sym==hsyEof)||((Sym==hsyBTag)&&(UcChA==TagNm)&& 00483 (IsArg(ArgNm))&&(GetArg(ArgNm)==ArgVal))){ 00484 break; 00485 } else { 00486 if (PreSpaces>0){OutChA+=' ';} 00487 if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){ 00488 OutChA+=ChA;} 00489 } 00490 } 00491 return OutChA; 00492 } 00493 00494 TStr THtmlLx::GetStrToETag(const TStr& TagNm, const bool& TxtOnlyP){ 00495 TChA OutChA; 00496 forever { 00497 GetSym(); 00498 if ((Sym==hsyEof)||((Sym==hsyETag)&&(UcChA==TagNm))){ 00499 break; 00500 } else { 00501 if (PreSpaces>0){OutChA+=' ';} 00502 if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){ 00503 OutChA+=ChA;} 00504 } 00505 } 00506 return OutChA; 00507 } 00508 00509 TStr THtmlLx::GetStrToETag2(const TStr& TagNm1, 00510 const TStr& TagNm2, const bool& TxtOnlyP){ 00511 TChA OutChA; 00512 forever { 00513 GetSym(); 00514 if ((Sym==hsyEof)||((Sym==hsyETag)&&(UcChA==TagNm1))||((Sym==hsyETag)&&(UcChA==TagNm2))){ 00515 break; 00516 } else { 00517 if (PreSpaces>0){OutChA+=' ';} 00518 if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){ 00519 OutChA+=ChA;} 00520 } 00521 } 00522 return OutChA; 00523 } 00524 00525 TStr THtmlLx::GetStrInTag(const TStr& TagNm, const bool& TxtOnlyP){ 00526 MoveToBTagOrEof(TagNm); 00527 return GetStrToETag(TagNm, TxtOnlyP); 00528 } 00529 00530 TStr THtmlLx::GetHRefBeforeStr(const TStr& Str){ 00531 TStr HRefStr; 00532 forever { 00533 GetSym(); 00534 if (Sym==hsyEof){HRefStr=""; break;} 00535 if ((Sym==hsyBTag)&&(UcChA=="<A>")){HRefStr=GetArg("HREF");} 00536 if ((Sym==hsyStr)&&(ChA==Str)){break;} 00537 } 00538 return HRefStr; 00539 } 00540 00541 bool THtmlLx::IsGetBTag(const TStr& TagNm){ 00542 if (GetSym()==hsyBTag){ 00543 return ChA==TagNm; 00544 } else {return false;} 00545 } 00546 00547 bool THtmlLx::IsGetETag(const TStr& TagNm){ 00548 if (GetSym()==hsyETag){ 00549 return ChA==TagNm; 00550 } else {return false;} 00551 } 00552 00553 TStr THtmlLx::GetSymStr(const THtmlLxSym& Sym){ 00554 switch (Sym){ 00555 case hsyUndef: return "Undef"; 00556 case hsyStr: return "Str"; 00557 case hsyNum: return "Num"; 00558 case hsySSym: return "SSym"; 00559 case hsyUrl: return "Url"; 00560 case hsyBTag: return "BTag"; 00561 case hsyETag: return "ETag"; 00562 case hsyMTag: return "MTag"; 00563 case hsyEof: return "Eof"; 00564 default: Fail; return TStr(); 00565 } 00566 } 00567 00568 TStr THtmlLx::GetEscapedStr(const TChA& ChA){ 00569 TChA EscapedChA; 00570 for (int ChN=0; ChN<ChA.Len(); ChN++){ 00571 char Ch=ChA[ChN]; 00572 switch (Ch){ 00573 case '"': EscapedChA+="""; break; 00574 case '&': EscapedChA+="&"; break; 00575 case '\'': EscapedChA+="'"; break; 00576 case '<': EscapedChA+="<"; break; 00577 case '>': EscapedChA+=">"; break; 00578 default: EscapedChA+=Ch; 00579 } 00580 } 00581 return EscapedChA; 00582 } 00583 00584 TStr THtmlLx::GetAsciiStr(const TChA& ChA, const char& GenericCh){ 00585 TChA AsciiChA; 00586 for (int ChN=0; ChN<ChA.Len(); ChN++){ 00587 char Ch=ChA[ChN]; 00588 if ((Ch<' ')||('~'<Ch)){ 00589 Ch=GenericCh;} 00590 AsciiChA+=Ch; 00591 } 00592 return AsciiChA; 00593 } 00594 00595 void THtmlLx::GetTokStrV(const TStr& Str, TStrV& TokStrV){ 00596 PSIn SIn=TStrIn::New(Str); 00597 THtmlLx Lx(SIn); 00598 Lx.GetSym(); 00599 TokStrV.Clr(); 00600 while (Lx.Sym!=hsyEof){ 00601 TokStrV.Add(Lx.ChA); 00602 Lx.GetSym(); 00603 } 00604 } 00605 00606 TStr THtmlLx::GetNoTag(const TStr& Str) { 00607 PSIn SIn=TStrIn::New(Str); 00608 THtmlLx Lx(SIn); 00609 Lx.GetSym(); 00610 TChA ChA; 00611 while (Lx.Sym!=hsyEof){ 00612 switch (Lx.Sym){ 00613 case hsyUndef: 00614 case hsyStr: 00615 case hsyNum: 00616 case hsySSym: 00617 if (Lx.PreSpaces > 0) { ChA += ' '; } 00618 ChA += Lx.ChA; 00619 default: break; 00620 } 00621 Lx.GetSym(); 00622 } 00623 return ChA; 00624 } 00625 00627 // Html-Token 00628 TStr THtmlTok::GetFullStr() const { 00629 if ((Sym==hsyBTag)&&(ArgNmValV.Len()>0)){ 00630 TChA FullChA; 00631 FullChA+=Str.GetSubStr(0, Str.Len()-2); 00632 for (int ArgNmValN=0; ArgNmValN<ArgNmValV.Len(); ArgNmValN++){ 00633 FullChA+=' '; FullChA+=ArgNmValV[ArgNmValN].Key; FullChA+='='; 00634 FullChA+='"'; FullChA+=ArgNmValV[ArgNmValN].Dat; FullChA+='"'; 00635 } 00636 FullChA+='>'; 00637 return FullChA; 00638 } else 00639 if (Sym==hsyETag){ 00640 TChA FullChA; 00641 FullChA+='<'; FullChA+='/'; FullChA+=Str.GetSubStr(1, Str.Len()-1); 00642 return FullChA; 00643 } else { 00644 return GetStr(); 00645 } 00646 } 00647 00648 bool THtmlTok::IsUrlTok(TStr& RelUrlStr) const { 00649 if (GetSym()==hsyBTag){ 00650 TStr TagNm=GetStr(); 00651 if ((TagNm==ATagNm)&&(IsArg(HRefArgNm))){ 00652 RelUrlStr=GetArg(HRefArgNm); return true;} 00653 else if ((TagNm==AreaTagNm)&&(IsArg(HRefArgNm))){ 00654 RelUrlStr=GetArg(HRefArgNm); return true;} 00655 else if ((TagNm==FrameTagNm)&&(IsArg(SrcArgNm))){ 00656 RelUrlStr=GetArg(SrcArgNm); return true;} 00657 else if ((TagNm==ImgTagNm)&&(IsArg(SrcArgNm))){ 00658 RelUrlStr=GetArg(SrcArgNm); return true;} 00659 else if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){ 00660 TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc(); 00661 if ((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT")){ 00662 TStr ContentStr=GetArg("CONTENT"); 00663 TStr LeftStr; TStr RightStr; TStr UrlEqStr="URL="; 00664 ContentStr.GetUc().SplitOnStr(LeftStr, UrlEqStr, RightStr); 00665 RelUrlStr=ContentStr.GetSubStr( 00666 LeftStr.Len()+UrlEqStr.Len(), ContentStr.Len()); 00667 return !RelUrlStr.Empty(); 00668 } else { 00669 return false; 00670 } 00671 } 00672 } 00673 return false; 00674 } 00675 00676 bool THtmlTok::IsRedirUrlTok() const { 00677 if (GetSym()==hsyBTag){ 00678 TStr TagNm=GetStr(); 00679 if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){ 00680 TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc(); 00681 if ((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT")){ 00682 return true; 00683 } else { 00684 return false; 00685 } 00686 } 00687 } 00688 return false; 00689 } 00690 00691 void THtmlTok::SaveTxt(const PSOut& SOut, const bool& TxtMode){ 00692 if (TxtMode){ 00693 SOut->PutStr(GetFullStr()); SOut->PutStr(" "); 00694 } else { 00695 SOut->PutStr(THtmlLx::GetSymStr(Sym)); SOut->PutStr(" "); 00696 SOut->PutStr(GetFullStr()); SOut->PutStr(" "); 00697 } 00698 } 00699 00700 const TStr THtmlTok::ATagNm="<A>"; 00701 const TStr THtmlTok::AreaTagNm="<AREA>"; 00702 const TStr THtmlTok::BrTagNm="<BR>"; 00703 const TStr THtmlTok::CardTagNm="<CARD>"; 00704 const TStr THtmlTok::CenterTagNm="<CENTER>"; 00705 const TStr THtmlTok::FrameTagNm="<FRAME>"; 00706 const TStr THtmlTok::H1TagNm="<H1>"; 00707 const TStr THtmlTok::H2TagNm="<H2>"; 00708 const TStr THtmlTok::H3TagNm="<H3>"; 00709 const TStr THtmlTok::H4TagNm="<H4>"; 00710 const TStr THtmlTok::H5TagNm="<H5>"; 00711 const TStr THtmlTok::H6TagNm="<H6>"; 00712 const TStr THtmlTok::ImgTagNm="<IMG>"; 00713 const TStr THtmlTok::LiTagNm="<LI>"; 00714 const TStr THtmlTok::MetaTagNm="<META>"; 00715 const TStr THtmlTok::PTagNm="<P>"; 00716 const TStr THtmlTok::UlTagNm="<UL>"; 00717 const TStr THtmlTok::TitleTagNm="<TITLE>"; 00718 const TStr THtmlTok::TitleETagNm="</TITLE>"; 00719 00720 const TStr THtmlTok::AltArgNm="ALT"; 00721 const TStr THtmlTok::HRefArgNm="HREF"; 00722 const TStr THtmlTok::SrcArgNm="SRC"; 00723 const TStr THtmlTok::TitleArgNm="TITLE"; 00724 const TStr THtmlTok::HttpEquivArgNm="HTTP-EQUIV"; 00725 00726 bool THtmlTok::IsBreakTag(const TStr& TagNm){ 00727 static TStrH BreakTagNmH(50); 00728 if (BreakTagNmH.Len()==0){ 00729 BreakTagNmH.AddKey(TStr("<H1>")); BreakTagNmH.AddKey(TStr("<H2>")); 00730 BreakTagNmH.AddKey(TStr("<H3>")); BreakTagNmH.AddKey(TStr("<H4>")); 00731 BreakTagNmH.AddKey(TStr("<H5>")); BreakTagNmH.AddKey(TStr("<H6>")); 00732 BreakTagNmH.AddKey(TStr("<BR>")); BreakTagNmH.AddKey(TStr("<HR>")); 00733 BreakTagNmH.AddKey(TStr("<P>")); BreakTagNmH.AddKey(TStr("<DL>")); 00734 BreakTagNmH.AddKey(TStr("<UL>")); BreakTagNmH.AddKey(TStr("<OL>")); 00735 BreakTagNmH.AddKey(TStr("<LI>")); BreakTagNmH.AddKey(TStr("<DT>")); 00736 BreakTagNmH.AddKey(TStr("<DD>")); BreakTagNmH.AddKey(TStr("<HEAD>")); 00737 BreakTagNmH.AddKey(TStr("<TITLE>")); BreakTagNmH.AddKey(TStr("<META>")); 00738 BreakTagNmH.AddKey(TStr("<SCRIPT>")); 00739 BreakTagNmH.AddKey(TStr("<HEAD>")); BreakTagNmH.AddKey(TStr("<BODY>")); 00740 } 00741 return BreakTagNmH.IsKey(TagNm); 00742 } 00743 00744 bool THtmlTok::IsBreakTok(const PHtmlTok& Tok){ 00745 if ((Tok->GetSym()==hsyBTag)||(Tok->GetSym()==hsyETag)){ 00746 return IsBreakTag(Tok->GetStr()); 00747 } else { 00748 return false; 00749 } 00750 } 00751 00752 bool THtmlTok::IsHTag(const TStr& TagNm, int& HTagN){ 00753 if ((TagNm.Len()==4)&&(TagNm[0]=='<')&&(TagNm[1]=='H')&&(TagNm[3]=='>')){ 00754 char Ch=TagNm[2]; 00755 if (('1'<=Ch)&&(Ch<='6')){HTagN=Ch-'0'; return true;} 00756 else {HTagN=-1; return false;} 00757 } else { 00758 HTagN=-1; return false; 00759 } 00760 } 00761 00762 PHtmlTok THtmlTok::GetHTok(const bool& IsBTag, const int& HTagN){ 00763 THtmlLxSym HTagSym=IsBTag?hsyBTag:hsyETag; 00764 TStr HTagNm; 00765 switch (HTagN){ 00766 case 1: HTagNm=H1TagNm; break; 00767 case 2: HTagNm=H2TagNm; break; 00768 case 3: HTagNm=H3TagNm; break; 00769 case 4: HTagNm=H4TagNm; break; 00770 case 5: HTagNm=H5TagNm; break; 00771 case 6: HTagNm=H6TagNm; break; 00772 default: Fail; 00773 } 00774 return PHtmlTok(new THtmlTok(HTagSym, HTagNm)); 00775 } 00776 00778 // Html-Document 00779 THtmlDoc::THtmlDoc(const PSIn& SIn, const THtmlDocType& Type, const bool& DoUc): 00780 TokV(1000, 0){ 00781 THtmlLx Lx(SIn); 00782 bool MkTok=false; bool InUL=false; 00783 while (Lx.GetSym()!=hsyEof){ 00784 switch (Type){ 00785 case hdtAll: MkTok=true; break; 00786 case hdtStr: MkTok=(Lx.Sym==hsyStr); break; 00787 case hdtStrNum: MkTok=(Lx.Sym==hsyStr)||(Lx.Sym==hsyNum); break; 00788 case hdtTag: MkTok=(Lx.Sym==hsyBTag)||(Lx.Sym==hsyETag); break; 00789 case hdtA: MkTok=(Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::ATagNm); break; 00790 case hdtHRef: 00791 MkTok=(Lx.Sym==hsyBTag)&& 00792 ((Lx.UcChA==THtmlTok::ATagNm)||(Lx.UcChA==THtmlTok::AreaTagNm)|| 00793 (Lx.UcChA==THtmlTok::FrameTagNm)||(Lx.UcChA==THtmlTok::ImgTagNm)|| 00794 (Lx.UcChA==THtmlTok::MetaTagNm)); 00795 break; 00796 case hdtUL: 00797 if ((Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=true;} 00798 MkTok=InUL; 00799 if ((Lx.Sym==hsyETag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=false;} 00800 break; 00801 default: Fail; 00802 } 00803 if (MkTok){TokV.Add(Lx.GetTok(DoUc));} 00804 } 00805 TokV.Add(PHtmlTok(new THtmlTok(hsyEof))); 00806 } 00807 00808 TStr THtmlDoc::GetTxtLnDoc(const TStr& HtmlStr){ 00809 TChA LnDocChA; 00810 // prepare html parsing 00811 PSIn HtmlSIn=TStrIn::New(HtmlStr); 00812 THtmlLx HtmlLx(HtmlSIn); 00813 bool InScript=false; 00814 // save text 00815 while (HtmlLx.GetSym()!=hsyEof){ 00816 TStr Str=HtmlLx.ChA; 00817 switch (HtmlLx.Sym){ 00818 case hsyStr: 00819 case hsyNum: 00820 case hsySSym: 00821 if (InScript){break;} 00822 if (HtmlLx.PreSpaces>0){LnDocChA+=' ';} 00823 LnDocChA+=Str.CStr(); 00824 break; 00825 case hsyBTag: 00826 if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';} 00827 if ((!InScript)&&(Str=="<SCRIPT>")){InScript=true;} 00828 break; 00829 case hsyETag: 00830 if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';} 00831 if ((InScript)&&(Str=="<SCRIPT>")){InScript=false;} 00832 break; 00833 default: break; 00834 } 00835 } 00836 // return result 00837 return LnDocChA; 00838 } 00839 00840 TStr THtmlDoc::GetTxtLnDoc(const TStr& HtmlStr, 00841 const TStr& BaseUrlStr, const bool& OutUrlP, const bool& OutTagsP){ 00842 // prepare output-string 00843 TChA OutChA; OutChA+=' '; 00844 // prepare html parsing 00845 PSIn HtmlSIn=TStrIn::New(HtmlStr); 00846 THtmlLx HtmlLx(HtmlSIn); 00847 bool InScript=false; 00848 // save text 00849 while (HtmlLx.GetSym()!=hsyEof){ 00850 TStr Str=HtmlLx.ChA; 00851 switch (HtmlLx.Sym){ 00852 case hsyUndef: 00853 case hsyUrl: 00854 case hsyMTag: 00855 break; 00856 case hsyStr: 00857 case hsyNum: 00858 case hsySSym: 00859 if (InScript){break;} 00860 if (HtmlLx.PreSpaces>0){if (OutChA.LastCh()!=' '){OutChA+=' ';}} 00861 OutChA+=Str; 00862 break; 00863 case hsyBTag: 00864 // extract tag name 00865 Str=Str.GetSubStr(1, Str.Len()-2); 00866 // process tag 00867 if (!InScript){ 00868 // check script tag 00869 if (Str=="SCRIPT"){ 00870 InScript=true; break;} 00871 // output tag 00872 if (OutTagsP){ 00873 OutChA+='<'; OutChA+=Str; OutChA+='>'; 00874 } else { 00875 if (OutChA.LastCh()!=' '){OutChA+=' ';} 00876 } 00877 // check if URL present 00878 PHtmlTok Tok=HtmlLx.GetTok(); 00879 TStr RelUrlStr; 00880 if (Tok->IsUrlTok(RelUrlStr)){ 00881 PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr); 00882 if (Url->IsOk()){ 00883 if (OutUrlP){ 00884 TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr()); 00885 OutChA+="<Url>"; OutChA+=XmlUrlStr; OutChA+="</Url>"; 00886 } 00887 } 00888 } 00889 } 00890 break; 00891 case hsyETag: 00892 // extract tag name 00893 Str=Str.GetSubStr(1, Str.Len()-2); 00894 // process tag 00895 if (InScript){ 00896 if (Str=="SCRIPT"){ 00897 InScript=false; break;} 00898 } else { 00899 if (OutTagsP){ 00900 OutChA+="</"; OutChA+=Str; OutChA+='>'; 00901 } else { 00902 if (OutChA.LastCh()!=' '){OutChA+=' ';} 00903 } 00904 } 00905 break; 00906 case hsyEof: break; 00907 default: Fail; 00908 } 00909 } 00910 // return string 00911 return OutChA; 00912 } 00913 00914 00915 void THtmlDoc::SaveTxt(const PSOut& SOut, const bool& TxtMode) const { 00916 if (TxtMode){ 00917 for (int TokN=0; TokN<TokV.Len(); TokN++){TokV[TokN]->SaveTxt(SOut);} 00918 SOut->PutLn(); 00919 } else { 00920 for (int TokN=0; TokN<TokV.Len(); TokN++){ 00921 SOut->PutStr(TInt::GetStr(TokN)); SOut->PutStr(": "); 00922 TokV[TokN]->SaveTxt(SOut); 00923 SOut->PutLn(); 00924 } 00925 } 00926 } 00927 00928 void THtmlDoc::SaveHtmlToTxt( 00929 const TStr& HtmlStr, const PSOut& TxtSOut, const TStr& BaseUrlStr, 00930 const bool& OutUrlP, const bool& OutTagsP){ 00931 // get text-string from html-string 00932 TStr TxtStr=GetTxtLnDoc(HtmlStr, BaseUrlStr, OutUrlP, OutTagsP); 00933 // save text-string 00934 TxtStr.SaveTxt(TxtSOut); 00935 } 00936 00937 void THtmlDoc::SaveHtmlToTxt( 00938 const TStr& HtmlStr, const TStr& TxtFNm, const TStr& BaseUrlStr, 00939 const bool& OutUrlP, const bool& OutTagsP){ 00940 // create output file 00941 PSOut TxtSOut=TFOut::New(TxtFNm); 00942 // save to output file 00943 SaveHtmlToTxt(HtmlStr, TxtSOut, BaseUrlStr, OutUrlP, OutTagsP); 00944 } 00945 00946 void THtmlDoc::SaveHtmlToXml( 00947 const TStr& HtmlStr, const PSOut& XmlSOut, const TStr& BaseUrlStr, 00948 const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP, 00949 const bool& OutTagsP, const bool& OutArgsP){ 00950 // prepare output-file-id 00951 TFileId fXml=XmlSOut->GetFileId(); 00952 // create outgoing url 00953 TStrV OutUrlStrV; 00954 // open top tag 00955 fprintf(fXml, "<HtmlDoc>\n"); 00956 // save url 00957 if (!BaseUrlStr.Empty()){ 00958 TStr XmlBaseUrlStr=TXmlLx::GetXmlStrFromPlainStr(BaseUrlStr); 00959 fprintf(fXml, "<BaseUrl>%s</BaseUrl>\n", XmlBaseUrlStr.CStr()); 00960 } 00961 // prepare html parsing 00962 PSIn HtmlSIn=TStrIn::New(HtmlStr); 00963 THtmlLx HtmlLx(HtmlSIn); 00964 TChA ContTextChA; bool InScript=false; 00965 // save text 00966 fprintf(fXml, "<Body>\n"); 00967 while (HtmlLx.GetSym()!=hsyEof){ 00968 TStr Str=HtmlLx.ChA; 00969 switch (HtmlLx.Sym){ 00970 case hsyUndef: 00971 case hsyUrl: 00972 case hsyMTag: 00973 break; 00974 case hsyStr: 00975 if (InScript){break;} 00976 Str=TXmlLx::GetXmlStrFromPlainStr(Str); 00977 if (OutToksP){ 00978 fprintf(fXml, " <Str>%s</Str>\n", Str.CStr());} 00979 if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str; 00980 break; 00981 case hsyNum: 00982 if (InScript){break;} 00983 Str=TXmlLx::GetXmlStrFromPlainStr(Str); 00984 if (OutToksP){ 00985 fprintf(fXml, " <Num>%s</Num>\n", Str.CStr());} 00986 if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str; 00987 break; 00988 case hsySSym: 00989 if (InScript){break;} 00990 Str=TXmlLx::GetXmlStrFromPlainStr(Str); 00991 if (OutToksP){ 00992 fprintf(fXml, " <Sym>%s</Sym>\n", Str.CStr());} 00993 if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str; 00994 break; 00995 case hsyBTag:{ 00996 // save continuos text 00997 if (!ContTextChA.Empty()){ 00998 if (OutTextP){ 00999 fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());} 01000 ContTextChA.Clr(); 01001 } 01002 // extract tag name 01003 Str=Str.GetSubStr(1, Str.Len()-2); 01004 Str=TXmlLx::GetXmlStrFromPlainStr(Str); 01005 // process tag 01006 if (!InScript){ 01007 // check script tag 01008 if (Str=="SCRIPT"){ 01009 InScript=true; break;} 01010 // output tag 01011 if (OutTagsP){ 01012 if (OutArgsP){ 01013 fprintf(fXml, " <BTag Nm=\"%s\">\n", Str.CStr()); 01014 for (int ArgN=0; ArgN<HtmlLx.GetArgs(); ArgN++){ 01015 TStr ArgNm=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgNm(ArgN)); 01016 TStr ArgVal=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgVal(ArgN)); 01017 fprintf(fXml, " <Arg Nm=\"%s\" Val=\"%s\"/>", ArgNm.CStr(), ArgVal.CStr()); 01018 } 01019 fprintf(fXml, " </BTag>\n"); 01020 } else { 01021 fprintf(fXml, " <BTag Nm=\"%s\"/>\n", Str.CStr()); 01022 } 01023 } 01024 // check if URL present 01025 PHtmlTok Tok=HtmlLx.GetTok(); 01026 TStr RelUrlStr; 01027 if (Tok->IsUrlTok(RelUrlStr)){ 01028 PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr); 01029 if (Url->IsOk()){ 01030 OutUrlStrV.Add(Url->GetUrlStr()); 01031 if (OutUrlP){ 01032 TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr()); 01033 fprintf(fXml, " <Url>%s</Url>\n", XmlUrlStr.CStr()); 01034 } 01035 } 01036 } 01037 } 01038 break;} 01039 case hsyETag:{ 01040 // save continuos text 01041 if (!ContTextChA.Empty()){ 01042 if (OutTextP){ 01043 fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());} 01044 ContTextChA.Clr(); 01045 } 01046 // extract tag name 01047 Str=Str.GetSubStr(1, Str.Len()-2); 01048 Str=TXmlLx::GetXmlStrFromPlainStr(Str); 01049 // process tag 01050 if (InScript){ 01051 if (Str=="SCRIPT"){ 01052 InScript=false; break;} 01053 } else { 01054 if (OutTagsP){ 01055 fprintf(fXml, " <ETag Nm=\"%s\"/>\n", Str.CStr());} 01056 } 01057 break;} 01058 case hsyEof: break; 01059 default: Fail; 01060 } 01061 } 01062 // save continuos text 01063 if (!ContTextChA.Empty()){ 01064 if (OutTextP){ 01065 fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());} 01066 ContTextChA.Clr(); 01067 } 01068 fprintf(fXml, "</Body>\n"); 01069 // save outgoing urls 01070 fprintf(fXml, "<OutUrls>\n"); 01071 for (int UrlN=0; UrlN<OutUrlStrV.Len(); UrlN++){ 01072 TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(OutUrlStrV[UrlN]); 01073 fprintf(fXml, " <Url N=\"%d\">%s</Url>\n", 1+UrlN, XmlUrlStr.CStr()); 01074 } 01075 fprintf(fXml, "</OutUrls>\n"); 01076 01077 // close top tag 01078 fprintf(fXml, "</HtmlDoc>\n"); 01079 } 01080 01081 void THtmlDoc::SaveHtmlToXml( 01082 const TStr& HtmlStr, const TStr& XmlFNm, const TStr& BaseUrlStr, 01083 const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP, 01084 const bool& OutTagsP, const bool& OutArgsP){ 01085 // create output file 01086 PSOut XmlSOut=TFOut::New(XmlFNm); 01087 // save to output file 01088 SaveHtmlToXml(HtmlStr, XmlSOut, BaseUrlStr, OutTextP, OutUrlP, 01089 OutToksP, OutTagsP, OutArgsP); 01090 } 01091 01092 TLxSym THtmlDoc::GetLxSym(const THtmlLxSym& HtmlLxSym, const TChA& ChA){ 01093 switch (HtmlLxSym){ 01094 case hsyUndef: return syUndef; 01095 case hsyStr: return syStr; 01096 case hsyNum: return syFlt; 01097 case hsySSym: return TLxSymStr::GetSSym(ChA); 01098 case hsyUrl: return syStr; 01099 case hsyBTag: return syStr; 01100 case hsyETag: return syStr; 01101 case hsyEof: return syEof; 01102 default: Fail; return syUndef; 01103 } 01104 } 01105 01106 bool THtmlDoc::_IsTagRedir( 01107 const TStr& TagStr, const TStr& ArgNm, THtmlLx& Lx, 01108 const TStr& BaseUrlStr, const TStr& RedirUrlStr){ 01109 IAssert(Lx.Sym==hsyBTag); 01110 if ((Lx.ChA==TagStr)&&(Lx.IsArg(ArgNm))){ 01111 TStr RelUrlStr=Lx.GetArg(ArgNm); 01112 PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr); 01113 if (Url->IsOk(usHttp)){ 01114 TStr UrlStr=Url->GetUrlStr(); 01115 PUrlEnv RedirUrlEnv=TUrlEnv::New(RedirUrlStr, "url", UrlStr); 01116 Lx.PutArg(ArgNm, RedirUrlEnv->GetFullUrlStr()); 01117 return true; 01118 } else { 01119 return false; 01120 } 01121 } else { 01122 return false; 01123 } 01124 } 01125 01126 TStr THtmlDoc::GetRedirHtmlDocStr(const TStr& HtmlStr, 01127 const TStr& BaseUrlStr, const TStr& RedirUrlStr){ 01128 PSIn SIn=TStrIn::New(HtmlStr); 01129 TMOut SOut; 01130 THtmlLx Lx(SIn); 01131 while (Lx.GetSym()!=hsyEof){ 01132 SOut.PutStr(Lx.PreSpaceChA); 01133 if ((Lx.Sym==hsyBTag)&&( 01134 (_IsTagRedir(THtmlTok::ATagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))|| 01135 (_IsTagRedir(THtmlTok::AreaTagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))|| 01136 (_IsTagRedir(THtmlTok::FrameTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr))|| 01137 (_IsTagRedir(THtmlTok::ImgTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr)))){ 01138 SOut.PutStr(Lx.GetFullBTagStr()); 01139 } else { 01140 SOut.PutStr(Lx.SymChA()); 01141 } 01142 } 01143 return SOut.GetAsStr(); 01144 } 01145 01147 // Html-Hyper-Link-Document-Vector 01148 THtmlHldV::THtmlHldV(const PHtmlDoc& _RefHtmlDoc, const int& HldWnLen): 01149 RefHtmlDoc(_RefHtmlDoc), HldV(){ 01150 bool IsTitleAct=false; THtmlTokV TitleTokV; 01151 bool IsHAct=false; int ActHTagN=-1; 01152 TVec<THtmlTokV> HTokV(6); 01153 PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr; 01154 for (int TokN=0; TokN<RefHtmlDoc->GetToks(); TokN++){ 01155 Tok=RefHtmlDoc->GetTok(TokN, TokSym, TokStr); 01156 if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){ 01157 // collect tokens before, inside and after <a> ... </a> tags 01158 int ATokN; PHtmlTok ATok; THtmlLxSym ATokSym; TStr ATokStr; 01159 // inside <A> tags 01160 THtmlTokV ATokV; ATokN=TokN; 01161 forever{ 01162 ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr); 01163 if (ATokSym!=hsySSym){ATokV.Add(ATok);} 01164 if ((ATokSym==hsyETag)&&(ATokStr==THtmlTok::ATagNm)){break;} 01165 ATokN++; 01166 if (ATokN>=RefHtmlDoc->GetToks()){break;} 01167 } 01168 int ETagATokN=ATokN+1; 01169 // before <A> tags 01170 THtmlTokV PrevATokV; ATokN=TokN; 01171 forever{ 01172 ATokN--; 01173 if (ATokN<0){break;} 01174 ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr); 01175 if (THtmlTok::IsBreakTok(ATok)){break;} 01176 if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){PrevATokV.Add(ATok);} 01177 if (ATokV.Len()>=HldWnLen){break;} 01178 } 01179 // after <A> tags 01180 THtmlTokV NextATokV; ATokN=ETagATokN; 01181 forever{ 01182 ATokN++; 01183 if (ATokN>=RefHtmlDoc->GetToks()){break;} 01184 ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr); 01185 if (THtmlTok::IsBreakTok(ATok)){break;} 01186 if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){NextATokV.Add(ATok);} 01187 if (ATokV.Len()>=HldWnLen){break;} 01188 } 01189 // construct html-document with hyper-link context 01190 PHtmlDoc HtmlDoc=PHtmlDoc(new THtmlDoc()); 01191 HtmlDoc->AddTokV(TitleTokV); 01192 for (int HTagN=1; HTagN<=6; HTagN++){HtmlDoc->AddTokV(HTokV[HTagN-1]);} 01193 HtmlDoc->AddTokV(PrevATokV); 01194 HtmlDoc->AddTokV(ATokV); 01195 HtmlDoc->AddTokV(NextATokV); 01196 HldV.Add(HtmlDoc); 01197 HtmlDoc->SaveTxt(TSOut::StdOut); 01198 } else 01199 if (TokSym==hsyBTag){ 01200 int HTagN; 01201 if (TokStr==THtmlTok::TitleTagNm){ 01202 IsTitleAct=true; TitleTokV.Clr(); TitleTokV.Add(Tok); 01203 } else 01204 if (THtmlTok::IsHTag(TokStr, HTagN)){ 01205 if (IsHAct){// conclude previous <H?> tag if left open 01206 HTokV[ActHTagN-1].Add(THtmlTok::GetHTok(false, ActHTagN));} 01207 IsHAct=true; ActHTagN=HTagN; 01208 {for (int HTagN=ActHTagN; HTagN<=6; HTagN++){HTokV[HTagN-1].Clr();}} 01209 HTokV[ActHTagN-1].Add(Tok); 01210 } 01211 } else 01212 if (TokSym==hsyETag){ 01213 int HTagN; 01214 if (TokStr==THtmlTok::TitleTagNm){ 01215 if (IsTitleAct){TitleTokV.Add(Tok); IsTitleAct=false;} 01216 } else 01217 if (THtmlTok::IsHTag(TokStr, HTagN)){ 01218 if (IsHAct){HTokV[ActHTagN-1].Add(Tok); IsHAct=false;} 01219 } 01220 } else 01221 if (TokSym!=hsySSym){ 01222 if (IsTitleAct){TitleTokV.Add(Tok);} 01223 if (IsHAct){HTokV[ActHTagN-1].Add(Tok);} 01224 } 01225 } 01226 } 01227 01229 // Web-Page 01230 void TWebPg::GetOutUrlV(TUrlV& OutUrlV, TUrlV& OutRedirUrlV) const { 01231 // create outgoing url vector 01232 OutUrlV.Clr(); OutRedirUrlV.Clr(); 01233 // take interesting web-page components 01234 TStr UrlStr=GetUrlStr(); 01235 TStr HtmlStr=GetHttpBodyAsStr(); 01236 // prepare html parsing 01237 PSIn HtmlSIn=TStrIn::New(HtmlStr); 01238 PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn); 01239 PHtmlTok Tok; 01240 // traverse html 01241 for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){ 01242 PHtmlTok Tok=HtmlDoc->GetTok(TokN); 01243 if (Tok->GetSym()==hsyBTag){ 01244 TStr RelUrlStr; 01245 if (Tok->IsUrlTok(RelUrlStr)){ 01246 PUrl Url=TUrl::New(RelUrlStr, UrlStr); 01247 if (Url->IsOk(usHttp)){ 01248 OutUrlV.Add(Url); 01249 if (Tok->IsRedirUrlTok()){ 01250 OutRedirUrlV.Add(Url); 01251 } 01252 } 01253 } 01254 } 01255 } 01256 } 01257 01258 void TWebPg::GetOutDescUrlStrKdV(TStrKdV& OutDescUrlStrKdV) const { 01259 // create outgoing url vector 01260 OutDescUrlStrKdV.Clr(); 01261 // take interesting web-page components 01262 TStr UrlStr=GetUrlStr(); 01263 TStr HtmlStr=GetHttpBodyAsStr(); 01264 // prepare html parsing 01265 PSIn HtmlSIn=TStrIn::New(HtmlStr); 01266 PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn); 01267 // traverse html documents 01268 PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr; 01269 int TokN=0; int Toks=HtmlDoc->GetToks(); 01270 while (TokN<Toks){ 01271 Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++; 01272 if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){ 01273 TStr RelUrlStr; 01274 if (Tok->IsUrlTok(RelUrlStr)){ 01275 PUrl Url=TUrl::New(RelUrlStr, UrlStr); 01276 if (Url->IsOk()){ 01277 TChA DescChA; 01278 while (TokN<Toks){ 01279 Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++; 01280 if ((TokSym==hsyETag)&&(TokStr==THtmlTok::ATagNm)){ 01281 break; 01282 } else { 01283 if ((TokSym==hsyStr)||(TokSym==hsyNum)||(TokSym==hsySSym)){ 01284 if (!DescChA.Empty()){DescChA+=' ';} 01285 DescChA+=TokStr; 01286 } 01287 } 01288 } 01289 OutDescUrlStrKdV.Add(TStrKd(DescChA, Url->GetUrlStr())); 01290 } 01291 } 01292 } 01293 } 01294 } 01295 01296 void TWebPg::SaveAsHttpBody(const TStr& FNm) const { 01297 // create output file 01298 PSOut SOut=TFOut::New(FNm); 01299 // save http-body 01300 HttpResp->SaveBody(SOut); 01301 } 01302 01303 void TWebPg::SaveAsHttp(const TStr& FNm) const { 01304 // create output file 01305 PSOut SOut=TFOut::New(FNm); 01306 // save http 01307 HttpResp->SaveTxt(SOut); 01308 } 01309 01310 bool TWebPg::IsTxt() const { 01311 if ((!HttpResp->IsContType())||HttpResp->IsContType(THttp::TextFldVal)){ 01312 TStr Str=HttpResp->GetBodyAsStr(); 01313 int StrLen=Str.Len(); int ChN=0; int PrintChs=0; 01314 while ((ChN<100)&&(ChN<StrLen)){ 01315 char Ch=Str[ChN++]; 01316 if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){ 01317 PrintChs++;} 01318 } 01319 double PrintPrb=double(PrintChs)/double(ChN+1); 01320 return PrintPrb>0.9; 01321 } else { 01322 return false; 01323 } 01324 } 01325