SNAP Library 2.2, Developer Reference  2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
html.cpp
Go to the documentation of this file.
00001 
00002 // Html-Lexical-Chars
00003 void THtmlLxChDef::SetUcCh(const char& UcCh, const char& LcCh){
00004   // update upper-case (more lower cases may have one upper case)
00005   IAssert(
00006    (UcChV[LcCh-TCh::Mn]==TCh(0))||
00007    (UcChV[LcCh-TCh::Mn]==TCh(LcCh)));
00008   UcChV[LcCh-TCh::Mn]=TCh(UcCh);
00009   // update lower-case (one upper case may have only one lower case)
00010   if ((LcChV[UcCh-TCh::Mn]==TCh(0))||(LcChV[UcCh-TCh::Mn]==TCh(UcCh))){
00011     LcChV[UcCh-TCh::Mn]=TCh(LcCh);
00012   }
00013 }
00014 
00015 void THtmlLxChDef::SetUcCh(const TStr& Str){
00016   // set type of characters as letters
00017   SetChTy(hlctAlpha, Str);
00018   // first char in string is upper-case, rest are lower-case
00019   for (int ChN=1; ChN<Str.Len(); ChN++){
00020     SetUcCh(Str[0], Str[ChN]);
00021   }
00022 }
00023 
00024 void THtmlLxChDef::SetChTy(const THtmlLxChTy& ChTy, const TStr& Str){
00025   for (int ChN=0; ChN<Str.Len(); ChN++){
00026     ChTyV[Str[ChN]-TCh::Mn]=TInt(ChTy);}
00027 }
00028 
00029 void THtmlLxChDef::SetEscStr(const TStr& SrcStr, const TStr& DstStr){
00030   EscStrH.AddDat(SrcStr, DstStr);
00031 }
00032 
00033 TStr THtmlLxChDef::GetEscStr(const TStr& Str) const {
00034   int EscStrId;
00035   if ((EscStrId=EscStrH.GetKeyId(Str))!=-1){
00036     return EscStrH[EscStrId];
00037   } else
00038   if ((Str.Len()>=2)&&(Str[0]=='&')&&(Str[1]=='#')){
00039     int ChCd=0;
00040     for (int ChN=2; ChN<Str.Len(); ChN++){
00041       if (ChCd<=0xFFFF){ChCd=ChCd*10+Str[ChN]-'0';}}
00042     return TStr((char)ChCd);
00043   } else {
00044     return TStr(' ');
00045   }
00046 }
00047 
00048 THtmlLxChDef::THtmlLxChDef():
00049   ChTyV(TCh::Vals), UcChV(TCh::Vals), LcChV(TCh::Vals), EscStrH(100){
00050 
00051   // Character-Types
00052   ChTyV.PutAll(TInt(hlctSpace));
00053   SetChTy(hlctAlpha, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
00054   SetChTy(hlctAlpha, "abcdefghijklmnopqrstuvwxyz");
00055   SetChTy(hlctAlpha, "@_");
00056   SetChTy(hlctNum, "0123456789");
00057   SetChTy(hlctSym, "`~!#$%^&*()-=+[{]}\\|;:'\",<.>/?");
00058   SetChTy(hlctLTag, "<"); SetChTy(hlctRTag, ">");
00059   SetChTy(hlctEof, TStr(TCh::EofCh));
00060   for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
00061     if ((Ch<0)||(127<Ch)){SetChTy(hlctAlpha, TStr(TCh(char(Ch))));}}
00062   //SetChTy(hlctSpace, TStr(TCh(char(160))));
00063 
00064   // Upper-Case
00065   {for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
00066     SetUcCh(char(Ch), char(Ch));}}
00067   SetUcCh("Aa"); SetUcCh("\xc0\xe0"); SetUcCh("\xc1\xe1"); SetUcCh("\xc2\xe2");
00068   SetUcCh("\xc3\xe3"); SetUcCh("\xc4\xe4"); SetUcCh("\xc5\xe5"); SetUcCh("\xc6\xe6");
00069   SetUcCh("Bb"); SetUcCh("Cc"); SetUcCh("\xc7\xe7"); SetUcCh("Dd");
00070   SetUcCh("\xd0\xf0"); SetUcCh("Ee"); SetUcCh("\xc8\xe8"); SetUcCh("\xc9\xe9");
00071   SetUcCh("\xca\xea"); SetUcCh("\xcb\xeb"); SetUcCh("Ff"); SetUcCh("Gg");
00072   SetUcCh("Hh"); SetUcCh("Ii"); SetUcCh("\xcc\xec"); SetUcCh("\xcd\xed");
00073   SetUcCh("\xce\xee"); SetUcCh("\xcf\xef"); SetUcCh("Jj"); SetUcCh("Kk");
00074   SetUcCh("Ll"); SetUcCh("Mm"); SetUcCh("Nn"); SetUcCh("\xd1\xf1");
00075   SetUcCh("Oo"); SetUcCh("\xd2\xf2"); SetUcCh("\xd3\xf3"); SetUcCh("\xd4\xf4");
00076   SetUcCh("\xd5\xf5"); SetUcCh("\xd6\xf6"); SetUcCh("\xd8\xf8"); SetUcCh("Pp");
00077   SetUcCh("Qq"); SetUcCh("Rr"); SetUcCh("Ss"); SetUcCh("\x8a\x9a");
00078   SetUcCh("Tt"); SetUcCh("Uu"); SetUcCh("\xd9\xf9"); SetUcCh("\xda\xfa");
00079   SetUcCh("\xdb\xfb"); SetUcCh("\xdc\xfc"); SetUcCh("Vv"); SetUcCh("Ww");
00080   SetUcCh("Xx"); SetUcCh("Yy\xff"); SetUcCh("\xdd\xfd"); SetUcCh("Zz");
00081   SetUcCh("\x8e\x9e");
00082   // ISO-CE
00083   //SetUcCh(uchar(169), uchar(185)); /*Sh - \xa9\xb9*/
00084   //SetUcCh(uchar(174), uchar(190)); /*Zh - \xae\xbe*/
00085   //SetUcCh(uchar(200), uchar(232)); /*Ch - \xc8\xe8*/
00086   //SetUcCh(uchar(198), uchar(230)); /*Cs - \xc6\xe6*/
00087   //SetUcCh(uchar(208), uchar(240)); /*Dz - \xd0\xf0*/
00088 
00089   // Annoying Unicode-characters
00090   //SetChTy(hlctSpace, "\xc2\xef");
00091 
00092   // Escape-Sequences
00093   SetEscStr("&quot", "\""); SetEscStr("&amp", "&");
00094   SetEscStr("&lt", "<"); SetEscStr("&gt", ">");
00095   SetEscStr("&nbsp", " ");
00096 
00097   SetEscStr("&auml", "\xe4"); SetEscStr("&Auml", "\xc4");
00098   SetEscStr("&ouml", "\xf6"); SetEscStr("&Ouml", "\xd6");
00099   SetEscStr("&uuml", "\xfc"); SetEscStr("&Uuml", "\xdc");
00100   SetEscStr("&aring", "\xe5"); SetEscStr("&Aring", "\xc5");
00101   SetEscStr("&oslash", "\xf8"); SetEscStr("&Oslash", "\xd8");
00102   SetEscStr("&Aelig", "\xc6"); SetEscStr("&aelig", "\xe6");
00103 
00104   SetEscStr("&eacute", "e"); SetEscStr("&Eacute", "E");
00105   SetEscStr("&egrave", "e"); SetEscStr("&Egrave", "E");
00106   SetEscStr("&agrave", "a"); SetEscStr("&Agrave", "A");
00107 }
00108 
// Definition of the static member THtmlLxChDef::ChDef: a smart pointer that is
// initialized with a newly allocated, default-constructed THtmlLxChDef.
00109 PHtmlLxChDef THtmlLxChDef::ChDef=PHtmlLxChDef(new THtmlLxChDef());
00110 
00111 TStr THtmlLxChDef::GetCSZFromYuascii(const TChA& ChA){
00112   TChA DstChA;
00113   for (int ChN=0; ChN<ChA.Len(); ChN++){
00114     char Ch=ChA[ChN];
00115     switch (Ch){
00116       case '~': DstChA+='c'; break;
00117       case '^': DstChA+='C'; break;
00118       case '}': DstChA+='c'; break;
00119       case ']': DstChA+='C'; break;
00120       case '|': DstChA+='d'; break;
00121       case '\\': DstChA+='D'; break;
00122       case '{': DstChA+='s'; break;
00123       case '[': DstChA+='S'; break;
00124       case '`': DstChA+='z'; break;
00125       case '@': DstChA+='Z'; break;
00126       default: DstChA+=Ch;
00127     }
00128   }
00129   return DstChA;
00130 }
00131 
00132 TStr THtmlLxChDef::GetCSZFromWin1250(const TChA& ChA){
00133   TChA DstChA;
00134   for (int ChN=0; ChN<ChA.Len(); ChN++){
00135     const uchar Ch=ChA[ChN];
00136     switch (Ch){
00137       case 232: DstChA+='c'; break;
00138       case 200: DstChA+='C'; break;
00139       case 154: DstChA+='s'; break;
00140       case 138: DstChA+='S'; break;
00141       case 158: DstChA+='z'; break;
00142       case 142: DstChA+='Z'; break;
00143       default: DstChA+=Ch;
00144     }
00145   }
00146   return DstChA;
00147 }
00148 
00149 TStr THtmlLxChDef::GetWin1250FromYuascii(const TChA& ChA){
00150   TChA DstChA;
00151   for (int ChN=0; ChN<ChA.Len(); ChN++){
00152     char Ch=ChA[ChN];
00153     switch (Ch){
00154       case '~': DstChA+=uchar(232); break;
00155       case '^': DstChA+=uchar(200); break;
00156       case '}': DstChA+='c'; break;
00157       case ']': DstChA+='C'; break;
00158       case '|': DstChA+='d'; break;
00159       case '\\': DstChA+='D'; break;
00160       case '{': DstChA+=uchar(154); break;
00161       case '[': DstChA+=uchar(138); break;
00162       case '`': DstChA+=uchar(158); break;
00163       case '@': DstChA+=uchar(142); break;
00164       default: DstChA+=Ch;
00165     }
00166   }
00167   return DstChA;
00168 }
00169 
00170 TStr THtmlLxChDef::GetIsoCeFromYuascii(const TChA& ChA){
00171   TChA DstChA;
00172   for (int ChN=0; ChN<ChA.Len(); ChN++){
00173     char Ch=ChA[ChN];
00174     switch (Ch){
00175       case '~': DstChA+=uchar(232); break;
00176       case '^': DstChA+=uchar(200); break;
00177       case '}': DstChA+=uchar(230); break;
00178       case ']': DstChA+=uchar(198); break;
00179       case '|': DstChA+=uchar(240); break;
00180       case '\\': DstChA+=uchar(208); break;
00181       case '{': DstChA+=uchar(185); break;
00182       case '[': DstChA+=uchar(169); break;
00183       case '`': DstChA+=uchar(190); break;
00184       case '@': DstChA+=uchar(174); break;
00185       default: DstChA+=Ch;
00186     }
00187   }
00188   return DstChA;
00189 }
00190 
00192 // Html-Lexical
// Definition of the static member THtmlLx::ChDef (default-constructed); the
// lexer methods in this file query it for character classes, upper-casing and
// escape-sequence replacements.
00193 THtmlLxChDef THtmlLx::ChDef;
00194 
// Advances to the next input character via GetCh() and handles a leading '&'.
// EscCh is set to whether the character just read is '&'. When it is, a
// candidate escape sequence is collected in EscChA:
//  - "&#" followed by decimal digits (a following ';' is consumed if present);
//  - "&" followed by a letter and further letters/digits, which must be
//    terminated by ';' to be treated as an escape sequence.
// A recognized sequence is resolved with ChDef.GetEscStr() and the result is
// passed to PutStr(); otherwise the collected characters are handed back
// through PutCh()/PutStr().
// NOTE(review): GetCh/PutCh/PutStr are defined outside this section; PutCh and
// PutStr presumably push characters back so that they are read next - confirm
// against the class declaration.
00195 void THtmlLx::GetEscCh(){
00196   GetCh();
00197   EscCh=(Ch=='&');
00198   if (EscCh){
00199     EscChA.Clr(); EscChA.AddCh(Ch); GetCh(); // start collecting with the '&'
00200     if (Ch=='#'){ // numeric reference candidate
00201       EscChA.AddCh(Ch); GetCh();
00202       if (('0'<=Ch)&&(Ch<='9')){
00203         do {EscChA.AddCh(Ch); GetCh();} while (('0'<=Ch)&&(Ch<='9'));
00204         if (Ch==';'){GetCh();} // the terminating ';' is optional here
00205         PutStr(ChDef.GetEscStr(EscChA));
00206       } else {
00207         PutCh('#'); PutCh('&'); // "&#" without digits: hand both characters back
00208       }
00209     } else
00210     if ((('a'<=Ch)&&(Ch<='z'))||(('A'<=Ch)&&(Ch<='Z'))){ // named sequence candidate
00211       do {
00212         EscChA.AddCh(Ch); GetCh();
00213       } while ((('A'<=Ch)&&(Ch<='Z'))||(('a'<=Ch)&&(Ch<='z'))||(('0'<=Ch)&&(Ch<='9')));
00214       if (Ch==';'){
00215         GetCh(); PutStr(ChDef.GetEscStr(EscChA));
00216       } else {
00217         PutStr(EscChA); // no ';' terminator: hand the collected text back unchanged
00218       }      
00219     } else {
00220       PutCh('&'); // a lone '&'
00221     }
00222   }
00223 }
00224 
00225 void THtmlLx::GetMetaTag(){
00226   Sym=hsyMTag;
00227   if (Ch=='-'){
00228     char PCh=' ';
00229     while ((Ch!=TCh::EofCh) && ((PCh!='-')||(Ch!='>'))){PCh=Ch; GetCh();}
00230   } else {
00231     while ((Ch!=TCh::EofCh) && (Ch!='>')){GetCh();}
00232   }
00233   if (Ch!=TCh::EofCh){GetEscCh();}
00234 }
00235 
// Parses a tag; GetSym() calls this after reading past the opening '<'.
// Sets Sym to hsyETag when the current character is '/', otherwise to hsyBTag.
// The tag name is upper-cased via ChDef.GetUc() and stored as "<NAME>" in
// UcChA, which is then copied to ChA (so both hold the upper-cased form).
// When DoParseArg is set, NAME=VALUE attributes are appended to ArgNmValV with
// upper-cased names; attributes without '=' are not recorded. Otherwise the
// input is skipped up to '>'. Unless end of input was reached, the next
// character is read with GetEscCh().
00236 void THtmlLx::GetTag(){
00237   if (Ch=='/'){Sym=hsyETag; GetCh();} else {Sym=hsyBTag;}
00238   UcChA.AddCh('<');
00239   while (ChDef.IsAlNum(Ch)||(Ch==':')){ // tag name: alphanumerics and ':'
00240     UcChA.AddCh(ChDef.GetUc(Ch)); GetCh();}
00241   UcChA.AddCh('>');
00242   ChA=UcChA;
00243 
00244   if (DoParseArg){
00245     while ((Ch!='>')&&(Ch!=TCh::EofCh)){
00246       while ((!ChDef.IsAlpha(Ch))&&(Ch!='>')&&(Ch!=TCh::EofCh)){GetCh();} // skip to the next attribute name
00247       if (ChDef.IsAlpha(Ch)){
00248         ArgNm.Clr(); ArgVal.Clr();
00249         while (ChDef.IsAlNum(Ch)||(Ch=='-')){ArgNm.AddCh(ChDef.GetUc(Ch)); GetCh();} // name: alphanumerics and '-'
00250         while (ChDef.IsWs(Ch)){GetCh();}
00251         if (Ch=='='){
00252           GetCh(); while (ChDef.IsWs(Ch)){GetCh();}
00253           if (Ch=='"'){ // double-quoted value
00254             GetCh();
00255             while ((Ch!=TCh::EofCh)&&(Ch!='"')&&(Ch!='>')){ // a '>' ends the value even inside the quotes
00256               if (!ChDef.IsEoln(Ch)){ArgVal.AddCh(Ch);} GetCh();} // characters for which IsEoln() holds are dropped
00257             if (Ch=='"'){GetCh();}
00258           } else if (Ch=='\''){ // single-quoted value, same rules as above
00259             GetCh();
00260             while ((Ch!=TCh::EofCh)&&(Ch!='\'')&&(Ch!='>')){
00261               if (!ChDef.IsEoln(Ch)){ArgVal.AddCh(Ch);} GetCh();}
00262             if (Ch=='\''){GetCh();}
00263           } else { // unquoted value: runs up to white-space, '>' or end of input
00264             while ((!ChDef.IsWs(Ch))&&(Ch!='>')&&(Ch!=TCh::EofCh)){
00265               ArgVal.AddCh(Ch); GetCh();}
00266           }
00267           ArgNmValV.Add(TStrKd(ArgNm, ArgVal));
00268         }
00269       }
00270     }
00271   } else {
00272     while ((Ch!='>')&&(Ch!=TCh::EofCh)){GetCh();}
00273   }
00274   if (Ch!=TCh::EofCh){GetEscCh();}
00275 }
00276 
// Reads the next lexical symbol, stores it in Sym and returns it.
// Clears the per-symbol state (ChA, UcChA, PreSpaces, PreSpaceChA, ArgNmValV),
// skips characters for which ChDef.IsSpace() holds while recording them as
// leading spaces, and then dispatches on the type of the current character:
//  - hlctAlpha: hsyStr, a run of alphanumerics that may contain inner '.';
//  - hlctNum:   hsyNum, switched to hsyStr if a letter follows the digits;
//  - hlctSym:   hsySSym, or hsyStr for '.' directly followed by alphanumerics;
//  - hlctLTag:  a tag (GetTag) or "<!" construct (GetMetaTag), unless the '<'
//               came from an escape sequence (EscCh), in which case hsySSym;
//  - hlctRTag:  hsySSym;
//  - hlctEof:   hsyEof.
// ChA receives the symbol text and UcChA its upper-cased form (via
// ChDef.GetUc). SymBChX/SymEChX are set from ChX at the start and end of the
// symbol.
00277 THtmlLxSym THtmlLx::GetSym(){
00278   // prepare symbol descriptions
00279   ChA.Clr(); UcChA.Clr();
00280   PreSpaces=0; PreSpaceChA.Clr();
00281   ArgNmValV.Clr();
00282   // skip white-space
00283   while (ChDef.IsSpace(Ch)){
00284     if (ChX>0){PreSpaceChA+=Ch; PreSpaces++;} GetEscCh();}
00285   // parse symbol
00286   SymChA.Clr(); SymChA+=Ch; SymBChX=ChX;
00287   switch (ChDef.GetChTy(Ch)){
00288     case hlctAlpha:
00289       Sym=hsyStr;
00290       forever{
00291         do {
00292           ChA.AddCh(Ch); UcChA.AddCh(ChDef.GetUc(Ch)); GetEscCh();
00293         } while (ChDef.IsAlNum(Ch));
00294         if (Ch=='.'){
00295           GetCh(); // look at the character after the '.'
00296           if (ChDef.IsAlNum(Ch)){ChA.AddCh('.'); UcChA.AddCh('.');} // inner '.' stays part of the symbol
00297           else {PutCh(Ch); Ch='.'; break;} // trailing '.': hand the look-ahead back and stop at the '.'
00298         } else {break;}
00299       }
00300       break;
00301     case hlctNum:
00302       Sym=hsyNum;
00303       forever{
00304         do {
00305           ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh();
00306         } while (ChDef.IsNum(Ch));
00307         if (Ch=='.'){
00308           GetCh(); // look at the character after the '.'
00309           if (ChDef.IsAlNum(Ch)){ChA.AddCh('.'); UcChA.AddCh('.');}
00310           else {PutCh(Ch); Ch='.'; break;}
00311         } else if (ChDef.IsAlpha(Ch)){
00312           Sym=hsyStr; // a letter after the digits turns the symbol into a string; the loop continues
00313         } else {
00314           break;
00315         }
00316       }
00317       break;
00318     case hlctSym:
00319       Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh();
00320       if ((ChA.LastCh()=='.')&&(ChDef.IsAlNum(Ch))){ // '.' directly followed by alphanumerics becomes a string
00321         Sym=hsyStr;
00322         do {
00323           ChA.AddCh(Ch); UcChA.AddCh(ChDef.GetUc(Ch)); GetEscCh();
00324         } while (ChDef.IsAlNum(Ch));
00325       }
00326       break;
00327     case hlctLTag:
00328       if (EscCh){ // '<' produced by an escape sequence is a plain symbol
00329         Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh();
00330       } else {
00331         GetCh();
00332         if (Ch=='!'){GetCh(); GetMetaTag();} else {GetTag();}
00333       }
00334       break;
00335     case hlctRTag:
00336       if (EscCh){ // both branches perform the same statements
00337         Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh();
00338       } else {
00339         Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch);  GetEscCh();
00340       }
00341       break;
00342     case hlctEof: Sym=hsyEof; break;
00343     default: Sym=hsyUndef; GetEscCh();
00344   }
00345   // set symbol last-character-position
00346   SymEChX=ChX-1;
00347   // delete last character
00348   if (!SymChA.Empty()){SymChA.Pop();}
00349   // return symbol
00350   return Sym;
00351 }
00352 
00353 PHtmlTok THtmlLx::GetTok(const bool& DoUc){
00354   if (DoUc){return PHtmlTok(new THtmlTok(Sym, UcChA, ArgNmValV));}
00355   else {return PHtmlTok(new THtmlTok(Sym, ChA, ArgNmValV));}
00356 }
00357 
00358 TStr THtmlLx::GetFullBTagStr() const {
00359   IAssert(Sym==hsyBTag);
00360   TChA BTagChA;
00361   BTagChA+=ChA; BTagChA.Pop();
00362   for (int ArgN=0; ArgN<GetArgs(); ArgN++){
00363     BTagChA+=' '; BTagChA+=GetArgNm(ArgN);
00364     BTagChA+='='; BTagChA+='"'; BTagChA+=GetArgVal(ArgN); BTagChA+='"';
00365   }
00366   BTagChA+='>';
00367   return BTagChA;
00368 }
00369 
00370 void THtmlLx::MoveToStrOrEof(const TStr& Str){
00371   do {
00372     GetSym();
00373   } while ((Sym!=hsyEof)&&((Sym!=hsyStr)||(ChA!=Str)));
00374 }
00375 
00376 void THtmlLx::MoveToBTagOrEof(const TStr& TagNm){
00377   do {
00378     GetSym();
00379   } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||(UcChA!=TagNm)));
00380 }
00381 
00382 void THtmlLx::MoveToBTag2OrEof(const TStr& TagNm1, const TStr& TagNm2){
00383   do {
00384     GetSym();
00385   } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||((UcChA!=TagNm1)&&(UcChA!=TagNm2))));
00386 }
00387 
00388 void THtmlLx::MoveToBTag3OrEof(const TStr& TagNm1, const TStr& TagNm2, const TStr& TagNm3){
00389   do {
00390     GetSym();
00391   } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||((UcChA!=TagNm1)&&(UcChA!=TagNm2)&&(UcChA!=TagNm3))));
00392 }
00393 
00394 void THtmlLx::MoveToBTagOrETagOrEof(const TStr& BTagNm, const TStr& ETagNm){
00395   do {
00396     GetSym();
00397   } while ((Sym!=hsyEof) && ((Sym!=hsyBTag)||(UcChA!=BTagNm)) && ((Sym!=hsyETag) || (UcChA!=ETagNm)));
00398 }
00399 
00400 void THtmlLx::MoveToBTagArgOrEof(
00401  const TStr& TagNm, const TStr& ArgNm, const TStr& ArgVal){
00402   forever {
00403     GetSym();
00404     if (Sym==hsyEof){break;}
00405     if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
00406      (IsArg(ArgNm))&&(GetArg(ArgNm)==ArgVal)){break;}
00407   }
00408 }
00409 
00410 void THtmlLx::MoveToBTagArg2OrEof(const TStr& TagNm,
00411  const TStr& ArgNm1, const TStr& ArgVal1,
00412  const TStr& ArgNm2, const TStr& ArgVal2, const bool& AndOpP){
00413   forever {
00414     GetSym();
00415     if (Sym==hsyEof){break;}
00416     if (AndOpP){
00417       if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
00418        (IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1)&&
00419        (IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)){break;}
00420     } else {
00421       if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
00422        (((IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1))||
00423         ((IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)))){break;}
00424     }
00425   }
00426 }
00427 
00428 void THtmlLx::MoveToBTagOrEof(
00429  const TStr& TagNm1, const TStr& ArgNm1, const TStr& ArgVal1,
00430  const TStr& TagNm2, const TStr& ArgNm2, const TStr& ArgVal2){
00431   forever {
00432     GetSym();
00433     if (Sym==hsyEof){break;}
00434     if ((Sym==hsyBTag)&&(UcChA==TagNm1)&&
00435      (IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1)){break;}
00436     if ((Sym==hsyBTag)&&(UcChA==TagNm2)&&
00437      (IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)){break;}
00438   }
00439 }
00440 
00441 void THtmlLx::MoveToETagOrEof(const TStr& TagNm){
00442   do {
00443     GetSym();
00444   } while ((Sym!=hsyEof)&&((Sym!=hsyETag)||(UcChA!=TagNm)));
00445 }
00446 
00447 TStr THtmlLx::GetTextOnlyStrToEof(){
00448   TChA OutChA;
00449   forever {
00450     GetSym();
00451     if (Sym==hsyEof){
00452       break;
00453     } else {
00454       if (PreSpaces>0){OutChA+=' ';}
00455       if ((Sym!=hsyBTag)&&(Sym!=hsyETag)){
00456         OutChA+=ChA;}
00457     }
00458   }
00459   return OutChA;
00460 }
00461 
00462 TStr THtmlLx::GetStrToBTag(const TStr& TagNm, const bool& TxtOnlyP){
00463   TChA OutChA;
00464   forever {
00465     GetSym();
00466     if ((Sym==hsyEof)||((Sym==hsyBTag)&&(UcChA==TagNm))){
00467       break;
00468     } else {
00469       if (PreSpaces>0){OutChA+=' ';}
00470       if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
00471         OutChA+=ChA;}
00472     }
00473   }
00474   return OutChA;
00475 }
00476 
00477 TStr THtmlLx::GetStrToBTag(const TStr& TagNm, const TStr& ArgNm,
00478  const TStr& ArgVal, const bool& TxtOnlyP){
00479   TChA OutChA;
00480   forever {
00481     GetSym();
00482     if ((Sym==hsyEof)||((Sym==hsyBTag)&&(UcChA==TagNm)&&
00483      (IsArg(ArgNm))&&(GetArg(ArgNm)==ArgVal))){
00484       break;
00485     } else {
00486       if (PreSpaces>0){OutChA+=' ';}
00487       if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
00488         OutChA+=ChA;}
00489     }
00490   }
00491   return OutChA;
00492 }
00493 
00494 TStr THtmlLx::GetStrToETag(const TStr& TagNm, const bool& TxtOnlyP){
00495   TChA OutChA;
00496   forever {
00497     GetSym();
00498     if ((Sym==hsyEof)||((Sym==hsyETag)&&(UcChA==TagNm))){
00499       break;
00500     } else {
00501       if (PreSpaces>0){OutChA+=' ';}
00502       if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
00503         OutChA+=ChA;}
00504     }
00505   }
00506   return OutChA;
00507 }
00508 
00509 TStr THtmlLx::GetStrToETag2(const TStr& TagNm1, 
00510  const TStr& TagNm2, const bool& TxtOnlyP){
00511   TChA OutChA;
00512   forever {
00513     GetSym();
00514     if ((Sym==hsyEof)||((Sym==hsyETag)&&(UcChA==TagNm1))||((Sym==hsyETag)&&(UcChA==TagNm2))){
00515       break;
00516     } else {
00517       if (PreSpaces>0){OutChA+=' ';}
00518       if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
00519         OutChA+=ChA;}
00520     }
00521   }
00522   return OutChA;
00523 }
00524 
00525 TStr THtmlLx::GetStrInTag(const TStr& TagNm, const bool& TxtOnlyP){
00526   MoveToBTagOrEof(TagNm);
00527   return GetStrToETag(TagNm, TxtOnlyP);
00528 }
00529 
00530 TStr THtmlLx::GetHRefBeforeStr(const TStr& Str){
00531   TStr HRefStr;
00532   forever {
00533     GetSym();
00534     if (Sym==hsyEof){HRefStr=""; break;}
00535     if ((Sym==hsyBTag)&&(UcChA=="<A>")){HRefStr=GetArg("HREF");}
00536     if ((Sym==hsyStr)&&(ChA==Str)){break;}
00537   }
00538   return HRefStr;
00539 }
00540 
00541 bool THtmlLx::IsGetBTag(const TStr& TagNm){
00542   if (GetSym()==hsyBTag){
00543     return ChA==TagNm;
00544   } else {return false;}
00545 }
00546 
00547 bool THtmlLx::IsGetETag(const TStr& TagNm){
00548   if (GetSym()==hsyETag){
00549     return ChA==TagNm;
00550   } else {return false;}
00551 }
00552 
00553 TStr THtmlLx::GetSymStr(const THtmlLxSym& Sym){
00554   switch (Sym){
00555     case hsyUndef: return "Undef";
00556     case hsyStr: return "Str";
00557     case hsyNum: return "Num";
00558     case hsySSym: return "SSym";
00559     case hsyUrl: return "Url";
00560     case hsyBTag: return "BTag";
00561     case hsyETag: return "ETag";
00562     case hsyMTag: return "MTag";
00563     case hsyEof: return "Eof";
00564     default: Fail; return TStr();
00565   }
00566 }
00567 
00568 TStr THtmlLx::GetEscapedStr(const TChA& ChA){
00569   TChA EscapedChA;
00570   for (int ChN=0; ChN<ChA.Len(); ChN++){
00571     char Ch=ChA[ChN];
00572     switch (Ch){
00573       case '"': EscapedChA+="&quot;"; break;
00574       case '&': EscapedChA+="&amp;"; break;
00575       case '\'': EscapedChA+="&apos;"; break;
00576       case '<': EscapedChA+="&lt;"; break;
00577       case '>': EscapedChA+="&gt;"; break;
00578       default: EscapedChA+=Ch;
00579     }
00580   }
00581   return EscapedChA;
00582 }
00583 
00584 TStr THtmlLx::GetAsciiStr(const TChA& ChA, const char& GenericCh){
00585   TChA AsciiChA;
00586   for (int ChN=0; ChN<ChA.Len(); ChN++){
00587     char Ch=ChA[ChN];
00588     if ((Ch<' ')||('~'<Ch)){
00589       Ch=GenericCh;}
00590     AsciiChA+=Ch;
00591   }
00592   return AsciiChA;
00593 }
00594 
00595 void THtmlLx::GetTokStrV(const TStr& Str, TStrV& TokStrV){
00596   PSIn SIn=TStrIn::New(Str);
00597   THtmlLx Lx(SIn);
00598   Lx.GetSym();
00599   TokStrV.Clr();
00600   while (Lx.Sym!=hsyEof){
00601     TokStrV.Add(Lx.ChA);
00602     Lx.GetSym();
00603   }
00604 }
00605 
00606 TStr THtmlLx::GetNoTag(const TStr& Str) {
00607   PSIn SIn=TStrIn::New(Str);
00608   THtmlLx Lx(SIn);
00609   Lx.GetSym();
00610   TChA ChA;
00611   while (Lx.Sym!=hsyEof){
00612     switch (Lx.Sym){
00613           case hsyUndef: 
00614           case hsyStr: 
00615           case hsyNum: 
00616           case hsySSym:
00617                 if (Lx.PreSpaces > 0) { ChA += ' '; }
00618                 ChA += Lx.ChA;
00619           default: break;
00620         }
00621         Lx.GetSym();
00622   }
00623   return ChA;
00624 }
00625 
00627 // Html-Token
00628 TStr THtmlTok::GetFullStr() const {
00629   if ((Sym==hsyBTag)&&(ArgNmValV.Len()>0)){
00630     TChA FullChA;
00631     FullChA+=Str.GetSubStr(0, Str.Len()-2);
00632     for (int ArgNmValN=0; ArgNmValN<ArgNmValV.Len(); ArgNmValN++){
00633       FullChA+=' '; FullChA+=ArgNmValV[ArgNmValN].Key; FullChA+='=';
00634       FullChA+='"'; FullChA+=ArgNmValV[ArgNmValN].Dat; FullChA+='"';
00635     }
00636     FullChA+='>';
00637     return FullChA;
00638   } else
00639   if (Sym==hsyETag){
00640     TChA FullChA;
00641     FullChA+='<'; FullChA+='/'; FullChA+=Str.GetSubStr(1, Str.Len()-1);
00642     return FullChA;
00643   } else {
00644     return GetStr();
00645   }
00646 }
00647 
00648 bool THtmlTok::IsUrlTok(TStr& RelUrlStr) const {
00649   if (GetSym()==hsyBTag){
00650     TStr TagNm=GetStr();
00651     if ((TagNm==ATagNm)&&(IsArg(HRefArgNm))){
00652       RelUrlStr=GetArg(HRefArgNm); return true;}
00653     else if ((TagNm==AreaTagNm)&&(IsArg(HRefArgNm))){
00654       RelUrlStr=GetArg(HRefArgNm); return true;}
00655     else if ((TagNm==FrameTagNm)&&(IsArg(SrcArgNm))){
00656       RelUrlStr=GetArg(SrcArgNm); return true;}
00657     else if ((TagNm==ImgTagNm)&&(IsArg(SrcArgNm))){
00658       RelUrlStr=GetArg(SrcArgNm); return true;}
00659     else if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){
00660       TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc();
00661       if ((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT")){
00662         TStr ContentStr=GetArg("CONTENT");
00663         TStr LeftStr; TStr RightStr; TStr UrlEqStr="URL=";
00664         ContentStr.GetUc().SplitOnStr(LeftStr, UrlEqStr, RightStr);
00665         RelUrlStr=ContentStr.GetSubStr(
00666          LeftStr.Len()+UrlEqStr.Len(), ContentStr.Len());
00667         return !RelUrlStr.Empty();
00668       } else {
00669         return false;
00670       }
00671     }
00672   }
00673   return false;
00674 }
00675 
00676 bool THtmlTok::IsRedirUrlTok() const {
00677   if (GetSym()==hsyBTag){
00678     TStr TagNm=GetStr();
00679     if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){
00680       TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc();
00681       if ((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT")){
00682         return true;
00683       } else {
00684         return false;
00685       }
00686     }
00687   }
00688   return false;
00689 }
00690 
00691 void THtmlTok::SaveTxt(const PSOut& SOut, const bool& TxtMode){
00692   if (TxtMode){
00693     SOut->PutStr(GetFullStr()); SOut->PutStr(" ");
00694   } else {
00695     SOut->PutStr(THtmlLx::GetSymStr(Sym)); SOut->PutStr(" ");
00696     SOut->PutStr(GetFullStr()); SOut->PutStr(" ");
00697   }
00698 }
00699 
// Tag-name constants, written in the upper-case "<NAME>" form that
// THtmlLx::GetTag() builds for tag symbols.
// NOTE(review): THtmlLx::GetTag() consumes the '/' of an end-tag without
// adding it to the tag text, so "</TITLE>" does not match the lexer's text for
// a </title> end-tag - confirm how TitleETagNm is meant to be used.
00700 const TStr THtmlTok::ATagNm="<A>";
00701 const TStr THtmlTok::AreaTagNm="<AREA>";
00702 const TStr THtmlTok::BrTagNm="<BR>";
00703 const TStr THtmlTok::CardTagNm="<CARD>";
00704 const TStr THtmlTok::CenterTagNm="<CENTER>";
00705 const TStr THtmlTok::FrameTagNm="<FRAME>";
00706 const TStr THtmlTok::H1TagNm="<H1>";
00707 const TStr THtmlTok::H2TagNm="<H2>";
00708 const TStr THtmlTok::H3TagNm="<H3>";
00709 const TStr THtmlTok::H4TagNm="<H4>";
00710 const TStr THtmlTok::H5TagNm="<H5>";
00711 const TStr THtmlTok::H6TagNm="<H6>";
00712 const TStr THtmlTok::ImgTagNm="<IMG>";
00713 const TStr THtmlTok::LiTagNm="<LI>";
00714 const TStr THtmlTok::MetaTagNm="<META>";
00715 const TStr THtmlTok::PTagNm="<P>";
00716 const TStr THtmlTok::UlTagNm="<UL>";
00717 const TStr THtmlTok::TitleTagNm="<TITLE>";
00718 const TStr THtmlTok::TitleETagNm="</TITLE>";
00719 
// Attribute-name constants, upper-case like the attribute names stored by
// THtmlLx::GetTag().
00720 const TStr THtmlTok::AltArgNm="ALT";
00721 const TStr THtmlTok::HRefArgNm="HREF";
00722 const TStr THtmlTok::SrcArgNm="SRC";
00723 const TStr THtmlTok::TitleArgNm="TITLE";
00724 const TStr THtmlTok::HttpEquivArgNm="HTTP-EQUIV";
00725 
00726 bool THtmlTok::IsBreakTag(const TStr& TagNm){
00727   static TStrH BreakTagNmH(50);
00728   if (BreakTagNmH.Len()==0){
00729     BreakTagNmH.AddKey(TStr("<H1>")); BreakTagNmH.AddKey(TStr("<H2>"));
00730     BreakTagNmH.AddKey(TStr("<H3>")); BreakTagNmH.AddKey(TStr("<H4>"));
00731     BreakTagNmH.AddKey(TStr("<H5>")); BreakTagNmH.AddKey(TStr("<H6>"));
00732     BreakTagNmH.AddKey(TStr("<BR>")); BreakTagNmH.AddKey(TStr("<HR>"));
00733     BreakTagNmH.AddKey(TStr("<P>")); BreakTagNmH.AddKey(TStr("<DL>"));
00734     BreakTagNmH.AddKey(TStr("<UL>")); BreakTagNmH.AddKey(TStr("<OL>"));
00735     BreakTagNmH.AddKey(TStr("<LI>")); BreakTagNmH.AddKey(TStr("<DT>"));
00736     BreakTagNmH.AddKey(TStr("<DD>")); BreakTagNmH.AddKey(TStr("<HEAD>"));
00737     BreakTagNmH.AddKey(TStr("<TITLE>")); BreakTagNmH.AddKey(TStr("<META>"));
00738     BreakTagNmH.AddKey(TStr("<SCRIPT>"));
00739     BreakTagNmH.AddKey(TStr("<HEAD>")); BreakTagNmH.AddKey(TStr("<BODY>"));
00740   }
00741   return BreakTagNmH.IsKey(TagNm);
00742 }
00743 
00744 bool THtmlTok::IsBreakTok(const PHtmlTok& Tok){
00745   if ((Tok->GetSym()==hsyBTag)||(Tok->GetSym()==hsyETag)){
00746     return IsBreakTag(Tok->GetStr());
00747   } else {
00748     return false;
00749   }
00750 }
00751 
00752 bool THtmlTok::IsHTag(const TStr& TagNm, int& HTagN){
00753   if ((TagNm.Len()==4)&&(TagNm[0]=='<')&&(TagNm[1]=='H')&&(TagNm[3]=='>')){
00754     char Ch=TagNm[2];
00755     if (('1'<=Ch)&&(Ch<='6')){HTagN=Ch-'0'; return true;}
00756     else {HTagN=-1; return false;}
00757   } else {
00758     HTagN=-1; return false;
00759   }
00760 }
00761 
00762 PHtmlTok THtmlTok::GetHTok(const bool& IsBTag, const int& HTagN){
00763   THtmlLxSym HTagSym=IsBTag?hsyBTag:hsyETag;
00764   TStr HTagNm;
00765   switch (HTagN){
00766     case 1: HTagNm=H1TagNm; break;
00767     case 2: HTagNm=H2TagNm; break;
00768     case 3: HTagNm=H3TagNm; break;
00769     case 4: HTagNm=H4TagNm; break;
00770     case 5: HTagNm=H5TagNm; break;
00771     case 6: HTagNm=H6TagNm; break;
00772     default: Fail;
00773   }
00774   return PHtmlTok(new THtmlTok(HTagSym, HTagNm));
00775 }
00776 
00778 // Html-Document
00779 THtmlDoc::THtmlDoc(const PSIn& SIn, const THtmlDocType& Type, const bool& DoUc):
00780   TokV(1000, 0){
00781   THtmlLx Lx(SIn);
00782   bool MkTok=false; bool InUL=false;
00783   while (Lx.GetSym()!=hsyEof){
00784     switch (Type){
00785       case hdtAll: MkTok=true; break;
00786       case hdtStr: MkTok=(Lx.Sym==hsyStr); break;
00787       case hdtStrNum: MkTok=(Lx.Sym==hsyStr)||(Lx.Sym==hsyNum); break;
00788       case hdtTag: MkTok=(Lx.Sym==hsyBTag)||(Lx.Sym==hsyETag); break;
00789       case hdtA: MkTok=(Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::ATagNm); break;
00790       case hdtHRef:
00791         MkTok=(Lx.Sym==hsyBTag)&&
00792          ((Lx.UcChA==THtmlTok::ATagNm)||(Lx.UcChA==THtmlTok::AreaTagNm)||
00793          (Lx.UcChA==THtmlTok::FrameTagNm)||(Lx.UcChA==THtmlTok::ImgTagNm)||
00794          (Lx.UcChA==THtmlTok::MetaTagNm));
00795         break;
00796       case hdtUL:
00797         if ((Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=true;}
00798         MkTok=InUL;
00799         if ((Lx.Sym==hsyETag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=false;}
00800         break;
00801       default: Fail;
00802     }
00803     if (MkTok){TokV.Add(Lx.GetTok(DoUc));}
00804   }
00805   TokV.Add(PHtmlTok(new THtmlTok(hsyEof)));
00806 }
00807 
00808 TStr THtmlDoc::GetTxtLnDoc(const TStr& HtmlStr){
00809   TChA LnDocChA;
00810   // prepare html parsing
00811   PSIn HtmlSIn=TStrIn::New(HtmlStr);
00812   THtmlLx HtmlLx(HtmlSIn);
00813   bool InScript=false;
00814   // save text
00815   while (HtmlLx.GetSym()!=hsyEof){
00816     TStr Str=HtmlLx.ChA;
00817     switch (HtmlLx.Sym){
00818       case hsyStr:
00819       case hsyNum:
00820       case hsySSym:
00821         if (InScript){break;}
00822         if (HtmlLx.PreSpaces>0){LnDocChA+=' ';}
00823         LnDocChA+=Str.CStr();
00824         break;
00825       case hsyBTag:
00826         if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';}
00827         if ((!InScript)&&(Str=="<SCRIPT>")){InScript=true;}
00828         break;
00829       case hsyETag:
00830         if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';}
00831         if ((InScript)&&(Str=="<SCRIPT>")){InScript=false;}
00832         break;
00833       default: break;
00834     }
00835   }
00836   // return result
00837   return LnDocChA;
00838 }
00839 
00840 TStr THtmlDoc::GetTxtLnDoc(const TStr& HtmlStr, 
00841  const TStr& BaseUrlStr, const bool& OutUrlP, const bool& OutTagsP){
00842   // prepare output-string
00843   TChA OutChA; OutChA+=' ';
00844   // prepare html parsing
00845   PSIn HtmlSIn=TStrIn::New(HtmlStr);
00846   THtmlLx HtmlLx(HtmlSIn);
00847   bool InScript=false;
00848   // save text
00849   while (HtmlLx.GetSym()!=hsyEof){
00850     TStr Str=HtmlLx.ChA;
00851     switch (HtmlLx.Sym){
00852       case hsyUndef:
00853       case hsyUrl:
00854       case hsyMTag:
00855         break;
00856       case hsyStr:
00857       case hsyNum:
00858       case hsySSym:
00859         if (InScript){break;}
00860         if (HtmlLx.PreSpaces>0){if (OutChA.LastCh()!=' '){OutChA+=' ';}}
00861         OutChA+=Str;
00862         break;
00863       case hsyBTag:
00864         // extract tag name
00865         Str=Str.GetSubStr(1, Str.Len()-2);
00866         // process tag
00867         if (!InScript){
00868           // check script tag
00869           if (Str=="SCRIPT"){
00870             InScript=true; break;}
00871           // output tag
00872           if (OutTagsP){
00873             OutChA+='<'; OutChA+=Str; OutChA+='>';
00874           } else {
00875             if (OutChA.LastCh()!=' '){OutChA+=' ';}
00876           }
00877           // check if URL present
00878           PHtmlTok Tok=HtmlLx.GetTok();
00879           TStr RelUrlStr;
00880           if (Tok->IsUrlTok(RelUrlStr)){
00881             PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
00882             if (Url->IsOk()){
00883               if (OutUrlP){
00884                 TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
00885                 OutChA+="<Url>"; OutChA+=XmlUrlStr; OutChA+="</Url>";
00886               }
00887             }
00888           }
00889         }
00890         break;
00891       case hsyETag:
00892         // extract tag name
00893         Str=Str.GetSubStr(1, Str.Len()-2);
00894         // process tag
00895         if (InScript){
00896           if (Str=="SCRIPT"){
00897             InScript=false; break;}
00898         } else {
00899           if (OutTagsP){
00900             OutChA+="</"; OutChA+=Str; OutChA+='>';
00901           } else {
00902             if (OutChA.LastCh()!=' '){OutChA+=' ';}
00903           }
00904         }
00905         break;
00906       case hsyEof: break;
00907       default: Fail;
00908     }
00909   }
00910   // return string
00911   return OutChA;
00912 }
00913 
00914 
00915 void THtmlDoc::SaveTxt(const PSOut& SOut, const bool& TxtMode) const {
00916   if (TxtMode){
00917     for (int TokN=0; TokN<TokV.Len(); TokN++){TokV[TokN]->SaveTxt(SOut);}
00918     SOut->PutLn();
00919   } else {
00920     for (int TokN=0; TokN<TokV.Len(); TokN++){
00921       SOut->PutStr(TInt::GetStr(TokN)); SOut->PutStr(": ");
00922       TokV[TokN]->SaveTxt(SOut);
00923       SOut->PutLn();
00924     }
00925   }
00926 }
00927 
00928 void THtmlDoc::SaveHtmlToTxt(
00929  const TStr& HtmlStr, const PSOut& TxtSOut, const TStr& BaseUrlStr,
00930  const bool& OutUrlP, const bool& OutTagsP){
00931   // get text-string from html-string
00932   TStr TxtStr=GetTxtLnDoc(HtmlStr, BaseUrlStr, OutUrlP, OutTagsP);
00933   // save text-string
00934   TxtStr.SaveTxt(TxtSOut);
00935 }
00936 
00937 void THtmlDoc::SaveHtmlToTxt(
00938  const TStr& HtmlStr, const TStr& TxtFNm, const TStr& BaseUrlStr,
00939  const bool& OutUrlP, const bool& OutTagsP){
00940   // create output file
00941   PSOut TxtSOut=TFOut::New(TxtFNm);
00942   // save to output file
00943   SaveHtmlToTxt(HtmlStr, TxtSOut, BaseUrlStr, OutUrlP, OutTagsP);
00944 }
00945 
00946 void THtmlDoc::SaveHtmlToXml(
00947  const TStr& HtmlStr, const PSOut& XmlSOut, const TStr& BaseUrlStr,
00948  const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
00949  const bool& OutTagsP, const bool& OutArgsP){
00950   // prepare output-file-id
00951   TFileId fXml=XmlSOut->GetFileId();
00952   // create outgoing url
00953   TStrV OutUrlStrV;
00954   // open top tag
00955   fprintf(fXml, "<HtmlDoc>\n");
00956   // save url
00957   if (!BaseUrlStr.Empty()){
00958     TStr XmlBaseUrlStr=TXmlLx::GetXmlStrFromPlainStr(BaseUrlStr);
00959     fprintf(fXml, "<BaseUrl>%s</BaseUrl>\n", XmlBaseUrlStr.CStr());
00960   }
00961   // prepare html parsing
00962   PSIn HtmlSIn=TStrIn::New(HtmlStr);
00963   THtmlLx HtmlLx(HtmlSIn);
00964   TChA ContTextChA; bool InScript=false;
00965   // save text
00966   fprintf(fXml, "<Body>\n");
00967   while (HtmlLx.GetSym()!=hsyEof){
00968     TStr Str=HtmlLx.ChA;
00969     switch (HtmlLx.Sym){
00970       case hsyUndef:
00971       case hsyUrl:
00972       case hsyMTag:
00973         break;
00974       case hsyStr:
00975         if (InScript){break;}
00976         Str=TXmlLx::GetXmlStrFromPlainStr(Str);
00977         if (OutToksP){
00978           fprintf(fXml, "  <Str>%s</Str>\n", Str.CStr());}
00979         if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
00980         break;
00981       case hsyNum:
00982         if (InScript){break;}
00983         Str=TXmlLx::GetXmlStrFromPlainStr(Str);
00984         if (OutToksP){
00985           fprintf(fXml, "  <Num>%s</Num>\n", Str.CStr());}
00986         if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
00987         break;
00988       case hsySSym:
00989         if (InScript){break;}
00990         Str=TXmlLx::GetXmlStrFromPlainStr(Str);
00991         if (OutToksP){
00992           fprintf(fXml, "  <Sym>%s</Sym>\n", Str.CStr());}
00993         if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
00994         break;
00995       case hsyBTag:{
00996         // save continuos text
00997         if (!ContTextChA.Empty()){
00998           if (OutTextP){
00999             fprintf(fXml, "  <Text>%s</Text>\n", ContTextChA.CStr());}
01000           ContTextChA.Clr();
01001         }
01002         // extract tag name
01003         Str=Str.GetSubStr(1, Str.Len()-2);
01004         Str=TXmlLx::GetXmlStrFromPlainStr(Str);
01005         // process tag
01006         if (!InScript){
01007           // check script tag
01008           if (Str=="SCRIPT"){
01009             InScript=true; break;}
01010           // output tag
01011           if (OutTagsP){
01012             if (OutArgsP){
01013               fprintf(fXml, "  <BTag Nm=\"%s\">\n", Str.CStr());
01014               for (int ArgN=0; ArgN<HtmlLx.GetArgs(); ArgN++){
01015                 TStr ArgNm=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgNm(ArgN));
01016                 TStr ArgVal=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgVal(ArgN));
01017                 fprintf(fXml, "    <Arg Nm=\"%s\" Val=\"%s\"/>", ArgNm.CStr(), ArgVal.CStr());
01018               }
01019               fprintf(fXml, "  </BTag>\n");
01020             } else {
01021               fprintf(fXml, "  <BTag Nm=\"%s\"/>\n", Str.CStr());
01022             }
01023           }
01024           // check if URL present
01025           PHtmlTok Tok=HtmlLx.GetTok();
01026           TStr RelUrlStr;
01027           if (Tok->IsUrlTok(RelUrlStr)){
01028             PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
01029             if (Url->IsOk()){
01030               OutUrlStrV.Add(Url->GetUrlStr());
01031               if (OutUrlP){
01032                 TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
01033                 fprintf(fXml, "  <Url>%s</Url>\n", XmlUrlStr.CStr());
01034               }
01035             }
01036           }
01037         }
01038         break;}
01039       case hsyETag:{
01040         // save continuos text
01041         if (!ContTextChA.Empty()){
01042           if (OutTextP){
01043             fprintf(fXml, "  <Text>%s</Text>\n", ContTextChA.CStr());}
01044           ContTextChA.Clr();
01045         }
01046         // extract tag name
01047         Str=Str.GetSubStr(1, Str.Len()-2);
01048         Str=TXmlLx::GetXmlStrFromPlainStr(Str);
01049         // process tag
01050         if (InScript){
01051           if (Str=="SCRIPT"){
01052             InScript=false; break;}
01053         } else {
01054           if (OutTagsP){
01055             fprintf(fXml, "  <ETag Nm=\"%s\"/>\n", Str.CStr());}
01056         }
01057         break;}
01058       case hsyEof: break;
01059       default: Fail;
01060     }
01061   }
01062   // save continuos text
01063   if (!ContTextChA.Empty()){
01064     if (OutTextP){
01065       fprintf(fXml, "  <Text>%s</Text>\n", ContTextChA.CStr());}
01066     ContTextChA.Clr();
01067   }
01068   fprintf(fXml, "</Body>\n");
01069   // save outgoing urls
01070   fprintf(fXml, "<OutUrls>\n");
01071   for (int UrlN=0; UrlN<OutUrlStrV.Len(); UrlN++){
01072     TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(OutUrlStrV[UrlN]);
01073     fprintf(fXml, "  <Url N=\"%d\">%s</Url>\n", 1+UrlN, XmlUrlStr.CStr());
01074   }
01075   fprintf(fXml, "</OutUrls>\n");
01076 
01077   // close top tag
01078   fprintf(fXml, "</HtmlDoc>\n");
01079 }
01080 
01081 void THtmlDoc::SaveHtmlToXml(
01082  const TStr& HtmlStr, const TStr& XmlFNm, const TStr& BaseUrlStr,
01083  const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
01084  const bool& OutTagsP, const bool& OutArgsP){
01085   // create output file
01086   PSOut XmlSOut=TFOut::New(XmlFNm);
01087   // save to output file
01088   SaveHtmlToXml(HtmlStr, XmlSOut, BaseUrlStr, OutTextP, OutUrlP,
01089    OutToksP, OutTagsP, OutArgsP);
01090 }
01091 
01092 TLxSym THtmlDoc::GetLxSym(const THtmlLxSym& HtmlLxSym, const TChA& ChA){
01093   switch (HtmlLxSym){
01094     case hsyUndef: return syUndef;
01095     case hsyStr: return syStr;
01096     case hsyNum: return syFlt;
01097     case hsySSym: return TLxSymStr::GetSSym(ChA);
01098     case hsyUrl: return syStr;
01099     case hsyBTag: return syStr;
01100     case hsyETag: return syStr;
01101     case hsyEof: return syEof;
01102     default: Fail; return syUndef;
01103   }
01104 }
01105 
01106 bool THtmlDoc::_IsTagRedir(
01107  const TStr& TagStr, const TStr& ArgNm, THtmlLx& Lx,
01108  const TStr& BaseUrlStr, const TStr& RedirUrlStr){
01109   IAssert(Lx.Sym==hsyBTag);
01110   if ((Lx.ChA==TagStr)&&(Lx.IsArg(ArgNm))){
01111     TStr RelUrlStr=Lx.GetArg(ArgNm);
01112     PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
01113     if (Url->IsOk(usHttp)){
01114       TStr UrlStr=Url->GetUrlStr();
01115       PUrlEnv RedirUrlEnv=TUrlEnv::New(RedirUrlStr, "url", UrlStr);
01116       Lx.PutArg(ArgNm, RedirUrlEnv->GetFullUrlStr());
01117       return true;
01118     } else {
01119       return false;
01120     }
01121   } else {
01122     return false;
01123   }
01124 }
01125 
01126 TStr THtmlDoc::GetRedirHtmlDocStr(const TStr& HtmlStr,
01127  const TStr& BaseUrlStr, const TStr& RedirUrlStr){
01128   PSIn SIn=TStrIn::New(HtmlStr);
01129   TMOut SOut;
01130   THtmlLx Lx(SIn);
01131   while (Lx.GetSym()!=hsyEof){
01132     SOut.PutStr(Lx.PreSpaceChA);
01133     if ((Lx.Sym==hsyBTag)&&(
01134      (_IsTagRedir(THtmlTok::ATagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
01135      (_IsTagRedir(THtmlTok::AreaTagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
01136      (_IsTagRedir(THtmlTok::FrameTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr))||
01137      (_IsTagRedir(THtmlTok::ImgTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr)))){
01138       SOut.PutStr(Lx.GetFullBTagStr());
01139     } else {
01140       SOut.PutStr(Lx.SymChA());
01141     }
01142   }
01143   return SOut.GetAsStr();
01144 }
01145 
01147 // Html-Hyper-Link-Document-Vector
01148 THtmlHldV::THtmlHldV(const PHtmlDoc& _RefHtmlDoc, const int& HldWnLen):
01149   RefHtmlDoc(_RefHtmlDoc), HldV(){
01150   bool IsTitleAct=false; THtmlTokV TitleTokV;
01151   bool IsHAct=false; int ActHTagN=-1;
01152   TVec<THtmlTokV> HTokV(6);
01153   PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
01154   for (int TokN=0; TokN<RefHtmlDoc->GetToks(); TokN++){
01155     Tok=RefHtmlDoc->GetTok(TokN, TokSym, TokStr);
01156     if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
01157       // collect tokens before, inside and after <a> ... </a> tags
01158       int ATokN; PHtmlTok ATok; THtmlLxSym ATokSym; TStr ATokStr;
01159       // inside <A> tags
01160       THtmlTokV ATokV; ATokN=TokN;
01161       forever{
01162         ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
01163         if (ATokSym!=hsySSym){ATokV.Add(ATok);}
01164         if ((ATokSym==hsyETag)&&(ATokStr==THtmlTok::ATagNm)){break;}
01165         ATokN++;
01166         if (ATokN>=RefHtmlDoc->GetToks()){break;}
01167       }
01168       int ETagATokN=ATokN+1;
01169       // before <A> tags
01170       THtmlTokV PrevATokV; ATokN=TokN;
01171       forever{
01172         ATokN--;
01173         if (ATokN<0){break;}
01174         ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
01175         if (THtmlTok::IsBreakTok(ATok)){break;}
01176         if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){PrevATokV.Add(ATok);}
01177         if (ATokV.Len()>=HldWnLen){break;}
01178       }
01179       // after <A> tags
01180       THtmlTokV NextATokV; ATokN=ETagATokN;
01181       forever{
01182         ATokN++;
01183         if (ATokN>=RefHtmlDoc->GetToks()){break;}
01184         ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
01185         if (THtmlTok::IsBreakTok(ATok)){break;}
01186         if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){NextATokV.Add(ATok);}
01187         if (ATokV.Len()>=HldWnLen){break;}
01188       }
01189       // construct html-document with hyper-link context
01190       PHtmlDoc HtmlDoc=PHtmlDoc(new THtmlDoc());
01191       HtmlDoc->AddTokV(TitleTokV);
01192       for (int HTagN=1; HTagN<=6; HTagN++){HtmlDoc->AddTokV(HTokV[HTagN-1]);}
01193       HtmlDoc->AddTokV(PrevATokV);
01194       HtmlDoc->AddTokV(ATokV);
01195       HtmlDoc->AddTokV(NextATokV);
01196       HldV.Add(HtmlDoc);
01197       HtmlDoc->SaveTxt(TSOut::StdOut);
01198     } else
01199     if (TokSym==hsyBTag){
01200       int HTagN;
01201       if (TokStr==THtmlTok::TitleTagNm){
01202         IsTitleAct=true; TitleTokV.Clr(); TitleTokV.Add(Tok);
01203       } else
01204       if (THtmlTok::IsHTag(TokStr, HTagN)){
01205         if (IsHAct){// conclude previous <H?> tag if left open
01206           HTokV[ActHTagN-1].Add(THtmlTok::GetHTok(false, ActHTagN));}
01207         IsHAct=true; ActHTagN=HTagN;
01208         {for (int HTagN=ActHTagN; HTagN<=6; HTagN++){HTokV[HTagN-1].Clr();}}
01209         HTokV[ActHTagN-1].Add(Tok);
01210       }
01211     } else
01212     if (TokSym==hsyETag){
01213       int HTagN;
01214       if (TokStr==THtmlTok::TitleTagNm){
01215         if (IsTitleAct){TitleTokV.Add(Tok); IsTitleAct=false;}
01216       } else
01217       if (THtmlTok::IsHTag(TokStr, HTagN)){
01218         if (IsHAct){HTokV[ActHTagN-1].Add(Tok); IsHAct=false;}
01219       }
01220     } else
01221     if (TokSym!=hsySSym){
01222       if (IsTitleAct){TitleTokV.Add(Tok);}
01223       if (IsHAct){HTokV[ActHTagN-1].Add(Tok);}
01224     }
01225   }
01226 }
01227 
01229 // Web-Page
01230 void TWebPg::GetOutUrlV(TUrlV& OutUrlV, TUrlV& OutRedirUrlV) const {
01231   // create outgoing url vector
01232   OutUrlV.Clr(); OutRedirUrlV.Clr();
01233   // take interesting web-page components
01234   TStr UrlStr=GetUrlStr();
01235   TStr HtmlStr=GetHttpBodyAsStr();
01236   // prepare html parsing
01237   PSIn HtmlSIn=TStrIn::New(HtmlStr);
01238   PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
01239   PHtmlTok Tok;
01240   // traverse html
01241   for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){
01242     PHtmlTok Tok=HtmlDoc->GetTok(TokN);
01243     if (Tok->GetSym()==hsyBTag){
01244       TStr RelUrlStr;
01245       if (Tok->IsUrlTok(RelUrlStr)){
01246         PUrl Url=TUrl::New(RelUrlStr, UrlStr);
01247         if (Url->IsOk(usHttp)){
01248           OutUrlV.Add(Url);
01249           if (Tok->IsRedirUrlTok()){
01250             OutRedirUrlV.Add(Url);
01251           }
01252         }
01253       }
01254     }
01255   }
01256 }
01257 
01258 void TWebPg::GetOutDescUrlStrKdV(TStrKdV& OutDescUrlStrKdV) const {
01259   // create outgoing url vector
01260   OutDescUrlStrKdV.Clr();
01261   // take interesting web-page components
01262   TStr UrlStr=GetUrlStr();
01263   TStr HtmlStr=GetHttpBodyAsStr();
01264   // prepare html parsing
01265   PSIn HtmlSIn=TStrIn::New(HtmlStr);
01266   PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
01267   // traverse html documents
01268   PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
01269   int TokN=0; int Toks=HtmlDoc->GetToks();
01270   while (TokN<Toks){
01271     Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
01272     if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
01273       TStr RelUrlStr;
01274       if (Tok->IsUrlTok(RelUrlStr)){
01275         PUrl Url=TUrl::New(RelUrlStr, UrlStr);
01276         if (Url->IsOk()){
01277           TChA DescChA;
01278           while (TokN<Toks){
01279             Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
01280             if ((TokSym==hsyETag)&&(TokStr==THtmlTok::ATagNm)){
01281               break;
01282             } else {
01283               if ((TokSym==hsyStr)||(TokSym==hsyNum)||(TokSym==hsySSym)){
01284                 if (!DescChA.Empty()){DescChA+=' ';}
01285                 DescChA+=TokStr;
01286               }
01287             }
01288           }
01289           OutDescUrlStrKdV.Add(TStrKd(DescChA, Url->GetUrlStr()));
01290         }
01291       }
01292     }
01293   }
01294 }
01295 
01296 void TWebPg::SaveAsHttpBody(const TStr& FNm) const {
01297   // create output file
01298   PSOut SOut=TFOut::New(FNm);
01299   // save http-body
01300   HttpResp->SaveBody(SOut);
01301 }
01302 
01303 void TWebPg::SaveAsHttp(const TStr& FNm) const {
01304   // create output file
01305   PSOut SOut=TFOut::New(FNm);
01306   // save http
01307   HttpResp->SaveTxt(SOut);
01308 }
01309 
01310 bool TWebPg::IsTxt() const {
01311   if ((!HttpResp->IsContType())||HttpResp->IsContType(THttp::TextFldVal)){
01312     TStr Str=HttpResp->GetBodyAsStr();
01313     int StrLen=Str.Len(); int ChN=0; int PrintChs=0;
01314     while ((ChN<100)&&(ChN<StrLen)){
01315       char Ch=Str[ChN++];
01316       if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){
01317         PrintChs++;}
01318     }
01319     double PrintPrb=double(PrintChs)/double(ChN+1);
01320     return PrintPrb>0.9;
01321   } else {
01322     return false;
01323   }
01324 }
01325