SNAP Library 2.2, Developer Reference
2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
#include <html.h>
Public Member Functions | |
THtmlDoc () | |
THtmlDoc (const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) | |
THtmlDoc (TSIn &) | |
void | Save (TSOut &) |
THtmlDoc & | operator= (const THtmlDoc &) |
int | GetToks () const |
PHtmlTok | GetTok (const int &TokN) const |
PHtmlTok | GetTok (const int &TokN, THtmlLxSym &Sym, TStr &Str) const |
void | AddTokV (const THtmlTokV &_TokV) |
void | SaveTxt (const PSOut &SOut, const bool &TxtMode=true) const |
Static Public Member Functions | |
static PHtmlDoc | New (const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) |
static PHtmlDoc | Load (TSIn &) |
static TStr | GetTxtLnDoc (const TStr &HtmlStr) |
static TStr | GetTxtLnDoc (const TStr &HtmlStr, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutTagsP) |
static PHtmlDoc | LoadTxt (const TStr &FNm, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) |
static void | SaveHtmlToTxt (const TStr &HtmlStr, const PSOut &TxtSOut, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP) |
static void | SaveHtmlToTxt (const TStr &HtmlStr, const TStr &TxtFNm, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP) |
static void | SaveHtmlToXml (const TStr &HtmlStr, const PSOut &XmlSOut, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP) |
static void | SaveHtmlToXml (const TStr &HtmlStr, const TStr &XmlFNm, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP) |
static TLxSym | GetLxSym (const THtmlLxSym &HtmlLxSym, const TChA &ChA) |
static bool | _IsTagRedir (const TStr &TagStr, const TStr &ArgNm, THtmlLx &Lx, const TStr &BaseUrlStr, const TStr &RedirUrlStr) |
static TStr | GetRedirHtmlDocStr (const TStr &HtmlStr, const TStr &BaseUrlStr, const TStr &RedirUrlStr) |
Private Attributes | |
TCRef | CRef |
THtmlTokV | TokV |
Friends | |
class | TPt< THtmlDoc > |
THtmlDoc::THtmlDoc | ( | ) | [inline] |
THtmlDoc::THtmlDoc | ( | const PSIn & | SIn, |
const THtmlDocType & | Type = hdtAll , |
||
const bool & | DoUc = true |
||
) |
Definition at line 779 of file html.cpp.
References TVec< TVal, TSizeTy >::Add(), Fail, THtmlLx::GetSym(), THtmlLx::GetTok(), hdtA, hdtAll, hdtHRef, hdtStr, hdtStrNum, hdtTag, hdtUL, hsyBTag, hsyEof, hsyETag, hsyNum, hsyStr, THtmlLx::Sym, TokV, and THtmlLx::UcChA.
: TokV(1000, 0){
  // run the lexer over the whole input; Type decides which symbols are kept
  THtmlLx Lx(SIn);
  bool KeepTok=false;   // lives outside the loop, so an unhandled Type leaves it unchanged
  bool InsideUl=false;  // hdtUL only: set on a begin-UL tag, cleared after an end-UL tag
  while (Lx.GetSym()!=hsyEof){
    const bool IsBTag=(Lx.Sym==hsyBTag);
    const bool IsETag=(Lx.Sym==hsyETag);
    switch (Type){
      case hdtAll:
        KeepTok=true;
        break;
      case hdtStr:
        KeepTok=(Lx.Sym==hsyStr);
        break;
      case hdtStrNum:
        KeepTok=(Lx.Sym==hsyStr)||(Lx.Sym==hsyNum);
        break;
      case hdtTag:
        KeepTok=IsBTag||IsETag;
        break;
      case hdtA:
        KeepTok=IsBTag&&(Lx.UcChA==THtmlTok::ATagNm);
        break;
      case hdtHRef:
        // begin-tags of the five tag names listed here
        KeepTok=IsBTag&&(
         (Lx.UcChA==THtmlTok::ATagNm)||(Lx.UcChA==THtmlTok::AreaTagNm)||
         (Lx.UcChA==THtmlTok::FrameTagNm)||(Lx.UcChA==THtmlTok::ImgTagNm)||
         (Lx.UcChA==THtmlTok::MetaTagNm));
        break;
      case hdtUL:
        // the flag is cleared only after KeepTok is taken, so the closing
        // UL tag itself is still kept
        if (IsBTag&&(Lx.UcChA==THtmlTok::UlTagNm)){InsideUl=true;}
        KeepTok=InsideUl;
        if (IsETag&&(Lx.UcChA==THtmlTok::UlTagNm)){InsideUl=false;}
        break;
      default:
        Fail;
    }
    if (KeepTok){TokV.Add(Lx.GetTok(DoUc));}
  }
  // the token vector always ends with an explicit end-of-file token
  TokV.Add(PHtmlTok(new THtmlTok(hsyEof)));
}
THtmlDoc::THtmlDoc | ( | TSIn & | ) | [inline] |
bool THtmlDoc::_IsTagRedir | ( | const TStr & | TagStr, |
const TStr & | ArgNm, | ||
THtmlLx & | Lx, | ||
const TStr & | BaseUrlStr, | ||
const TStr & | RedirUrlStr | ||
) | [static] |
Definition at line 1106 of file html.cpp.
References THtmlLx::ChA, THtmlLx::GetArg(), TUrlEnv::GetFullUrlStr(), TUrl::GetUrlStr(), hsyBTag, IAssert, THtmlLx::IsArg(), TUrl::IsOk(), New(), THtmlLx::PutArg(), THtmlLx::Sym, and usHttp.
Referenced by GetRedirHtmlDocStr().
{
  IAssert(Lx.Sym==hsyBTag);
  // nothing to do unless the current tag is TagStr and carries argument ArgNm
  const bool IsCandidate=(Lx.ChA==TagStr)&&(Lx.IsArg(ArgNm));
  if (!IsCandidate){return false;}
  // build the url from the argument value and BaseUrlStr; reject non-http results
  TStr RelUrlStr=Lx.GetArg(ArgNm);
  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
  if (!Url->IsOk(usHttp)){return false;}
  // pass the url as the "url" field of RedirUrlStr and write the result back into the tag
  TStr UrlStr=Url->GetUrlStr();
  PUrlEnv RedirUrlEnv=TUrlEnv::New(RedirUrlStr, "url", UrlStr);
  Lx.PutArg(ArgNm, RedirUrlEnv->GetFullUrlStr());
  return true;
}
void THtmlDoc::AddTokV | ( | const THtmlTokV & | _TokV | ) | [inline] |
Definition at line 274 of file html.h.
Referenced by THtmlHldV::THtmlHldV().
TLxSym THtmlDoc::GetLxSym | ( | const THtmlLxSym & | HtmlLxSym, |
const TChA & | ChA | ||
) | [static] |
Definition at line 1092 of file html.cpp.
References Fail, TLxSymStr::GetSSym(), hsyBTag, hsyEof, hsyETag, hsyNum, hsySSym, hsyStr, hsyUndef, hsyUrl, syEof, syFlt, syStr, and syUndef.
{
  // map an html lexical symbol onto the general lexical symbol set
  switch (HtmlLxSym){
    case hsyUndef:
      return syUndef;
    case hsyNum:
      return syFlt;
    case hsySSym:
      // the only case that looks at the symbol text
      return TLxSymStr::GetSSym(ChA);
    case hsyEof:
      return syEof;
    // strings, urls and both tag kinds are all reported as strings
    case hsyStr:
    case hsyUrl:
    case hsyBTag:
    case hsyETag:
      return syStr;
    default:
      Fail; return syUndef;
  }
}
TStr THtmlDoc::GetRedirHtmlDocStr | ( | const TStr & | HtmlStr, |
const TStr & | BaseUrlStr, | ||
const TStr & | RedirUrlStr | ||
) | [static] |
Definition at line 1126 of file html.cpp.
References _IsTagRedir(), TMOut::GetAsStr(), THtmlLx::GetFullBTagStr(), THtmlLx::GetSym(), hsyBTag, hsyEof, New(), THtmlLx::PreSpaceChA, TSOut::PutStr(), THtmlLx::Sym, and THtmlLx::SymChA.
{
  // lexer over the html-string, output collected in memory
  PSIn SIn=TStrIn::New(HtmlStr);
  TMOut SOut;
  THtmlLx Lx(SIn);
  while (Lx.GetSym()!=hsyEof){
    // copy the white-space that preceded the symbol
    SOut.PutStr(Lx.PreSpaceChA);
    // try the four tag/argument pairs in order; || stops at the first
    // one that rewrote the tag, so at most one rewrite happens per tag
    bool Redirected=false;
    if (Lx.Sym==hsyBTag){
      Redirected=
       _IsTagRedir(THtmlTok::ATagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr)||
       _IsTagRedir(THtmlTok::AreaTagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr)||
       _IsTagRedir(THtmlTok::FrameTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr)||
       _IsTagRedir(THtmlTok::ImgTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr);
    }
    if (Redirected){
      // rewritten tag: emit the tag rebuilt from the lexer state
      SOut.PutStr(Lx.GetFullBTagStr());
    } else {
      // everything else: emit the symbol text as the lexer holds it
      SOut.PutStr(Lx.SymChA());
    }
  }
  return SOut.GetAsStr();
}
PHtmlTok THtmlDoc::GetTok | ( | const int & | TokN | ) | const [inline] |
Definition at line 271 of file html.h.
Referenced by TWebPg::GetOutDescUrlStrKdV(), TWebPg::GetOutUrlV(), and THtmlHldV::THtmlHldV().
{
  // hand back the token stored at position TokN of TokV
  return TokV[TokN];
}
PHtmlTok THtmlDoc::GetTok | ( | const int & | TokN, |
THtmlLxSym & | Sym, | ||
TStr & | Str | ||
) | const [inline] |
int THtmlDoc::GetToks | ( | ) | const [inline] |
Definition at line 270 of file html.h.
Referenced by TWebPg::GetOutDescUrlStrKdV(), TWebPg::GetOutUrlV(), and THtmlHldV::THtmlHldV().
TStr THtmlDoc::GetTxtLnDoc | ( | const TStr & | HtmlStr | ) | [static] |
Definition at line 808 of file html.cpp.
References THtmlLx::ChA, TStr::CStr(), TChA::Empty(), THtmlLx::GetSym(), hsyBTag, hsyEof, hsyETag, hsyNum, hsySSym, hsyStr, TChA::LastCh(), New(), THtmlLx::PreSpaces, and THtmlLx::Sym.
Referenced by SaveHtmlToTxt().
{
  // text collected so far
  TChA TxtChA;
  // lexer over the html-string
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  THtmlLx HtmlLx(HtmlSIn);
  bool InScript=false;
  while (HtmlLx.GetSym()!=hsyEof){
    TStr SymStr=HtmlLx.ChA;
    switch (HtmlLx.Sym){
      case hsyStr:
      case hsyNum:
      case hsySSym:
        // text symbols are copied unless they sit inside a script element
        if (!InScript){
          if (HtmlLx.PreSpaces>0){TxtChA+=' ';}
          TxtChA+=SymStr.CStr();
        }
        break;
      case hsyBTag:
      case hsyETag:
        // any tag acts as a word separator (never a leading or doubled blank)
        if ((!TxtChA.Empty())&&(TxtChA.LastCh()!=' ')){TxtChA+=' ';}
        // a begin SCRIPT tag switches script mode on, an end SCRIPT tag switches it off
        if (SymStr=="<SCRIPT>"){InScript=(HtmlLx.Sym==hsyBTag);}
        break;
      default:
        break;
    }
  }
  // return result
  return TxtChA;
}
TStr THtmlDoc::GetTxtLnDoc | ( | const TStr & | HtmlStr, |
const TStr & | BaseUrlStr, | ||
const bool & | OutUrlP, | ||
const bool & | OutTagsP | ||
) | [static] |
Definition at line 840 of file html.cpp.
References THtmlLx::ChA, Fail, TStr::GetSubStr(), THtmlLx::GetSym(), THtmlLx::GetTok(), TUrl::GetUrlStr(), TXmlLx::GetXmlStrFromPlainStr(), hsyBTag, hsyEof, hsyETag, hsyMTag, hsyNum, hsySSym, hsyStr, hsyUndef, hsyUrl, TUrl::IsOk(), TChA::LastCh(), TStr::Len(), New(), THtmlLx::PreSpaces, and THtmlLx::Sym.
{
  // output buffer, seeded with a single blank so the LastCh() checks below
  // never see an empty buffer
  TChA OutChA; OutChA+=' ';
  // lexer over the html-string
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  THtmlLx HtmlLx(HtmlSIn);
  bool InScript=false;
  while (HtmlLx.GetSym()!=hsyEof){
    TStr SymStr=HtmlLx.ChA;
    switch (HtmlLx.Sym){
      case hsyUndef:
      case hsyUrl:
      case hsyMTag:
      case hsyEof:
        // nothing is written for these symbols
        break;
      case hsyStr:
      case hsyNum:
      case hsySSym:
        // text symbols are copied unless they sit inside a script element
        if (!InScript){
          const bool NeedSpace=(HtmlLx.PreSpaces>0)&&(OutChA.LastCh()!=' ');
          if (NeedSpace){OutChA+=' ';}
          OutChA+=SymStr;
        }
        break;
      case hsyBTag:{
        // tag name = symbol text without its first and last character
        TStr TagNm=SymStr.GetSubStr(1, SymStr.Len()-2);
        // begin-tags inside a script element are ignored
        if (InScript){break;}
        // a SCRIPT begin-tag only switches script mode on
        if (TagNm=="SCRIPT"){InScript=true; break;}
        // either echo the tag or make sure a separating blank is present
        if (OutTagsP){
          OutChA+='<'; OutChA+=TagNm; OutChA+='>';
        } else if (OutChA.LastCh()!=' '){
          OutChA+=' ';
        }
        // check if URL present; the token/url calls run even when OutUrlP is false
        PHtmlTok Tok=HtmlLx.GetTok();
        TStr RelUrlStr;
        if (Tok->IsUrlTok(RelUrlStr)){
          PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
          if (Url->IsOk()&&OutUrlP){
            TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
            OutChA+="<Url>"; OutChA+=XmlUrlStr; OutChA+="</Url>";
          }
        }
        break;}
      case hsyETag:{
        // tag name = symbol text without its first and last character
        TStr TagNm=SymStr.GetSubStr(1, SymStr.Len()-2);
        if (InScript){
          // only the SCRIPT end-tag leaves script mode; other end-tags are ignored
          if (TagNm=="SCRIPT"){InScript=false;}
          break;
        }
        // either echo the end-tag or make sure a separating blank is present
        if (OutTagsP){
          OutChA+="</"; OutChA+=TagNm; OutChA+='>';
        } else if (OutChA.LastCh()!=' '){
          OutChA+=' ';
        }
        break;}
      default:
        Fail;
    }
  }
  // return string
  return OutChA;
}
static PHtmlDoc THtmlDoc::Load | ( | TSIn & | ) | [inline, static] |
static PHtmlDoc THtmlDoc::LoadTxt | ( | const TStr & | FNm, |
const THtmlDocType & | Type = hdtAll , |
||
const bool & | DoUc = true |
||
) | [inline, static] |
static PHtmlDoc THtmlDoc::New | ( | const PSIn & | SIn, |
const THtmlDocType & | Type = hdtAll , |
||
const bool & | DoUc = true |
||
) | [inline, static] |
Definition at line 261 of file html.h.
Referenced by _IsTagRedir(), GetRedirHtmlDocStr(), GetTxtLnDoc(), SaveHtmlToTxt(), and SaveHtmlToXml().
void THtmlDoc::Save | ( | TSOut & | ) | [inline] |
void THtmlDoc::SaveHtmlToTxt | ( | const TStr & | HtmlStr, |
const PSOut & | TxtSOut, | ||
const TStr & | BaseUrlStr, | ||
const bool & | OutUrlP, | ||
const bool & | OutToksP | ||
) | [static] |
Definition at line 928 of file html.cpp.
References GetTxtLnDoc(), and TStr::SaveTxt().
Referenced by SaveHtmlToTxt().
{
  // NOTE(review): the signature shown for this function names its last
  // parameter OutToksP, while this body reads OutTagsP - presumably the
  // definition in html.cpp names the parameter OutTagsP; confirm.
  // convert the html-string to its text form and write it to the output stream
  GetTxtLnDoc(HtmlStr, BaseUrlStr, OutUrlP, OutTagsP).SaveTxt(TxtSOut);
}
void THtmlDoc::SaveHtmlToTxt | ( | const TStr & | HtmlStr, |
const TStr & | TxtFNm, | ||
const TStr & | BaseUrlStr, | ||
const bool & | OutUrlP, | ||
const bool & | OutToksP | ||
) | [static] |
Definition at line 937 of file html.cpp.
References New(), and SaveHtmlToTxt().
{
  // NOTE(review): the signature shown for this function names its last
  // parameter OutToksP, while this body reads OutTagsP - presumably the
  // definition in html.cpp names the parameter OutTagsP; confirm.
  // open TxtFNm for output and delegate to the stream-based overload
  const PSOut TxtSOut=TFOut::New(TxtFNm);
  SaveHtmlToTxt(HtmlStr, TxtSOut, BaseUrlStr, OutUrlP, OutTagsP);
}
void THtmlDoc::SaveHtmlToXml | ( | const TStr & | HtmlStr, |
const PSOut & | XmlSOut, | ||
const TStr & | BaseUrlStr, | ||
const bool & | OutTextP, | ||
const bool & | OutUrlP, | ||
const bool & | OutToksP, | ||
const bool & | OutTagsP, | ||
const bool & | OutArgsP | ||
) | [static] |
Definition at line 946 of file html.cpp.
References TVec< TVal, TSizeTy >::Add(), THtmlLx::ChA, TChA::Clr(), TChA::CStr(), TStr::CStr(), TChA::Empty(), TStr::Empty(), Fail, THtmlLx::GetArgNm(), THtmlLx::GetArgs(), THtmlLx::GetArgVal(), TSOut::GetFileId(), TStr::GetSubStr(), THtmlLx::GetSym(), THtmlLx::GetTok(), TUrl::GetUrlStr(), TXmlLx::GetXmlStrFromPlainStr(), hsyBTag, hsyEof, hsyETag, hsyMTag, hsyNum, hsySSym, hsyStr, hsyUndef, hsyUrl, TUrl::IsOk(), TStr::Len(), TVec< TVal, TSizeTy >::Len(), New(), and THtmlLx::Sym.
Referenced by SaveHtmlToXml().
{
  // Writes an XML rendering of HtmlStr to the file behind XmlSOut:
  //   <HtmlDoc> [<BaseUrl>] <Body> ...tokens/text/tags/urls... </Body>
  //   <OutUrls> ...numbered urls... </OutUrls> </HtmlDoc>
  // OutToksP, OutTextP, OutTagsP, OutArgsP and OutUrlP each switch one kind
  // of <Body> entry on or off; <OutUrls> is always written.
  // prepare output-file-id
  // NOTE(review): fXml is handed straight to fprintf, so XmlSOut presumably
  // has to be backed by a real file - confirm against TSOut::GetFileId().
  TFileId fXml=XmlSOut->GetFileId();
  // create outgoing url
  TStrV OutUrlStrV;
  // open top tag
  fprintf(fXml, "<HtmlDoc>\n");
  // save url
  if (!BaseUrlStr.Empty()){
    TStr XmlBaseUrlStr=TXmlLx::GetXmlStrFromPlainStr(BaseUrlStr);
    fprintf(fXml, "<BaseUrl>%s</BaseUrl>\n", XmlBaseUrlStr.CStr());
  }
  // prepare html parsing
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  THtmlLx HtmlLx(HtmlSIn);
  TChA ContTextChA;    // blank-separated run of text symbols seen since the last tag
  bool InScript=false; // true between a SCRIPT begin-tag and the SCRIPT end-tag
  // save text
  fprintf(fXml, "<Body>\n");
  while (HtmlLx.GetSym()!=hsyEof){
    TStr Str=HtmlLx.ChA;
    switch (HtmlLx.Sym){
      case hsyUndef:
      case hsyUrl:
      case hsyMTag:
        break;
      case hsyStr:
        if (InScript){break;}
        Str=TXmlLx::GetXmlStrFromPlainStr(Str);
        if (OutToksP){
          fprintf(fXml, " <Str>%s</Str>\n", Str.CStr());}
        if (!ContTextChA.Empty()){ContTextChA+=' ';}
        ContTextChA+=Str;
        break;
      case hsyNum:
        if (InScript){break;}
        Str=TXmlLx::GetXmlStrFromPlainStr(Str);
        if (OutToksP){
          fprintf(fXml, " <Num>%s</Num>\n", Str.CStr());}
        if (!ContTextChA.Empty()){ContTextChA+=' ';}
        ContTextChA+=Str;
        break;
      case hsySSym:
        if (InScript){break;}
        Str=TXmlLx::GetXmlStrFromPlainStr(Str);
        if (OutToksP){
          fprintf(fXml, " <Sym>%s</Sym>\n", Str.CStr());}
        if (!ContTextChA.Empty()){ContTextChA+=' ';}
        ContTextChA+=Str;
        break;
      case hsyBTag:{
        // save continuous text
        if (!ContTextChA.Empty()){
          if (OutTextP){
            fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
          ContTextChA.Clr();
        }
        // extract tag name (symbol text without its first and last character)
        Str=Str.GetSubStr(1, Str.Len()-2);
        Str=TXmlLx::GetXmlStrFromPlainStr(Str);
        // process tag
        if (!InScript){
          // check script tag
          if (Str=="SCRIPT"){
            InScript=true; break;}
          // output tag
          if (OutTagsP){
            if (OutArgsP){
              fprintf(fXml, " <BTag Nm=\"%s\">\n", Str.CStr());
              for (int ArgN=0; ArgN<HtmlLx.GetArgs(); ArgN++){
                TStr ArgNm=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgNm(ArgN));
                TStr ArgVal=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgVal(ArgN));
                fprintf(fXml, " <Arg Nm=\"%s\" Val=\"%s\"/>",
                 ArgNm.CStr(), ArgVal.CStr());
              }
              fprintf(fXml, " </BTag>\n");
            } else {
              // format string kept on one line: a string literal must not
              // contain a raw line break
              fprintf(fXml, " <BTag Nm=\"%s\"/>\n", Str.CStr());
            }
          }
          // check if URL present; valid urls are collected for <OutUrls>
          // whether or not OutUrlP asks for them inline
          PHtmlTok Tok=HtmlLx.GetTok();
          TStr RelUrlStr;
          if (Tok->IsUrlTok(RelUrlStr)){
            PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
            if (Url->IsOk()){
              OutUrlStrV.Add(Url->GetUrlStr());
              if (OutUrlP){
                TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
                fprintf(fXml, " <Url>%s</Url>\n", XmlUrlStr.CStr());
              }
            }
          }
        }
        break;}
      case hsyETag:{
        // save continuous text
        if (!ContTextChA.Empty()){
          if (OutTextP){
            fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
          ContTextChA.Clr();
        }
        // extract tag name (symbol text without its first and last character)
        Str=Str.GetSubStr(1, Str.Len()-2);
        Str=TXmlLx::GetXmlStrFromPlainStr(Str);
        // process tag
        if (InScript){
          // only the SCRIPT end-tag leaves script mode
          if (Str=="SCRIPT"){
            InScript=false; break;}
        } else {
          if (OutTagsP){
            fprintf(fXml, " <ETag Nm=\"%s\"/>\n", Str.CStr());}
        }
        break;}
      case hsyEof: break;
      default: Fail;
    }
  }
  // save continuous text left over after the last tag
  if (!ContTextChA.Empty()){
    if (OutTextP){
      fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
    ContTextChA.Clr();
  }
  fprintf(fXml, "</Body>\n");
  // save outgoing urls (numbered from 1)
  fprintf(fXml, "<OutUrls>\n");
  for (int UrlN=0; UrlN<OutUrlStrV.Len(); UrlN++){
    TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(OutUrlStrV[UrlN]);
    fprintf(fXml, " <Url N=\"%d\">%s</Url>\n", 1+UrlN, XmlUrlStr.CStr());
  }
  fprintf(fXml, "</OutUrls>\n");
  // close top tag
  fprintf(fXml, "</HtmlDoc>\n");
}
void THtmlDoc::SaveHtmlToXml | ( | const TStr & | HtmlStr, |
const TStr & | XmlFNm, | ||
const TStr & | BaseUrlStr, | ||
const bool & | OutTextP, | ||
const bool & | OutUrlP, | ||
const bool & | OutToksP, | ||
const bool & | OutTagsP, | ||
const bool & | OutArgsP | ||
) | [static] |
Definition at line 1081 of file html.cpp.
References New(), and SaveHtmlToXml().
{
  // open XmlFNm for output and delegate to the stream-based overload
  const PSOut XmlSOut=TFOut::New(XmlFNm);
  SaveHtmlToXml(
   HtmlStr, XmlSOut, BaseUrlStr,
   OutTextP, OutUrlP, OutToksP, OutTagsP, OutArgsP);
}
void THtmlDoc::SaveTxt | ( | const PSOut & | SOut, |
const bool & | TxtMode = true |
||
) | const |
Definition at line 915 of file html.cpp.
References TInt::GetStr(), TVec< TVal, TSizeTy >::Len(), TSOut::PutLn(), TSOut::PutStr(), and TokV.
Referenced by THtmlHldV::THtmlHldV().
{
  if (TxtMode){
    // text mode: all tokens one after another, then a single line break
    for (int Idx=0; Idx<TokV.Len(); Idx++){
      TokV[Idx]->SaveTxt(SOut);
    }
    SOut->PutLn();
    return;
  }
  // listing mode: one line per token, prefixed with its index and ": "
  for (int Idx=0; Idx<TokV.Len(); Idx++){
    SOut->PutStr(TInt::GetStr(Idx));
    SOut->PutStr(": ");
    TokV[Idx]->SaveTxt(SOut);
    SOut->PutLn();
  }
}
TCRef THtmlDoc::CRef [private] |
THtmlTokV THtmlDoc::TokV [private] |
Definition at line 256 of file html.h.
Referenced by SaveTxt(), and THtmlDoc().