|
SNAP Library 2.2, Developer Reference
2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
#include <html.h>

Public Member Functions | |
| THtmlDoc () | |
| THtmlDoc (const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) | |
| THtmlDoc (TSIn &) | |
| void | Save (TSOut &) |
| THtmlDoc & | operator= (const THtmlDoc &) |
| int | GetToks () const |
| PHtmlTok | GetTok (const int &TokN) const |
| PHtmlTok | GetTok (const int &TokN, THtmlLxSym &Sym, TStr &Str) const |
| void | AddTokV (const THtmlTokV &_TokV) |
| void | SaveTxt (const PSOut &SOut, const bool &TxtMode=true) const |
Static Public Member Functions | |
| static PHtmlDoc | New (const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) |
| static PHtmlDoc | Load (TSIn &) |
| static TStr | GetTxtLnDoc (const TStr &HtmlStr) |
| static TStr | GetTxtLnDoc (const TStr &HtmlStr, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutTagsP) |
| static PHtmlDoc | LoadTxt (const TStr &FNm, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) |
| static void | SaveHtmlToTxt (const TStr &HtmlStr, const PSOut &TxtSOut, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP) |
| static void | SaveHtmlToTxt (const TStr &HtmlStr, const TStr &TxtFNm, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP) |
| static void | SaveHtmlToXml (const TStr &HtmlStr, const PSOut &XmlSOut, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP) |
| static void | SaveHtmlToXml (const TStr &HtmlStr, const TStr &XmlFNm, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP) |
| static TLxSym | GetLxSym (const THtmlLxSym &HtmlLxSym, const TChA &ChA) |
| static bool | _IsTagRedir (const TStr &TagStr, const TStr &ArgNm, THtmlLx &Lx, const TStr &BaseUrlStr, const TStr &RedirUrlStr) |
| static TStr | GetRedirHtmlDocStr (const TStr &HtmlStr, const TStr &BaseUrlStr, const TStr &RedirUrlStr) |
Private Attributes | |
| TCRef | CRef |
| THtmlTokV | TokV |
Friends | |
| class | TPt< THtmlDoc > |
| THtmlDoc::THtmlDoc | ( | ) | [inline] |
| THtmlDoc::THtmlDoc | ( | const PSIn & | SIn, |
| const THtmlDocType & | Type = hdtAll, |
||
| const bool & | DoUc = true |
||
| ) |
Definition at line 779 of file html.cpp.
References TVec< TVal, TSizeTy >::Add(), Fail, THtmlLx::GetSym(), THtmlLx::GetTok(), hdtA, hdtAll, hdtHRef, hdtStr, hdtStrNum, hdtTag, hdtUL, hsyBTag, hsyEof, hsyETag, hsyNum, hsyStr, THtmlLx::Sym, TokV, and THtmlLx::UcChA.
: TokV(1000, 0){
  // Runs the HTML lexer over SIn and stores only the tokens selected by Type;
  // a synthetic end-of-file token is always appended at the very end.
  // NOTE(review): TokV(1000, 0) presumably pre-reserves room for 1000 tokens
  // with an initial length of zero - confirm against the TVec constructor.
  THtmlLx Lx(SIn);
  bool KeepTokP=false;   // whether the current symbol is stored
  bool InsideUlP=false;  // only used by hdtUL: currently between <UL> and </UL>
  while (Lx.GetSym()!=hsyEof){
    const bool BTagP=(Lx.Sym==hsyBTag);
    const bool ETagP=(Lx.Sym==hsyETag);
    if (Type==hdtAll){
      // keep every symbol
      KeepTokP=true;
    } else if (Type==hdtStr){
      // keep strings only
      KeepTokP=(Lx.Sym==hsyStr);
    } else if (Type==hdtStrNum){
      // keep strings and numbers
      KeepTokP=(Lx.Sym==hsyStr)||(Lx.Sym==hsyNum);
    } else if (Type==hdtTag){
      // keep begin- and end-tags
      KeepTokP=BTagP||ETagP;
    } else if (Type==hdtA){
      // keep begin-tags of anchors
      KeepTokP=BTagP&&(Lx.UcChA==THtmlTok::ATagNm);
    } else if (Type==hdtHRef){
      // keep begin-tags of the listed link-carrying elements
      KeepTokP=BTagP&&(
       (Lx.UcChA==THtmlTok::ATagNm)||
       (Lx.UcChA==THtmlTok::AreaTagNm)||
       (Lx.UcChA==THtmlTok::FrameTagNm)||
       (Lx.UcChA==THtmlTok::ImgTagNm)||
       (Lx.UcChA==THtmlTok::MetaTagNm));
    } else if (Type==hdtUL){
      // the opening tag switches the state on before the decision and the
      // closing tag switches it off after, so both tags themselves are kept
      if (BTagP&&(Lx.UcChA==THtmlTok::UlTagNm)){InsideUlP=true;}
      KeepTokP=InsideUlP;
      if (ETagP&&(Lx.UcChA==THtmlTok::UlTagNm)){InsideUlP=false;}
    } else {
      Fail;
    }
    // DoUc is forwarded unchanged to the lexer when the token is built
    if (KeepTokP){TokV.Add(Lx.GetTok(DoUc));}
  }
  TokV.Add(PHtmlTok(new THtmlTok(hsyEof)));
}

| THtmlDoc::THtmlDoc | ( | TSIn & | ) | [inline] |
| bool THtmlDoc::_IsTagRedir | ( | const TStr & | TagStr, |
| const TStr & | ArgNm, | ||
| THtmlLx & | Lx, | ||
| const TStr & | BaseUrlStr, | ||
| const TStr & | RedirUrlStr | ||
| ) | [static] |
Definition at line 1106 of file html.cpp.
References THtmlLx::ChA, THtmlLx::GetArg(), TUrlEnv::GetFullUrlStr(), TUrl::GetUrlStr(), hsyBTag, IAssert, THtmlLx::IsArg(), TUrl::IsOk(), New(), THtmlLx::PutArg(), THtmlLx::Sym, and usHttp.
Referenced by GetRedirHtmlDocStr().
{
  // Rewrites argument ArgNm of the begin-tag currently held by Lx so that it
  // points to a redirection URL built from RedirUrlStr. Returns true only when
  // the tag matches TagStr, carries ArgNm, and the resolved URL passes
  // IsOk(usHttp); in every other case Lx is left untouched and false is returned.
  IAssert(Lx.Sym==hsyBTag);
  // the tag must be the requested one ...
  if (!(Lx.ChA==TagStr)){return false;}
  // ... and must carry the requested argument
  if (!Lx.IsArg(ArgNm)){return false;}
  // build the target url from the argument value and the base url
  TStr ArgUrlStr=Lx.GetArg(ArgNm);
  PUrl TargetUrl=TUrl::New(ArgUrlStr, BaseUrlStr);
  if (!TargetUrl->IsOk(usHttp)){return false;}
  // pass the target url as the "url" field of the redirection url
  TStr TargetUrlStr=TargetUrl->GetUrlStr();
  PUrlEnv UrlEnv=TUrlEnv::New(RedirUrlStr, "url", TargetUrlStr);
  Lx.PutArg(ArgNm, UrlEnv->GetFullUrlStr());
  return true;
}


| void THtmlDoc::AddTokV | ( | const THtmlTokV & | _TokV | ) | [inline] |
Definition at line 274 of file html.h.
Referenced by THtmlHldV::THtmlHldV().

| TLxSym THtmlDoc::GetLxSym | ( | const THtmlLxSym & | HtmlLxSym, |
| const TChA & | ChA | ||
| ) | [static] |
Definition at line 1092 of file html.cpp.
References Fail, TLxSymStr::GetSSym(), hsyBTag, hsyEof, hsyETag, hsyNum, hsySSym, hsyStr, hsyUndef, hsyUrl, syEof, syFlt, syStr, and syUndef.
{
  // Maps an HTML lexer symbol to a general lexer symbol. Strings, urls and
  // both kinds of tags all map to syStr, numbers map to syFlt, and special
  // symbols are looked up from ChA. Any symbol not listed triggers Fail.
  TLxSym LxSym=syUndef;
  switch (HtmlLxSym){
    case hsyUndef:
      break;
    case hsyStr:
    case hsyUrl:
    case hsyBTag:
    case hsyETag:
      LxSym=syStr; break;
    case hsyNum:
      LxSym=syFlt; break;
    case hsySSym:
      LxSym=TLxSymStr::GetSSym(ChA); break;
    case hsyEof:
      LxSym=syEof; break;
    default:
      Fail; break;
  }
  return LxSym;
}

| TStr THtmlDoc::GetRedirHtmlDocStr | ( | const TStr & | HtmlStr, |
| const TStr & | BaseUrlStr, | ||
| const TStr & | RedirUrlStr | ||
| ) | [static] |
Definition at line 1126 of file html.cpp.
References _IsTagRedir(), TMOut::GetAsStr(), THtmlLx::GetFullBTagStr(), THtmlLx::GetSym(), hsyBTag, hsyEof, New(), THtmlLx::PreSpaceChA, TSOut::PutStr(), THtmlLx::Sym, and THtmlLx::SymChA.
{
  // Re-emits HtmlStr symbol by symbol into an in-memory stream. Begin-tags
  // A/AREA (HRef argument) and FRAME/IMG (Src argument) whose link could be
  // rewritten by _IsTagRedir are written in their full rebuilt form; every
  // other symbol is written through Lx.SymChA.
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  TMOut MemSOut;
  THtmlLx Lx(HtmlSIn);
  while (Lx.GetSym()!=hsyEof){
    // whitespace that preceded the symbol is copied first
    MemSOut.PutStr(Lx.PreSpaceChA);
    // try the candidate tags in order; the first successful rewrite wins
    bool RedirP=false;
    if (Lx.Sym==hsyBTag){
      RedirP=
       _IsTagRedir(THtmlTok::ATagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr)||
       _IsTagRedir(THtmlTok::AreaTagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr)||
       _IsTagRedir(THtmlTok::FrameTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr)||
       _IsTagRedir(THtmlTok::ImgTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr);
    }
    if (RedirP){
      MemSOut.PutStr(Lx.GetFullBTagStr());
    } else {
      MemSOut.PutStr(Lx.SymChA());
    }
  }
  return MemSOut.GetAsStr();
}

| PHtmlTok THtmlDoc::GetTok | ( | const int & | TokN | ) | const [inline] |
Definition at line 271 of file html.h.
Referenced by TWebPg::GetOutDescUrlStrKdV(), TWebPg::GetOutUrlV(), and THtmlHldV::THtmlHldV().
{
  // hand back the token stored at position TokN of the token vector
  const PHtmlTok& Tok=TokV[TokN];
  return Tok;
}

| PHtmlTok THtmlDoc::GetTok | ( | const int & | TokN, |
| THtmlLxSym & | Sym, | ||
| TStr & | Str | ||
| ) | const [inline] |
| int THtmlDoc::GetToks | ( | ) | const [inline] |
Definition at line 270 of file html.h.
Referenced by TWebPg::GetOutDescUrlStrKdV(), TWebPg::GetOutUrlV(), and THtmlHldV::THtmlHldV().

| TStr THtmlDoc::GetTxtLnDoc | ( | const TStr & | HtmlStr | ) | [static] |
Definition at line 808 of file html.cpp.
References THtmlLx::ChA, TStr::CStr(), TChA::Empty(), THtmlLx::GetSym(), hsyBTag, hsyEof, hsyETag, hsyNum, hsySSym, hsyStr, TChA::LastCh(), New(), THtmlLx::PreSpaces, and THtmlLx::Sym.
Referenced by SaveHtmlToTxt().
{
  // Flattens HtmlStr into a single text string: string, number and
  // special-symbol tokens are appended, every tag acts as a single-space
  // separator, and everything between a "<SCRIPT>" begin-tag and the
  // matching end-tag is dropped.
  TChA TxtChA;
  // set up the lexer over the input string
  PSIn SIn=TStrIn::New(HtmlStr);
  THtmlLx Lx(SIn);
  bool ScriptP=false;
  // collect the text
  while (Lx.GetSym()!=hsyEof){
    TStr SymStr=Lx.ChA;
    if ((Lx.Sym==hsyStr)||(Lx.Sym==hsyNum)||(Lx.Sym==hsySSym)){
      if (!ScriptP){
        // whitespace that preceded the token collapses to one space
        if (Lx.PreSpaces>0){TxtChA+=' ';}
        TxtChA+=SymStr.CStr();
      }
    } else
    if ((Lx.Sym==hsyBTag)||(Lx.Sym==hsyETag)){
      // a tag separates words, but never produces a leading or doubled space
      if ((!TxtChA.Empty())&&(TxtChA.LastCh()!=' ')){TxtChA+=' ';}
      // a script begin-tag switches skipping on, a script end-tag switches it
      // off; both are compared against "<SCRIPT>" exactly as given by the lexer
      if (SymStr=="<SCRIPT>"){ScriptP=(Lx.Sym==hsyBTag);}
    }
  }
  // hand back the collected text
  return TxtChA;
}


| TStr THtmlDoc::GetTxtLnDoc | ( | const TStr & | HtmlStr, |
| const TStr & | BaseUrlStr, | ||
| const bool & | OutUrlP, | ||
| const bool & | OutTagsP | ||
| ) | [static] |
Definition at line 840 of file html.cpp.
References THtmlLx::ChA, Fail, TStr::GetSubStr(), THtmlLx::GetSym(), THtmlLx::GetTok(), TUrl::GetUrlStr(), TXmlLx::GetXmlStrFromPlainStr(), hsyBTag, hsyEof, hsyETag, hsyMTag, hsyNum, hsySSym, hsyStr, hsyUndef, hsyUrl, TUrl::IsOk(), TChA::LastCh(), TStr::Len(), New(), THtmlLx::PreSpaces, and THtmlLx::Sym.
{
  // Flattens HtmlStr into a single text string. Text tokens are appended as
  // they are; tags are either echoed as <NAME> / </NAME> (OutTagsP) or reduced
  // to a single space. For begin-tags carrying a url that resolves against
  // BaseUrlStr, an XML-escaped <Url>...</Url> element is added when OutUrlP is
  // set. Content between SCRIPT begin- and end-tags is skipped.
  // The output starts with one space, so LastCh() is always defined below.
  TChA TxtChA; TxtChA+=' ';
  // set up the lexer over the input string
  PSIn SIn=TStrIn::New(HtmlStr);
  THtmlLx Lx(SIn);
  bool ScriptP=false;
  // collect the text
  while (Lx.GetSym()!=hsyEof){
    TStr SymStr=Lx.ChA;
    switch (Lx.Sym){
      case hsyStr:
      case hsyNum:
      case hsySSym:
        if (!ScriptP){
          // preceding whitespace becomes one space unless one is already there
          if ((Lx.PreSpaces>0)&&(TxtChA.LastCh()!=' ')){TxtChA+=' ';}
          TxtChA+=SymStr;
        }
        break;
      case hsyBTag:{
        // strip the first and the last character to obtain the tag name
        TStr TagNm=SymStr.GetSubStr(1, SymStr.Len()-2);
        // inside a script every begin-tag is ignored
        if (ScriptP){break;}
        // a script begin-tag only switches skipping on
        if (TagNm=="SCRIPT"){ScriptP=true; break;}
        // echo the tag or replace it by a separator
        if (OutTagsP){
          TxtChA+='<'; TxtChA+=TagNm; TxtChA+='>';
        } else {
          if (TxtChA.LastCh()!=' '){TxtChA+=' ';}
        }
        // emit the url carried by the tag, if any
        PHtmlTok Tok=Lx.GetTok();
        TStr RelUrlStr;
        if (Tok->IsUrlTok(RelUrlStr)){
          PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
          if ((Url->IsOk())&&(OutUrlP)){
            TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
            TxtChA+="<Url>"; TxtChA+=XmlUrlStr; TxtChA+="</Url>";
          }
        }
        break;}
      case hsyETag:{
        // strip the first and the last character to obtain the tag name
        TStr TagNm=SymStr.GetSubStr(1, SymStr.Len()-2);
        if (ScriptP){
          // only the script end-tag is of interest while skipping
          if (TagNm=="SCRIPT"){ScriptP=false;}
        } else
        if (OutTagsP){
          TxtChA+="</"; TxtChA+=TagNm; TxtChA+='>';
        } else {
          if (TxtChA.LastCh()!=' '){TxtChA+=' ';}
        }
        break;}
      case hsyUndef:
      case hsyUrl:
      case hsyMTag:
      case hsyEof:
        // these symbols contribute nothing to the output
        break;
      default: Fail;
    }
  }
  // hand back the collected text
  return TxtChA;
}

| static PHtmlDoc THtmlDoc::Load | ( | TSIn & | ) | [inline, static] |
| static PHtmlDoc THtmlDoc::LoadTxt | ( | const TStr & | FNm, |
| const THtmlDocType & | Type = hdtAll, |
||
| const bool & | DoUc = true |
||
| ) | [inline, static] |
| static PHtmlDoc THtmlDoc::New | ( | const PSIn & | SIn, |
| const THtmlDocType & | Type = hdtAll, |
||
| const bool & | DoUc = true |
||
| ) | [inline, static] |
Definition at line 261 of file html.h.
Referenced by _IsTagRedir(), GetRedirHtmlDocStr(), GetTxtLnDoc(), SaveHtmlToTxt(), and SaveHtmlToXml().

| void THtmlDoc::Save | ( | TSOut & | ) | [inline] |
| void THtmlDoc::SaveHtmlToTxt | ( | const TStr & | HtmlStr, |
| const PSOut & | TxtSOut, | ||
| const TStr & | BaseUrlStr, | ||
| const bool & | OutUrlP, | ||
| const bool & | OutToksP | ||
| ) | [static] |
Definition at line 928 of file html.cpp.
References GetTxtLnDoc(), and TStr::SaveTxt().
Referenced by SaveHtmlToTxt().
{
  // Converts the html-string to its flat text form and writes that text to
  // the given output stream.
  // NOTE(review): this body refers to OutTagsP while the signature shown in
  // this reference names the last parameter OutToksP - presumably the
  // definition in html.cpp uses the name OutTagsP; confirm.
  GetTxtLnDoc(HtmlStr, BaseUrlStr, OutUrlP, OutTagsP).SaveTxt(TxtSOut);
}


| void THtmlDoc::SaveHtmlToTxt | ( | const TStr & | HtmlStr, |
| const TStr & | TxtFNm, | ||
| const TStr & | BaseUrlStr, | ||
| const bool & | OutUrlP, | ||
| const bool & | OutToksP | ||
| ) | [static] |
Definition at line 937 of file html.cpp.
References New(), and SaveHtmlToTxt().
{
  // Opens TxtFNm as a file output stream and delegates to the stream-based
  // overload of SaveHtmlToTxt.
  // NOTE(review): this body refers to OutTagsP while the signature shown in
  // this reference names the last parameter OutToksP - presumably the
  // definition in html.cpp uses the name OutTagsP; confirm.
  SaveHtmlToTxt(HtmlStr, TFOut::New(TxtFNm), BaseUrlStr, OutUrlP, OutTagsP);
}

| void THtmlDoc::SaveHtmlToXml | ( | const TStr & | HtmlStr, |
| const PSOut & | XmlSOut, | ||
| const TStr & | BaseUrlStr, | ||
| const bool & | OutTextP, | ||
| const bool & | OutUrlP, | ||
| const bool & | OutToksP, | ||
| const bool & | OutTagsP, | ||
| const bool & | OutArgsP | ||
| ) | [static] |
Definition at line 946 of file html.cpp.
References TVec< TVal, TSizeTy >::Add(), THtmlLx::ChA, TChA::Clr(), TChA::CStr(), TStr::CStr(), TChA::Empty(), TStr::Empty(), Fail, THtmlLx::GetArgNm(), THtmlLx::GetArgs(), THtmlLx::GetArgVal(), TSOut::GetFileId(), TStr::GetSubStr(), THtmlLx::GetSym(), THtmlLx::GetTok(), TUrl::GetUrlStr(), TXmlLx::GetXmlStrFromPlainStr(), hsyBTag, hsyEof, hsyETag, hsyMTag, hsyNum, hsySSym, hsyStr, hsyUndef, hsyUrl, TUrl::IsOk(), TStr::Len(), TVec< TVal, TSizeTy >::Len(), New(), and THtmlLx::Sym.
Referenced by SaveHtmlToXml().
{
  // Writes an XML rendering of HtmlStr to XmlSOut:
  //   <HtmlDoc> [<BaseUrl>] <Body> ...tokens/tags/urls/text... </Body>
  //   <OutUrls> ...numbered list of all valid urls found... </OutUrls> </HtmlDoc>
  // OutToksP  - emit every text token as <Str>/<Num>/<Sym>
  // OutTextP  - emit runs of text between tags as <Text>
  // OutTagsP  - emit <BTag>/<ETag> elements; OutArgsP adds the tag arguments
  // OutUrlP   - emit a <Url> element right after a begin-tag that carries one
  // Content between SCRIPT begin- and end-tags is skipped.
  // All output goes through fprintf on the file-id of the stream.
  TFileId fOut=XmlSOut->GetFileId();
  // urls collected for the <OutUrls> section
  TStrV UrlStrV;
  // open the top-level element
  fprintf(fOut, "<HtmlDoc>\n");
  // base url, if one was supplied
  if (!BaseUrlStr.Empty()){
    TStr XmlBaseUrlStr=TXmlLx::GetXmlStrFromPlainStr(BaseUrlStr);
    fprintf(fOut, "<BaseUrl>%s</BaseUrl>\n", XmlBaseUrlStr.CStr());
  }
  // set up the lexer over the input string
  PSIn SIn=TStrIn::New(HtmlStr);
  THtmlLx Lx(SIn);
  TChA TextChA;       // XML-escaped text accumulated since the last tag
  bool ScriptP=false; // currently inside a script element
  // body
  fprintf(fOut, "<Body>\n");
  while (Lx.GetSym()!=hsyEof){
    TStr SymStr=Lx.ChA;
    switch (Lx.Sym){
      case hsyStr:
      case hsyNum:
      case hsySSym:
        if (ScriptP){break;}
        SymStr=TXmlLx::GetXmlStrFromPlainStr(SymStr);
        // the element name depends on the kind of the text token
        if (OutToksP){
          if (Lx.Sym==hsyStr){
            fprintf(fOut, " <Str>%s</Str>\n", SymStr.CStr());
          } else
          if (Lx.Sym==hsyNum){
            fprintf(fOut, " <Num>%s</Num>\n", SymStr.CStr());
          } else {
            fprintf(fOut, " <Sym>%s</Sym>\n", SymStr.CStr());
          }
        }
        // extend the running text, separating tokens by one space
        if (!TextChA.Empty()){TextChA+=' ';}
        TextChA+=SymStr;
        break;
      case hsyBTag:
      case hsyETag:{
        // any tag ends the running continuous text
        if (!TextChA.Empty()){
          if (OutTextP){
            fprintf(fOut, " <Text>%s</Text>\n", TextChA.CStr());}
          TextChA.Clr();
        }
        // strip the first and the last character to obtain the tag name
        TStr TagNm=SymStr.GetSubStr(1, SymStr.Len()-2);
        TagNm=TXmlLx::GetXmlStrFromPlainStr(TagNm);
        // end-tag
        if (Lx.Sym==hsyETag){
          if (ScriptP){
            // only the script end-tag is of interest while skipping
            if (TagNm=="SCRIPT"){ScriptP=false;}
          } else
          if (OutTagsP){
            fprintf(fOut, " <ETag Nm=\"%s\"/>\n", TagNm.CStr());
          }
          break;
        }
        // begin-tag: ignored inside a script
        if (ScriptP){break;}
        // a script begin-tag only switches skipping on
        if (TagNm=="SCRIPT"){ScriptP=true; break;}
        // tag element, with or without its arguments
        if (OutTagsP){
          if (OutArgsP){
            fprintf(fOut, " <BTag Nm=\"%s\">\n", TagNm.CStr());
            for (int ArgN=0; ArgN<Lx.GetArgs(); ArgN++){
              TStr ArgNm=TXmlLx::GetXmlStrFromPlainStr(Lx.GetArgNm(ArgN));
              TStr ArgVal=TXmlLx::GetXmlStrFromPlainStr(Lx.GetArgVal(ArgN));
              fprintf(fOut, " <Arg Nm=\"%s\" Val=\"%s\"/>", ArgNm.CStr(), ArgVal.CStr());
            }
            fprintf(fOut, " </BTag>\n");
          } else {
            fprintf(fOut, " <BTag Nm=\"%s\"/>\n", TagNm.CStr());
          }
        }
        // url carried by the tag: always collected, optionally emitted in place
        PHtmlTok Tok=Lx.GetTok();
        TStr RelUrlStr;
        if (Tok->IsUrlTok(RelUrlStr)){
          PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
          if (Url->IsOk()){
            UrlStrV.Add(Url->GetUrlStr());
            if (OutUrlP){
              TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
              fprintf(fOut, " <Url>%s</Url>\n", XmlUrlStr.CStr());
            }
          }
        }
        break;}
      case hsyUndef:
      case hsyUrl:
      case hsyMTag:
      case hsyEof:
        // these symbols contribute nothing to the output
        break;
      default: Fail;
    }
  }
  // continuous text still pending at the end of the input
  if (!TextChA.Empty()){
    if (OutTextP){
      fprintf(fOut, " <Text>%s</Text>\n", TextChA.CStr());}
    TextChA.Clr();
  }
  fprintf(fOut, "</Body>\n");
  // list of outgoing urls, numbered from 1
  fprintf(fOut, "<OutUrls>\n");
  for (int UrlN=0; UrlN<UrlStrV.Len(); UrlN++){
    TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(UrlStrV[UrlN]);
    fprintf(fOut, " <Url N=\"%d\">%s</Url>\n", 1+UrlN, XmlUrlStr.CStr());
  }
  fprintf(fOut, "</OutUrls>\n");
  // close the top-level element
  fprintf(fOut, "</HtmlDoc>\n");
}


| void THtmlDoc::SaveHtmlToXml | ( | const TStr & | HtmlStr, |
| const TStr & | XmlFNm, | ||
| const TStr & | BaseUrlStr, | ||
| const bool & | OutTextP, | ||
| const bool & | OutUrlP, | ||
| const bool & | OutToksP, | ||
| const bool & | OutTagsP, | ||
| const bool & | OutArgsP | ||
| ) | [static] |
Definition at line 1081 of file html.cpp.
References New(), and SaveHtmlToXml().
{
  // Opens XmlFNm as a file output stream and delegates to the stream-based
  // overload of SaveHtmlToXml, passing all output switches through unchanged.
  SaveHtmlToXml(
   HtmlStr, TFOut::New(XmlFNm), BaseUrlStr,
   OutTextP, OutUrlP, OutToksP, OutTagsP, OutArgsP);
}

| void THtmlDoc::SaveTxt | ( | const PSOut & | SOut, |
| const bool & | TxtMode = true |
||
| ) | const |
Definition at line 915 of file html.cpp.
References TInt::GetStr(), TVec< TVal, TSizeTy >::Len(), TSOut::PutLn(), TSOut::PutStr(), and TokV.
Referenced by THtmlHldV::THtmlHldV().
{
  // Writes all tokens of the document to SOut.
  // TxtMode=true  : tokens are written one after another, followed by a
  //                 single line break at the end
  // TxtMode=false : one line per token, prefixed by "<token index>: "
  const int Toks=TokV.Len();
  if (TxtMode){
    for (int TokN=0; TokN<Toks; TokN++){
      TokV[TokN]->SaveTxt(SOut);
    }
    SOut->PutLn();
    return;
  }
  for (int TokN=0; TokN<Toks; TokN++){
    // index prefix
    SOut->PutStr(TInt::GetStr(TokN));
    SOut->PutStr(": ");
    // token followed by its own line break
    TokV[TokN]->SaveTxt(SOut);
    SOut->PutLn();
  }
}


TCRef THtmlDoc::CRef [private] |
THtmlTokV THtmlDoc::TokV [private] |
Definition at line 256 of file html.h.
Referenced by SaveTxt(), and THtmlDoc().