|
SNAP Library 2.0, Developer Reference
2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
#include <html.h>

Public Member Functions | |
| TWebPg () | |
| TWebPg (const TStrV &_UrlStrV, const TStrV &_IpNumV, const PHttpResp &_HttpResp) | |
| ~TWebPg () | |
| TWebPg (TSIn &) | |
| void | Save (TSOut &) |
| TWebPg & | operator= (const TWebPg &) |
| int | GetUrls () const |
| TStr | GetUrlStr (const int &UrlN=-1) const |
| PUrl | GetUrl (const int &UrlN=-1) const |
| int | GetIps () const |
| TStr | GetIpNum (const int &IpN=-1) const |
| PHttpResp | GetHttpResp () const |
| TStr | GetHttpHdStr () const |
| TStr | GetHttpBodyAsStr () const |
| void | GetOutUrlV (TUrlV &OutUrlV, TUrlV &OutRedirUrlV) const |
| void | GetOutUrlV (TUrlV &OutUrlV) const |
| void | GetOutDescUrlStrKdV (TStrKdV &OutDescUrlStrKdV) const |
| void | PutFetchMSecs (const uint64 &_FetchMSecs) |
| uint64 | GetFetchMSecs () const |
| void | SaveAsHttpBody (const TStr &FNm) const |
| void | SaveAsHttp (const TStr &FNm) const |
| bool | IsTxt () const |
Static Public Member Functions | |
| static PWebPg | New (const TStrV &UrlStrV, const TStrV &IpNumV, const PHttpResp &HttpResp) |
| static PWebPg | New (const TStrV &UrlStrV, const PHttpResp &HttpResp) |
| static PWebPg | New (const TStr &UrlStr, const PHttpResp &HttpResp) |
| static PWebPg | Load (TSIn &) |
Private Attributes | |
| TCRef | CRef |
| TStrV | UrlStrV |
| TStrV | IpNumV |
| PHttpResp | HttpResp |
| uint64 | FetchMSecs |
Friends | |
| class | TPt< TWebPg > |
| TWebPg::TWebPg | ( | ) | [inline] |
| TWebPg::TWebPg | ( | const TStrV & | _UrlStrV, |
| const TStrV & | _IpNumV, | ||
| const PHttpResp & | _HttpResp | ||
| ) | [inline] |
| TWebPg::~TWebPg | ( | ) | [inline] |
| uint64 TWebPg::GetFetchMSecs | ( | ) | const [inline] |
Definition at line 377 of file html.h.
// Returns the current value of the FetchMSecs member (presumably the page fetch time in milliseconds, judging by the name -- confirm with the caller that sets it).
{return FetchMSecs;}
| TStr TWebPg::GetHttpBodyAsStr | ( | ) | const [inline] |
Definition at line 368 of file html.h.
Referenced by GetOutDescUrlStrKdV() and GetOutUrlV().
{
  // Fetch the HTTP response handle first, then ask it for its body as a string.
  PHttpResp Resp=GetHttpResp();
  return Resp->GetBodyAsStr();
}

| TStr TWebPg::GetHttpHdStr | ( | ) | const [inline] |
Definition at line 367 of file html.h.
{
  // Fetch the HTTP response handle first, then ask it for its header string.
  PHttpResp Resp=GetHttpResp();
  return Resp->GetHdStr();
}
| PHttpResp TWebPg::GetHttpResp | ( | ) | const [inline] |
| TStr TWebPg::GetIpNum | ( | const int & | IpN = -1 | ) | const [inline] |
| int TWebPg::GetIps | ( | ) | const [inline] |
| void TWebPg::GetOutDescUrlStrKdV | ( | TStrKdV & | OutDescUrlStrKdV | ) | const |
Definition at line 1258 of file html.cpp.
References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), TChA::Empty(), GetHttpBodyAsStr(), THtmlDoc::GetTok(), THtmlDoc::GetToks(), TUrl::GetUrlStr(), GetUrlStr(), hsyBTag, hsyETag, hsyNum, hsySSym, hsyStr, TUrl::IsOk(), and New().
{
  // Fills OutDescUrlStrKdV with one (description, url-string) pair for every
  // anchor begin-tag in the page body that carries an acceptable URL. The
  // description is the space-joined string/number/symbol tokens found between
  // the anchor begin-tag and the next anchor end-tag.

  // start from an empty result
  OutDescUrlStrKdV.Clr();
  // page url (passed to TUrl::New together with each link) and html body
  TStr PageUrlStr=GetUrlStr();
  TStr BodyStr=GetHttpBodyAsStr();
  // tokenize the body
  PSIn BodySIn=TStrIn::New(BodyStr);
  PHtmlDoc Doc=THtmlDoc::New(BodySIn);
  // single cursor shared by the outer scan and the inner description scan
  PHtmlTok CurTok; THtmlLxSym CurSym; TStr CurStr;
  const int Toks=Doc->GetToks();
  int TokN=0;
  while (TokN<Toks){
    CurTok=Doc->GetTok(TokN, CurSym, CurStr);
    TokN++;
    // only anchor begin-tags are of interest
    const bool IsAnchorBegin=(CurSym==hsyBTag)&&(CurStr==THtmlTok::ATagNm);
    if (!IsAnchorBegin){continue;}
    // the tag must carry a url ...
    TStr LinkStr;
    if (!CurTok->IsUrlTok(LinkStr)){continue;}
    // ... that yields an ok url when combined with the page url
    PUrl LinkUrl=TUrl::New(LinkStr, PageUrlStr);
    if (!LinkUrl->IsOk()){continue;}
    // gather description tokens up to the anchor end-tag (or end of document)
    TChA DescChA;
    while (TokN<Toks){
      CurTok=Doc->GetTok(TokN, CurSym, CurStr);
      TokN++;
      if ((CurSym==hsyETag)&&(CurStr==THtmlTok::ATagNm)){break;}
      const bool IsDescTok=(CurSym==hsyStr)||(CurSym==hsyNum)||(CurSym==hsySSym);
      if (!IsDescTok){continue;}
      // separate consecutive description tokens by a single space
      if (!DescChA.Empty()){DescChA+=' ';}
      DescChA+=CurStr;
    }
    OutDescUrlStrKdV.Add(TStrKd(DescChA, LinkUrl->GetUrlStr()));
  }
}

| void TWebPg::GetOutUrlV | ( | TUrlV & | OutUrlV, |
| TUrlV & | OutRedirUrlV | ||
| ) | const |
Definition at line 1230 of file html.cpp.
References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), GetHttpBodyAsStr(), THtmlDoc::GetTok(), THtmlDoc::GetToks(), GetUrlStr(), hsyBTag, TUrl::IsOk(), New(), and usHttp.
{
  // Collects the outgoing URLs of this page.
  //   OutUrlV      - receives every URL taken from a begin-tag that passes
  //                  IsUrlTok() and whose TUrl is ok for the usHttp scheme.
  //   OutRedirUrlV - receives the subset of those URLs whose tag also
  //                  reports IsRedirUrlTok().
  // Both vectors are cleared before anything is added.

  // create outgoing url vectors
  OutUrlV.Clr(); OutRedirUrlV.Clr();
  // take interesting web-page components
  TStr UrlStr=GetUrlStr();
  TStr HtmlStr=GetHttpBodyAsStr();
  // prepare html parsing
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
  // the token count is read once, as GetOutDescUrlStrKdV() does; the document
  // is not modified while it is traversed
  const int Toks=HtmlDoc->GetToks();
  // traverse html (the formerly shadowed, never-used outer 'Tok' is gone)
  for (int TokN=0; TokN<Toks; TokN++){
    PHtmlTok Tok=HtmlDoc->GetTok(TokN);
    // only begin-tags can carry links
    if (Tok->GetSym()!=hsyBTag){continue;}
    TStr RelUrlStr;
    if (!Tok->IsUrlTok(RelUrlStr)){continue;}
    // build the url from the tag's url string and this page's url
    PUrl Url=TUrl::New(RelUrlStr, UrlStr);
    if (!Url->IsOk(usHttp)){continue;}
    OutUrlV.Add(Url);
    if (Tok->IsRedirUrlTok()){
      OutRedirUrlV.Add(Url);
    }
  }
}

| void TWebPg::GetOutUrlV | ( | TUrlV & | OutUrlV | ) | const [inline] |
Definition at line 371 of file html.h.
{
  // Convenience overload: the redirect URLs are collected into a local
  // vector that is discarded, so the caller only receives OutUrlV.
  TUrlV IgnoredRedirUrlV;
  GetOutUrlV(OutUrlV, IgnoredRedirUrlV);
}
| PUrl TWebPg::GetUrl | ( | const int & | UrlN = -1 | ) | const [inline] |
| int TWebPg::GetUrls | ( | ) | const [inline] |
| TStr TWebPg::GetUrlStr | ( | const int & | UrlN = -1 | ) | const [inline] |
Definition at line 355 of file html.h.
Referenced by GetOutDescUrlStrKdV() and GetOutUrlV().

| bool TWebPg::IsTxt | ( | ) | const |
Definition at line 1310 of file html.cpp.
References TCh::CrCh, THttpResp::GetBodyAsStr(), HttpResp, THttpResp::IsContType(), TStr::Len(), TCh::LfCh, TCh::TabCh, and THttp::TextFldVal.
{
  // Heuristic test for whether the HTTP body is text.
  // Returns false right away when the response declares a content type that
  // is not THttp::TextFldVal. Otherwise samples at most the first 100
  // characters of the body and returns true when more than 90% of the sampled
  // characters are printable ASCII (' '..'~'), tab, line feed or carriage
  // return. An empty body yields false.
  if (HttpResp->IsContType()&&(!HttpResp->IsContType(THttp::TextFldVal))){
    return false;
  }
  TStr Str=HttpResp->GetBodyAsStr();
  const int StrLen=Str.Len();
  const int MxSampleChs=100;
  const int SampleChs=(StrLen<MxSampleChs) ? StrLen : MxSampleChs;
  // nothing to sample: keep the previous answer (false) and avoid 0/0
  if (SampleChs==0){return false;}
  int PrintChs=0;
  for (int ChN=0; ChN<SampleChs; ChN++){
    const char Ch=Str[ChN];
    if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){
      PrintChs++;}
  }
  // Divide by the number of characters actually sampled. The earlier code
  // divided by one more than that, which capped the ratio of a fully
  // printable body of fewer than 10 characters at 0.9 and so rejected it.
  const double PrintPrb=double(PrintChs)/double(SampleChs);
  return PrintPrb>0.9;
}

| static PWebPg TWebPg::Load | ( | TSIn & | ) | [inline, static] |
| static PWebPg TWebPg::New | ( | const TStrV & | UrlStrV, |
| const TStrV & | IpNumV, | ||
| const PHttpResp & | HttpResp | ||
| ) | [inline, static] |
Definition at line 340 of file html.h.
Referenced by GetOutDescUrlStrKdV(), GetOutUrlV(), SaveAsHttp(), and SaveAsHttpBody().
{
  // Allocate a TWebPg from the given url strings, ip numbers and http
  // response, and hand it back wrapped in a PWebPg.
  return PWebPg(new TWebPg(UrlStrV, IpNumV, HttpResp));
}

| static PWebPg TWebPg::New | ( | const TStrV & | UrlStrV, |
| const PHttpResp & | HttpResp | ||
| ) | [inline, static] |
| static PWebPg TWebPg::New | ( | const TStr & | UrlStr, |
| const PHttpResp & | HttpResp | ||
| ) | [inline, static] |
| void TWebPg::PutFetchMSecs | ( | const uint64 & | _FetchMSecs | ) | [inline] |
Definition at line 376 of file html.h.
// Stores _FetchMSecs into the FetchMSecs member; no validation is performed.
{FetchMSecs=_FetchMSecs;}
| void TWebPg::Save | ( | TSOut & | ) | [inline] |
| void TWebPg::SaveAsHttp | ( | const TStr & | FNm | ) | const |
Definition at line 1303 of file html.cpp.
References HttpResp, New(), and THttpResp::SaveTxt().
{
  // Writes the HTTP response to the file named FNm via THttpResp::SaveTxt().
  // open the destination file
  PSOut FileSOut=TFOut::New(FNm);
  // let the response write itself to the file stream
  HttpResp->SaveTxt(FileSOut);
}

| void TWebPg::SaveAsHttpBody | ( | const TStr & | FNm | ) | const |
Definition at line 1296 of file html.cpp.
References HttpResp, New(), and THttpResp::SaveBody().
{
  // Writes the HTTP response body to the file named FNm via
  // THttpResp::SaveBody().
  // open the destination file
  PSOut FileSOut=TFOut::New(FNm);
  // let the response write its body to the file stream
  HttpResp->SaveBody(FileSOut);
}

TCRef TWebPg::CRef [private] |
uint64 TWebPg::FetchMSecs [private] |
PHttpResp TWebPg::HttpResp [private] |
Definition at line 334 of file html.h.
Referenced by IsTxt(), SaveAsHttp(), and SaveAsHttpBody().
TStrV TWebPg::IpNumV [private] |
TStrV TWebPg::UrlStrV [private] |