Update Data.h, DomainDeal, DomainDeal.cpp files

This commit is contained in:
张硕
2019-12-31 11:10:43 +08:00
parent d5d15da57e
commit 6fe2e67322
3 changed files with 211 additions and 0 deletions

110
Data.h Normal file
View File

@@ -0,0 +1,110 @@
#pragma once
// Directory prefix of the prebuilt lookup tables (see CDN_FILE / URL_FILE).
#define LIB_PATH "lib/"
// Directory prefix of every output file: the per-key files opened by AddMap
// and the statistics file.
#define DATA_PATH "data/"
// Serialized table that main() loads into the AcMachine used for CDN matching.
#define CDN_FILE LIB_PATH "CdnDomainList.dat"
// Serialized table that main() loads into the TrieTree used for URL matching.
#define URL_FILE LIB_PATH "UrlDomainList.dat"
// Partition key main() uses for domains matched by the CDN table only.
#define SPCDN_NAME "spcdn"
// Partition key main() uses for domains matched by neither table.
#define OTHER_NAME "other"
// Base name (without extension) of the statistics file.
#define STATIS_NAME "statis"
// Label of the total row written first by Statistic().
#define ALL_NAME "all"
// Full path of the statistics file: DATA_PATH + STATIS_NAME + ".txt".
#define STATIS_FILE DATA_PATH STATIS_NAME ".txt"
// main() rewrites the statistics and prints progress whenever the line
// counter is a multiple of this value.
#define OUTPUT_INTERVAL 500000
// Field enumeration
// Zero-based column positions of one tab-separated input record, in file order.
// NOTE(review): only e_domain and e_end are referenced in this file; what the
// other columns hold is suggested by their names only and should be confirmed
// against the input format.
enum FileForm
{
    e_sip,
    e_dip,
    e_domain,   // column holding the domain name that gets classified
    e_qtype,
    e_qcnt,
    e_ratio,
    e_dir,
    e_auth,
    e_rval,
    e_rtype,
    e_rcode,
    e_ttl,
    e_time,
    e_qlen,
    e_rlen,
    e_rother,
    e_end,      // number of columns, not a column itself
};
// Partition structure
// One output category: the file its records are written to plus a running
// record count.
struct Partition
{
    std::ofstream ofs;   // output stream of this category's file
    long long cnt = 0;   // number of records written so far (maintained by the caller)

    // Opens the category file and writes the header text to it.
    //   name    - path of the file to open; used verbatim.
    //   strHead - text written at the start of the file (may be empty).
    // The open result is not checked: if the file cannot be opened the stream
    // is left in a failed state and later writes are silently dropped.
    // NOTE(review): the previous body built an identifier-sanitized copy of
    // `name` but never used it; that dead computation has been removed. The
    // file is still opened under `name` exactly as before, so keys containing
    // characters unsuitable for a file name are NOT sanitized — confirm
    // whether sanitizing the key part was intended.
    void Init(const std::string &name, const std::string &strHead)
    {
        ofs.open(name);
        ofs <<strHead;
    }
};
// Domain name normalization
// Returns true when ch may appear in a normalized domain name: an ASCII
// digit, an ASCII lowercase letter, or one of '-', '_' and '.'.
// Uppercase letters are rejected.
inline bool IsDomainChar(char ch)
{
    const bool isDigit = ch>='0' && ch<='9';
    const bool isLower = ch>='a' && ch<='z';
    if(isDigit || isLower)
        return true;
    switch(ch) {
    case '-':
    case '_':
    case '.':
        return true;
    default:
        return false;
    }
}
// Normalizes a domain name in place and validates it.
//   domain - rewritten with every character passed through UppCharToLowChar.
// Returns true when every character of the rewritten string satisfies
// IsDomainChar (an empty string therefore yields true), false otherwise.
// The in-place rewrite is kept even when false is returned.
inline bool FormDomain(string &domain)
{
    // Convert uppercase to lowercase.
    for(char &ch: domain)
        ch = UppCharToLowChar(ch);
    // Reject the name if any unexpected character is left.
    return std::all_of(domain.begin(), domain.end(), IsDomainChar);
}
// CDN lookup function
// Looks a domain up in the CDN matcher.
//   mtCdn  - matcher queried through Judge().
//   domain - domain to test; modified temporarily during the call and
//            restored to its original content before returning.
// Returns whatever mtCdn.Judge() returns for the adjusted text (callers in
// this file treat a null pointer as "no match").
// The text handed to the matcher is the domain without a trailing '.', with
// '#' appended. NOTE(review): '#' presumably acts as an end-of-name marker
// that the stored patterns also carry — confirm against the table builder.
inline string *FastFindCdn(AcMachine<char, string> &mtCdn, string &domain)
{
    // Strip a trailing dot if present. The emptiness check matters: calling
    // back() on an empty string is undefined behaviour, and an empty domain
    // can reach this function.
    const bool bBackDot = !domain.empty() && domain.back()=='.';
    if(bBackDot)
        domain.pop_back();
    // Append the temporary suffix.
    domain.push_back('#');
    // Match.
    auto ret = mtCdn.Judge(SHOW_BEGIN_END(domain));
    // Undo the temporary edits so the caller sees the original string.
    domain.pop_back();
    if(bBackDot)
        domain.push_back('.');
    return ret;
}
// Add-record function
inline void AddMap(std::map<string, Partition> &mapPar, const string &strHead,
const string &key, const string &strLine)
{
auto res = mapPar.emplace(piecewise_construct, std::tie(key), make_tuple());
if(res.second)
res.first->second.Init(DATA_PATH+res.first->first, strHead);
res.first->second.ofs <<strLine <<"\n";
++ res.first->second.cnt;
}
// Statistics function
// Rewrites the statistics report from the start of `ofs` and flushes every
// partition file.
//   ofs      - statistics stream; repositioned to offset 0 and overwritten
//              (nothing here truncates it).
//   mapPar   - partitions to report; each one's stream is flushed as well.
//   cntValid - total used for the ALL_NAME row and as the percentage denominator.
// Output: one "<name> <count> <percent>%" row per partition after the total
// row, in fixed-point notation, terminated by an "end" line and two blank lines.
inline void Statistic(ofstream &ofs, std::map<string, Partition> &mapPar, long long cntValid)
{
    ofs.seekp(0);
    ofs <<std::fixed;
    ofs <<ALL_NAME <<" " <<cntValid <<" " <<"100%\n";
    for(auto it=mapPar.begin(); it!=mapPar.end(); ++it) {
        Partition &part = it->second;
        const double percent = (double)part.cnt*100/cntValid;
        ofs <<it->first <<" " <<part.cnt <<" " <<percent <<"%\n";
        part.ofs.flush();
    }
    ofs <<"end\n\n\n";
    ofs.flush();
}

BIN
DomainDeal Normal file

Binary file not shown.

101
DomainDeal.cpp Normal file
View File

@@ -0,0 +1,101 @@
// DomainDeal.cpp : This file contains the "main" function. Program execution begins and ends there.
//
#include "Common.h"
#include "Data.h"
// Global assertion helper, constructed with cout and AssertOption::thrw_log.
// main() calls it as g_asst(condition, message) around the metadata loads.
// NOTE(review): presumably logs the message to cout and throws when the
// condition is false — confirm in Common.h.
AssertOperator<> g_asst(cout, AssertOption::thrw_log);
// Entry point. Reads the tab-separated file named by argv[1], classifies the
// domain column of every well-formed record, writes each domain to the file
// of its category under DATA_PATH, and maintains a statistics report in
// STATIS_FILE. Returns -1 when the argument is missing or the input cannot
// be opened.
int main(int argc, char *argv[])
{
// Check the command line: the input file name is required.
if(argc<2) {
cout <<"input filename as arg" <<endl;
return -1;
}
// Open the input file.
ifstream ifs(argv[1]);
if(!ifs.is_open()) {
cout <<"cant open file" <<endl;
return -1;
}
cout <<"read metadata start\n";
// Load the metadata: the CDN matcher and the URL trie are deserialized from
// CDN_FILE and URL_FILE. The meaning of the BinReadFile constructor flags and
// of Close(true) is not visible in this file.
AcMachine<char, string> mtCdn;
TrieTree<char, string> mtUrl;
BinReadFile brf(true, true);
g_asst(brf.Open(CDN_FILE)
&& (brf >>mtCdn)
&& brf.Close(true),
"cant read cdn data\n");
g_asst(brf.Open(URL_FILE)
&& (brf >>mtUrl)
&& brf.Close(true),
"cant read url data\n");
cout <<"read metadata success\n";
// Set up the output structures: one Partition per category key plus the
// statistics file. NOTE(review): the open result of ofs is not checked.
std::map<string, Partition> mapPar;
ofstream ofs(STATIS_FILE);
// Skip the leading line(s) of the input (exactly one with the loop bound below).
cout <<"deal dns start\n";
string strHead;
// cntLine: lines consumed, including the skipped one; cntForm: lines with the
// expected field count; cntValid: lines whose domain passed FormDomain.
long long cntLine= 0, cntForm= 0, cntValid= 0;
// limLine<=0 disables the line limit (see the loop condition below).
constexpr long long limLine = -1;
// With bOutHead false the skipped line is discarded and strHead stays empty.
constexpr bool bOutHead = false;
for(int i=0; i!=1; ++i) {
string str;
std::getline(ifs, str);
// NOTE(review): operator<< on a string target is not a standard operator;
// presumably an append helper from Common.h — confirm.
if(bOutHead)
strHead <<str <<"\n";
++ cntLine;
}
// Read the remaining input line by line.
for(string strLine; std::getline(ifs, strLine)
&& (limLine<=0 || cntLine<limLine); ++cntLine)
{
// Periodically rewrite the intermediate statistics and show progress.
if(cntLine%OUTPUT_INTERVAL==0) {
Statistic(ofs, mapPar, cntValid);
cout <<"\r" <<cntLine <<" lines" <<flush;
}
// Split the line into at most e_end tab-separated fields.
// NOTE(review): the slot is appended before the read is attempted, so a line
// with only e_end-1 fields (or with an empty last field) still ends up with
// e_end elements and passes the size check below; fields beyond e_end are
// ignored. Confirm this leniency is intended.
istringstream iss(strLine);
vector<string> vec;
for(int i=0; i!=e_end; ++i) {
vec.emplace_back();
if(!std::getline(iss, vec.back(), '\t')) {
break;
}
}
if(vec.size()!=e_end)
continue;
++ cntForm;
// Normalize the domain in place; skip the record if it has invalid characters.
if(!FormDomain(vec[e_domain]))
continue;
++ cntValid;
// Classify the domain. Only the domain itself (strOut aliases vec[e_domain])
// is written to the category file, not the whole record.
string &strOut = vec[e_domain];
string *res;
// The URL trie is queried with the domain's characters in reverse order; on a
// match the string it returns is used as the category key.
if((res= mtUrl.Judge(vec[e_domain].rbegin(), vec[e_domain].rend())))
AddMap(mapPar, strHead, *res, strOut);
// Otherwise try the CDN matcher. FastFindCdn edits the domain temporarily but
// restores it before returning, so strOut is unchanged when AddMap runs.
else if((res= FastFindCdn(mtCdn, vec[e_domain])))
AddMap(mapPar, strHead, SPCDN_NAME, strOut);
else
AddMap(mapPar, strHead, OTHER_NAME, strOut);
}
cout <<"\ndeal dns success\n";
cout <<"cntLine: " <<cntLine <<"\n"
<<"cntForm: " <<cntForm <<"\n"
<<"cntValid: " <<cntValid <<"\n";
// Write the final statistics.
Statistic(ofs, mapPar, cntValid);
}