diff --git a/Data.h b/Data.h new file mode 100644 index 0000000..b1e690f --- /dev/null +++ b/Data.h @@ -0,0 +1,110 @@ +#pragma once + + +#define LIB_PATH "lib/" +#define DATA_PATH "data/" +#define CDN_FILE LIB_PATH "CdnDomainList.dat" +#define URL_FILE LIB_PATH "UrlDomainList.dat" + +#define SPCDN_NAME "spcdn" +#define OTHER_NAME "other" +#define STATIS_NAME "statis" +#define ALL_NAME "all" + +#define STATIS_FILE DATA_PATH STATIS_NAME ".txt" + +#define OUTPUT_INTERVAL 500000 + + +//Field enumeration +enum FileForm +{ + e_sip, e_dip, e_domain, e_qtype, e_qcnt, + e_ratio, e_dir, e_auth, e_rval, e_rtype, + e_rcode, e_ttl, e_time, e_qlen, e_rlen, + e_rother, + e_end, +}; + + +//Partition structure +struct Partition +{ + ofstream ofs; + long long cnt = 0; + void Init(const string &name, const string &strHead) + { + string formName = name; + auto lbdForm = [](char ch) { + if(IsIdChar(ch)) + return ch; + else + return '_'; + }; + std::transform(SHOW_BEGIN_END(formName), formName.begin(), lbdForm); + ofs.open(name); + ofs <='0' && ch<='9') + || (ch>='a' && ch<='z') + || ch=='-' || ch=='_' || ch=='.'; +} +inline bool FormDomain(string &domain) +{ + //Convert uppercase to lowercase + std::transform(domain.begin(), domain.end(), domain.begin(), UppCharToLowChar); + //Test whether there are any strange characters + if(std::find_if_not(domain.begin(), domain.end(), IsDomainChar)!=domain.end()) + return false; + return true; +} + + +//CDN lookup function +inline string *FastFindCdn(AcMachine &mtCdn, string &domain) +{ + //Handle the trailing dot + bool bBackDot = domain.back()=='.'; + if(bBackDot) + domain.pop_back(); + //Append the new suffix + domain.push_back('#'); + //Match + auto ret = mtCdn.Judge(SHOW_BEGIN_END(domain)); + domain.pop_back(); + if(bBackDot) + domain.push_back('.'); + return ret; +} + + +//Add-information function +inline void AddMap(std::map &mapPar, const string &strHead, + const string &key, const string &strLine) +{ + auto res = mapPar.emplace(piecewise_construct, std::tie(key), make_tuple()); + if(res.second) + res.first->second.Init(DATA_PATH+res.first->first, 
strHead); + res.first->second.ofs <second.cnt; +} + + +//Statistics function +inline void Statistic(ofstream &ofs, std::map &mapPar, long long cntValid) +{ + ofs.seekp(0); + ofs < g_asst(cout, AssertOption::thrw_log); + +int main(int argc, char *argv[]) +{ + + //Check the arguments + if(argc<2) { + cout <<"input filename as arg" < mtCdn; + TrieTree mtUrl; + BinReadFile brf(true, true); + g_asst(brf.Open(CDN_FILE) + && (brf >>mtCdn) + && brf.Close(true), + "cant read cdn data\n"); + g_asst(brf.Open(URL_FILE) + && (brf >>mtUrl) + && brf.Close(true), + "cant read url data\n"); + cout <<"read metadata success\n"; + + //Set up the output structures + std::map mapPar; + ofstream ofs(STATIS_FILE); + + //Exclude lines + cout <<"deal dns start\n"; + string strHead; + long long cntLine= 0, cntForm= 0, cntValid= 0; + constexpr long long limLine = -1; + constexpr bool bOutHead = false; + for(int i=0; i!=1; ++i) { + string str; + std::getline(ifs, str); + if(bOutHead) + strHead < vec; + for(int i=0; i!=e_end; ++i) { + vec.emplace_back(); + if(!std::getline(iss, vec.back(), '\t')) { + break; + } + } + if(vec.size()!=e_end) + continue; + ++ cntForm; + //Process the domain name + if(!FormDomain(vec[e_domain])) + continue; + ++ cntValid; + //Look up the CDN + string &strOut = vec[e_domain]; + string *res; + if((res= mtUrl.Judge(vec[e_domain].rbegin(), vec[e_domain].rend()))) + AddMap(mapPar, strHead, *res, strOut); + else if((res= FastFindCdn(mtCdn, vec[e_domain]))) + AddMap(mapPar, strHead, SPCDN_NAME, strOut); + else + AddMap(mapPar, strHead, OTHER_NAME, strOut); + } + cout <<"\ndeal dns success\n"; + cout <<"cntLine: " <