diff --git a/properties/core-site.xml b/properties/core-site.xml deleted file mode 100644 index 93dfb1d..0000000 --- a/properties/core-site.xml +++ /dev/null @@ -1,71 +0,0 @@ - - - - - - - - - fs.defaultFS - hdfs://ns1 - - - hadoop.tmp.dir - file:/opt/hadoop/tmp - - - io.file.buffer.size - 131702 - - - hadoop.proxyuser.root.hosts - * - - - hadoop.proxyuser.root.groups - * - - - hadoop.logfile.size - 10000000 - The max size of each log file - - - - hadoop.logfile.count - 1 - The max number of log files - - - ha.zookeeper.quorum - master:2181,slave1:2181,slave2:2181 - - -    -     fs.hdfs.impl   -     org.apache.hadoop.hdfs.DistributedFileSystem   -     The FileSystem for hdfs: uris.   - - - -io.compression.codecs -com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec - - -io.compression.codec.lzo.class -com.hadoop.compression.lzo.LzoCodec - - - diff --git a/properties/hbase-site.xml b/properties/hbase-site.xml deleted file mode 100644 index 54554e4..0000000 --- a/properties/hbase-site.xml +++ /dev/null @@ -1,77 +0,0 @@ - - - - - - hbase.rootdir - hdfs://ns1/hbase-1.4.9 - - - hbase.cluster.distributed - true - - - hbase.zookeeper.quorum - 192.168.40.119,192.168.40.122,192.168.40.123 - - -hbase.master.info.port -60010 - - - - phoenix.schema.isNamespaceMappingEnabled - true - - - phoenix.schema.mapSystemTablesToNamespace - true - - - hbase.client.keyvalue.maxsize - 99428800 - - - hbase.server.keyvalue.maxsize - 99428800 - - - hbase.regionserver.wal.codec - org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec - - - phoenix.query.timeoutMs - 1800000 - - - hbase.rpc.timeout - 1200000 - - - hbase.client.scanner.caching - 1000 - - - hbase.client.scanner.timeout.period - 1200000 - - diff --git a/properties/hdfs-site.xml b/properties/hdfs-site.xml deleted file mode 100644 index 1e148b7..0000000 --- a/properties/hdfs-site.xml +++ /dev/null @@ -1,116 +0,0 @@ - - - - - - - - - dfs.namenode.name.dir - file:/home/ceiec/hadoop/dfs/name - - - dfs.datanode.data.dir - file:/home/ceiec/hadoop/dfs/data - - - dfs.replication - 2 - - - dfs.namenode.secondary.http-address - 192.168.40.119:9001 - - - dfs.webhdfs.enabled - true - - - dfs.permissions - false - - - dfs.permissions.enabled - false - - - dfs.nameservices - ns1 - - - dfs.blocksize - 134217728 - - - dfs.ha.namenodes.ns1 - nn1,nn2 - - - - dfs.namenode.rpc-address.ns1.nn1 - 192.168.40.119:8020 - - - - dfs.namenode.http-address.ns1.nn1 - 192.168.40.119:50070 - - - - dfs.namenode.rpc-address.ns1.nn2 - 192.168.40.122:8020 - - - - dfs.namenode.http-address.ns1.nn2 - 192.168.40.122:50070 - - - - dfs.namenode.shared.edits.dir - qjournal://192.168.40.119:8485;192.168.40.122:8485;192.168.40.123:8485/ns1 - - - - dfs.journalnode.edits.dir - /home/ceiec/hadoop/journal - - - - dfs.client.failover.proxy.provider.ns1 - org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider - - - - dfs.ha.fencing.methods - sshfence - - - - dfs.ha.fencing.ssh.private-key-files - /root/.ssh/id_rsa - - - - dfs.ha.fencing.ssh.connect-timeout - 30000 - - - - dfs.ha.automatic-failover.enabled - true - - - diff --git a/properties/redis_config.properties b/properties/redis_config.properties deleted file mode 100644 index 3c529de..0000000 --- a/properties/redis_config.properties +++ /dev/null @@ -1,19 +0,0 @@ -#*****************jedis连接参数设置********************* -#redis服务器ip -redis.ip=192.168.40.153 -#redis服务器端口号 -redis.port=19000 -#与服务器建立连接的超时时间 -redis.timeout=3000 -#************************jedis池参数设置******************* -#jedis的最大活跃连接数 -redis.pool.maxActive=200 -#jedis最大空闲连接数 -redis.pool.maxIdle=5 -#jedis池没有连接对象返回时,等待可用连接的最大时间,单位毫秒,默认值为-1,表示永不超时。 -#如果超过等待时间,则直接抛出JedisConnectionException -redis.pool.maxWait=-1 -#从池中获取连接的时候,是否进行有效检查 -redis.pool.testOnBorrow=true -#归还连接的时候,是否进行有效检查 -redis.pool.testOnReturn=true diff --git a/src/main/java/cn/ac/iie/common/FlowWriteConfig.java b/src/main/java/cn/ac/iie/common/FlowWriteConfig.java index cb5fc4c..f91a57a 100644 --- a/src/main/java/cn/ac/iie/common/FlowWriteConfig.java +++ b/src/main/java/cn/ac/iie/common/FlowWriteConfig.java @@ -10,7 +10,7 @@ public class FlowWriteConfig { public static final String LOG_STRING_SPLITTER = "\t"; public static final String SQL_STRING_SPLITTER = "#"; - public static final String SEGMENTATION = ","; + public static final String DOMAIN_SPLITTER = "."; /** * System diff --git a/src/main/java/cn/ac/iie/utils/general/TransFormUtils.java b/src/main/java/cn/ac/iie/utils/general/TransFormUtils.java index 0340776..79e067e 100644 --- a/src/main/java/cn/ac/iie/utils/general/TransFormUtils.java +++ b/src/main/java/cn/ac/iie/utils/general/TransFormUtils.java @@ -12,6 +12,9 @@ import com.zdjizhi.utils.IpLookup; import com.zdjizhi.utils.StringUtil; import org.apache.log4j.Logger; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -25,7 +28,10 @@ import java.util.regex.Pattern; public class TransFormUtils { private static Logger logger = Logger.getLogger(TransFormUtils.class); - private static Pattern WEB_PATTERN = Pattern.compile("[^\\\\.]+(\\.com\\.cn|\\.net\\.cn|\\.org\\.cn|\\.gov\\.cn|\\.com|\\.net|\\.cn|\\.org|\\.cc|\\.me|\\.tel|\\.mobi|\\.asia|\\.biz|\\.info|\\.name|\\.tv|\\.hk|\\.公司|\\.中国|\\.网络)"); + private final static Set PUBLIC_SUFFIX_SET = new HashSet( + Arrays.asList("com|org|net|gov|edu|co|tv|mobi|info|asia|xxx|onion|cc|cn|com.cn|edu.cn|gov.cn|net.cn|org.cn|jp|kr|tw|com.hk|hk|com.hk|org.hk|se|com.se|org.se" + .split("\\|"))); + private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})"); private static IpLookup ipLookup = new IpLookup.Builder(false) .loadDataFileV4(FlowWriteConfig.IP_LIBRARY + "Kazakhstan.mmdb") .loadDataFileV6(FlowWriteConfig.IP_LIBRARY + "Kazakhstan.mmdb") @@ -156,10 +162,10 @@ public class TransFormUtils { * @return 顶级域名 */ private static String getTopDomain(String sni, String host) { - if (StringUtil.isNotBlank(sni)) { - return getDomain(sni); - } else if (StringUtil.isNotBlank(host)) { - return getDomain(host); + if (StringUtil.isNotBlank(host)) { + return getDomainName(host); + } else if (StringUtil.isNotBlank(sni)) { + return getDomainName(sni); } else { return ""; } @@ -169,19 +175,27 @@ public class TransFormUtils { /** * 根据url截取顶级域名 * - * @param url 网站url + * @param host 网站url * @return 顶级域名 */ - private static String getDomain(String url) { - try { - Matcher matcher = WEB_PATTERN.matcher(url); - if (matcher.find()) { - return matcher.group(); - } - } catch (Exception e) { - e.printStackTrace(); + private static String getDomainName(String host) { + if (host.endsWith(FlowWriteConfig.DOMAIN_SPLITTER)) { + host = host.substring(0, host.length() - 1); } - return ""; + if (IP_PATTERN.matcher(host).matches()) { + return host; + } + int index = 0; + String candidate = host; + for (; index >= 0; ) { + index = candidate.indexOf(FlowWriteConfig.DOMAIN_SPLITTER); + String subCandidate = candidate.substring(index + 1); + if (PUBLIC_SUFFIX_SET.contains(subCandidate)) { + return candidate; + } + candidate = subCandidate; + } + return candidate; } diff --git a/src/main/java/cn/ac/iie/utils/hbase/HbaseUtils.java b/src/main/java/cn/ac/iie/utils/hbase/HBaseUtils.java similarity index 100% rename from src/main/java/cn/ac/iie/utils/hbase/HbaseUtils.java rename to src/main/java/cn/ac/iie/utils/hbase/HBaseUtils.java diff --git a/src/test/java/cn/ac/iie/test/hbase/HBaseTest.java b/src/test/java/cn/ac/iie/test/hbase/HBaseTest.java index 5af4124..9e94387 100644 --- a/src/test/java/cn/ac/iie/test/hbase/HBaseTest.java +++ b/src/test/java/cn/ac/iie/test/hbase/HBaseTest.java @@ -48,16 +48,16 @@ public class HBaseTest { @Test public void change() { - Long begin = System.currentTimeMillis(); - System.gc(); - Long start = Runtime.getRuntime().freeMemory(); - System.out.println("开始内存"+start); - getAll(); - System.gc(); - Long end = Runtime.getRuntime().freeMemory(); - System.out.println("结束内存"+end); - System.out.println( "一个HashMap对象占内存: " + (end - start)); - System.out.println(System.currentTimeMillis() - begin); +// Long begin = System.currentTimeMillis(); +// System.gc(); +// Long start = Runtime.getRuntime().freeMemory(); +// System.out.println("开始内存"+start); +// getAll(); +// System.gc(); +// Long end = Runtime.getRuntime().freeMemory(); +// System.out.println("结束内存"+end); +// System.out.println( "一个HashMap对象占内存: " + (end - start)); +// System.out.println(System.currentTimeMillis() - begin); } /**