diff --git a/properties/core-site.xml b/properties/core-site.xml
deleted file mode 100644
index 93dfb1d..0000000
--- a/properties/core-site.xml
+++ /dev/null
@@ -1,71 +0,0 @@
-
-
-
-
-
-
-
-
- fs.defaultFS
- hdfs://ns1
-
-
- hadoop.tmp.dir
- file:/opt/hadoop/tmp
-
-
- io.file.buffer.size
- 131702
-
-
- hadoop.proxyuser.root.hosts
- *
-
-
- hadoop.proxyuser.root.groups
- *
-
-
- hadoop.logfile.size
- 10000000
- The max size of each log file
-
-
-
- hadoop.logfile.count
- 1
- The max number of log files
-
-
- ha.zookeeper.quorum
- master:2181,slave1:2181,slave2:2181
-
-
-
- fs.hdfs.impl
- org.apache.hadoop.hdfs.DistributedFileSystem
- The FileSystem for hdfs: uris.
-
-
-
-io.compression.codecs
-com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec
-
-
-io.compression.codec.lzo.class
-com.hadoop.compression.lzo.LzoCodec
-
-
-
diff --git a/properties/hbase-site.xml b/properties/hbase-site.xml
deleted file mode 100644
index 54554e4..0000000
--- a/properties/hbase-site.xml
+++ /dev/null
@@ -1,77 +0,0 @@
-
-
-
-
-
- hbase.rootdir
- hdfs://ns1/hbase-1.4.9
-
-
- hbase.cluster.distributed
- true
-
-
- hbase.zookeeper.quorum
- 192.168.40.119,192.168.40.122,192.168.40.123
-
-
-hbase.master.info.port
-60010
-
-
-
- phoenix.schema.isNamespaceMappingEnabled
- true
-
-
- phoenix.schema.mapSystemTablesToNamespace
- true
-
-
- hbase.client.keyvalue.maxsize
- 99428800
-
-
- hbase.server.keyvalue.maxsize
- 99428800
-
-
- hbase.regionserver.wal.codec
- org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec
-
-
- phoenix.query.timeoutMs
- 1800000
-
-
- hbase.rpc.timeout
- 1200000
-
-
- hbase.client.scanner.caching
- 1000
-
-
- hbase.client.scanner.timeout.period
- 1200000
-
-
diff --git a/properties/hdfs-site.xml b/properties/hdfs-site.xml
deleted file mode 100644
index 1e148b7..0000000
--- a/properties/hdfs-site.xml
+++ /dev/null
@@ -1,116 +0,0 @@
-
-
-
-
-
-
-
-
- dfs.namenode.name.dir
- file:/home/ceiec/hadoop/dfs/name
-
-
- dfs.datanode.data.dir
- file:/home/ceiec/hadoop/dfs/data
-
-
- dfs.replication
- 2
-
-
- dfs.namenode.secondary.http-address
- 192.168.40.119:9001
-
-
- dfs.webhdfs.enabled
- true
-
-
- dfs.permissions
- false
-
-
- dfs.permissions.enabled
- false
-
-
- dfs.nameservices
- ns1
-
-
- dfs.blocksize
- 134217728
-
-
- dfs.ha.namenodes.ns1
- nn1,nn2
-
-
-
- dfs.namenode.rpc-address.ns1.nn1
- 192.168.40.119:8020
-
-
-
- dfs.namenode.http-address.ns1.nn1
- 192.168.40.119:50070
-
-
-
- dfs.namenode.rpc-address.ns1.nn2
- 192.168.40.122:8020
-
-
-
- dfs.namenode.http-address.ns1.nn2
- 192.168.40.122:50070
-
-
-
- dfs.namenode.shared.edits.dir
- qjournal://192.168.40.119:8485;192.168.40.122:8485;192.168.40.123:8485/ns1
-
-
-
- dfs.journalnode.edits.dir
- /home/ceiec/hadoop/journal
-
-
-
- dfs.client.failover.proxy.provider.ns1
- org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
-
-
-
- dfs.ha.fencing.methods
- sshfence
-
-
-
- dfs.ha.fencing.ssh.private-key-files
- /root/.ssh/id_rsa
-
-
-
- dfs.ha.fencing.ssh.connect-timeout
- 30000
-
-
-
- dfs.ha.automatic-failover.enabled
- true
-
-
-
diff --git a/properties/redis_config.properties b/properties/redis_config.properties
deleted file mode 100644
index 3c529de..0000000
--- a/properties/redis_config.properties
+++ /dev/null
@@ -1,19 +0,0 @@
-#*****************jedis连接参数设置*********************
-#redis服务器ip
-redis.ip=192.168.40.153
-#redis服务器端口号
-redis.port=19000
-#与服务器建立连接的超时时间
-redis.timeout=3000
-#************************jedis池参数设置*******************
-#jedis的最大活跃连接数
-redis.pool.maxActive=200
-#jedis最大空闲连接数
-redis.pool.maxIdle=5
-#jedis池没有连接对象返回时,等待可用连接的最大时间,单位毫秒,默认值为-1,表示永不超时。
-#如果超过等待时间,则直接抛出JedisConnectionException
-redis.pool.maxWait=-1
-#从池中获取连接的时候,是否进行有效检查
-redis.pool.testOnBorrow=true
-#归还连接的时候,是否进行有效检查
-redis.pool.testOnReturn=true
diff --git a/src/main/java/cn/ac/iie/common/FlowWriteConfig.java b/src/main/java/cn/ac/iie/common/FlowWriteConfig.java
index cb5fc4c..f91a57a 100644
--- a/src/main/java/cn/ac/iie/common/FlowWriteConfig.java
+++ b/src/main/java/cn/ac/iie/common/FlowWriteConfig.java
@@ -10,7 +10,7 @@ public class FlowWriteConfig {
public static final String LOG_STRING_SPLITTER = "\t";
public static final String SQL_STRING_SPLITTER = "#";
- public static final String SEGMENTATION = ",";
+ public static final String DOMAIN_SPLITTER = ".";
/**
* System
diff --git a/src/main/java/cn/ac/iie/utils/general/TransFormUtils.java b/src/main/java/cn/ac/iie/utils/general/TransFormUtils.java
index 0340776..79e067e 100644
--- a/src/main/java/cn/ac/iie/utils/general/TransFormUtils.java
+++ b/src/main/java/cn/ac/iie/utils/general/TransFormUtils.java
@@ -12,6 +12,9 @@ import com.zdjizhi.utils.IpLookup;
import com.zdjizhi.utils.StringUtil;
import org.apache.log4j.Logger;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -25,7 +28,10 @@ import java.util.regex.Pattern;
public class TransFormUtils {
private static Logger logger = Logger.getLogger(TransFormUtils.class);
- private static Pattern WEB_PATTERN = Pattern.compile("[^\\\\.]+(\\.com\\.cn|\\.net\\.cn|\\.org\\.cn|\\.gov\\.cn|\\.com|\\.net|\\.cn|\\.org|\\.cc|\\.me|\\.tel|\\.mobi|\\.asia|\\.biz|\\.info|\\.name|\\.tv|\\.hk|\\.公司|\\.中国|\\.网络)");
+ private static final Set<String> PUBLIC_SUFFIX_SET = new HashSet<>(
+ Arrays.asList("com|org|net|gov|edu|co|tv|mobi|info|asia|xxx|onion|cc|cn|com.cn|edu.cn|gov.cn|net.cn|org.cn|jp|kr|tw|com.hk|hk|org.hk|se|com.se|org.se"
+ .split("\\|")));
+ private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
private static IpLookup ipLookup = new IpLookup.Builder(false)
.loadDataFileV4(FlowWriteConfig.IP_LIBRARY + "Kazakhstan.mmdb")
.loadDataFileV6(FlowWriteConfig.IP_LIBRARY + "Kazakhstan.mmdb")
@@ -156,10 +162,10 @@ public class TransFormUtils {
* @return 顶级域名
*/
private static String getTopDomain(String sni, String host) {
- if (StringUtil.isNotBlank(sni)) {
- return getDomain(sni);
- } else if (StringUtil.isNotBlank(host)) {
- return getDomain(host);
+ if (StringUtil.isNotBlank(host)) {
+ return getDomainName(host);
+ } else if (StringUtil.isNotBlank(sni)) {
+ return getDomainName(sni);
} else {
return "";
}
@@ -169,19 +175,27 @@ public class TransFormUtils {
/**
* 根据url截取顶级域名
*
- * @param url 网站url
+ * @param host 网站url
* @return 顶级域名
*/
- private static String getDomain(String url) {
- try {
- Matcher matcher = WEB_PATTERN.matcher(url);
- if (matcher.find()) {
- return matcher.group();
- }
- } catch (Exception e) {
- e.printStackTrace();
+ private static String getDomainName(String host) {
+ if (host.endsWith(FlowWriteConfig.DOMAIN_SPLITTER)) {
+ host = host.substring(0, host.length() - 1);
}
- return "";
+ if (IP_PATTERN.matcher(host).matches()) {
+ return host;
+ }
+ int index = 0;
+ String candidate = host;
+ for (; index >= 0; ) {
+ index = candidate.indexOf(FlowWriteConfig.DOMAIN_SPLITTER);
+ String subCandidate = candidate.substring(index + 1);
+ if (PUBLIC_SUFFIX_SET.contains(subCandidate)) {
+ return candidate;
+ }
+ candidate = subCandidate;
+ }
+ return candidate;
}
diff --git a/src/main/java/cn/ac/iie/utils/hbase/HbaseUtils.java b/src/main/java/cn/ac/iie/utils/hbase/HBaseUtils.java
similarity index 100%
rename from src/main/java/cn/ac/iie/utils/hbase/HbaseUtils.java
rename to src/main/java/cn/ac/iie/utils/hbase/HBaseUtils.java
diff --git a/src/test/java/cn/ac/iie/test/hbase/HBaseTest.java b/src/test/java/cn/ac/iie/test/hbase/HBaseTest.java
index 5af4124..9e94387 100644
--- a/src/test/java/cn/ac/iie/test/hbase/HBaseTest.java
+++ b/src/test/java/cn/ac/iie/test/hbase/HBaseTest.java
@@ -48,16 +48,16 @@ public class HBaseTest {
@Test
public void change() {
- Long begin = System.currentTimeMillis();
- System.gc();
- Long start = Runtime.getRuntime().freeMemory();
- System.out.println("开始内存"+start);
- getAll();
- System.gc();
- Long end = Runtime.getRuntime().freeMemory();
- System.out.println("结束内存"+end);
- System.out.println( "一个HashMap对象占内存: " + (end - start));
- System.out.println(System.currentTimeMillis() - begin);
+// Long begin = System.currentTimeMillis();
+// System.gc();
+// Long start = Runtime.getRuntime().freeMemory();
+// System.out.println("开始内存"+start);
+// getAll();
+// System.gc();
+// Long end = Runtime.getRuntime().freeMemory();
+// System.out.println("结束内存"+end);
+// System.out.println( "一个HashMap对象占内存: " + (end - start));
+// System.out.println(System.currentTimeMillis() - begin);
}
/**