Fix incorrect regex matching in domain parsing

qidaijie
2019-11-28 11:53:35 +08:00
parent 2efd619314
commit 57fbb78d11
8 changed files with 40 additions and 309 deletions

View File

@@ -1,71 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://ns1</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>file:/opt/hadoop/tmp</value>
</property>
<property>
<name>io.file.buffer.size</name>
<value>131702</value>
</property>
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.logfile.size</name>
<value>10000000</value>
<description>The max size of each log file</description>
</property>
<property>
<name>hadoop.logfile.count</name>
<value>1</value>
<description>The max number of log files</description>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>master:2181,slave1:2181,slave2:2181</value>
</property>
<property>
<name>fs.hdfs.impl</name>
<value>org.apache.hadoop.hdfs.DistributedFileSystem</value>
<description>The FileSystem for hdfs: uris.</description>
</property>
<property>
<name>io.compression.codecs</name>
<value>com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec</value>
</property>
<property>
<name>io.compression.codec.lzo.class</name>
<value>com.hadoop.compression.lzo.LzoCodec</value>
</property>
</configuration>
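Because fs.defaultFS points at the logical nameservice ns1 rather than a single host, client code needs no NameNode address at all; the explicit fs.hdfs.impl entry also guards against the "No FileSystem for scheme: hdfs" failure seen with shaded jars. A minimal client sketch, assuming this file and the matching hdfs-site.xml are on the classpath (the class name is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsClientSketch {
    public static void main(String[] args) throws Exception {
        // Loads core-site.xml/hdfs-site.xml from the classpath, so
        // hdfs://ns1 resolves through the HA nameservice, not one host.
        Configuration conf = new Configuration();
        try (FileSystem fs = FileSystem.get(conf)) {
            System.out.println(fs.exists(new Path("/")));
        }
    }
}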

View File

@@ -1,77 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-->
<configuration>
<property>
<name>hbase.rootdir</name>
<value>hdfs://ns1/hbase-1.4.9</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<value>192.168.40.119,192.168.40.122,192.168.40.123</value>
</property>
<property>
<name>hbase.master.info.port</name>
<value>60010</value>
</property>
<!-- Enable schema support, mapping Phoenix schemas to HBase namespaces -->
<property>
<name>phoenix.schema.isNamespaceMappingEnabled</name>
<value>true</value>
</property>
<property>
<name>phoenix.schema.mapSystemTablesToNamespace</name>
<value>true</value>
</property>
<property>
<name>hbase.client.keyvalue.maxsize</name>
<value>99428800</value>
</property>
<property>
<name>hbase.server.keyvalue.maxsize</name>
<value>99428800</value>
</property>
<property>
<name>hbase.regionserver.wal.codec</name>
<value>org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec</value>
</property>
<property>
<name>phoenix.query.timeoutMs</name>
<value>1800000</value>
</property>
<property>
<name>hbase.rpc.timeout</name>
<value>1200000</value>
</property>
<property>
<name>hbase.client.scanner.caching</name>
<value>1000</value>
</property>
<property>
<name>hbase.client.scanner.timeout.period</name>
<value>1200000</value>
</property>
</configuration>
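The phoenix.schema.* namespace-mapping flags must be set identically on the client side, otherwise Phoenix rejects the connection with a mapping-mismatch error. A connection sketch (the class name is illustrative; assumes the phoenix-client jar is on the classpath and the ZooKeeper quorum above is reachable):

import java.sql.Connection;
import java.sql.DriverManager;
import java.util.Properties;

public class PhoenixClientSketch {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // Must match the server-side hbase-site.xml values above
        props.setProperty("phoenix.schema.isNamespaceMappingEnabled", "true");
        props.setProperty("phoenix.schema.mapSystemTablesToNamespace", "true");
        try (Connection conn = DriverManager.getConnection(
                "jdbc:phoenix:192.168.40.119,192.168.40.122,192.168.40.123:2181", props)) {
            System.out.println(conn.getMetaData().getDatabaseProductName());
        }
    }
}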

View File

@@ -1,116 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/home/ceiec/hadoop/dfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/home/ceiec/hadoop/dfs/data</value>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>192.168.40.119:9001</value>
</property>
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<property>
<name>dfs.nameservices</name>
<value>ns1</value>
</property>
<property>
<name>dfs.blocksize</name>
<value>134217728</value>
</property>
<property>
<name>dfs.ha.namenodes.ns1</name>
<value>nn1,nn2</value>
</property>
<!-- RPC address of nn1 (the host nn1 runs on) -->
<property>
<name>dfs.namenode.rpc-address.ns1.nn1</name>
<value>192.168.40.119:8020</value>
</property>
<!-- HTTP address of nn1 (for external access) -->
<property>
<name>dfs.namenode.http-address.ns1.nn1</name>
<value>192.168.40.119:50070</value>
</property>
<!-- RPC address of nn2 (the host nn2 runs on) -->
<property>
<name>dfs.namenode.rpc-address.ns1.nn2</name>
<value>192.168.40.122:8020</value>
</property>
<!-- HTTP address of nn2 (for external access) -->
<property>
<name>dfs.namenode.http-address.ns1.nn2</name>
<value>192.168.40.122:50070</value>
</property>
<!-- Where the NameNode's edit-log metadata is stored on the JournalNodes (usually co-located with ZooKeeper) -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://192.168.40.119:8485;192.168.40.122:8485;192.168.40.123:8485/ns1</value>
</property>
<!-- Local disk directory where the JournalNode keeps its data -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/home/ceiec/hadoop/journal</value>
</property>
<!-- Java class HDFS clients use to reach the NameNode through a proxy; it determines which NameNode is currently Active -->
<property>
<name>dfs.client.failover.proxy.provider.ns1</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- Fencing method for automatic failover; several methods exist (see the official docs). sshfence logs in over SSH and kills the stale NameNode process -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- Needed only with the sshfence mechanism: the private key for passwordless SSH -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
<!-- Timeout for the sshfence mechanism; if you fail over with a custom script instead, this can be left unset -->
<property>
<name>dfs.ha.fencing.ssh.connect-timeout</name>
<value>30000</value>
</property>
<!-- Enables automatic failover; can be left unset if you do not use it -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
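If the XML files are not on a client's classpath, the same HA settings can be supplied in code. A minimal sketch, assuming only the hadoop-client dependency (the class name is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class HaClientConfigSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Programmatic mirror of the HA entries in this hdfs-site.xml
        conf.set("fs.defaultFS", "hdfs://ns1");
        conf.set("dfs.nameservices", "ns1");
        conf.set("dfs.ha.namenodes.ns1", "nn1,nn2");
        conf.set("dfs.namenode.rpc-address.ns1.nn1", "192.168.40.119:8020");
        conf.set("dfs.namenode.rpc-address.ns1.nn2", "192.168.40.122:8020");
        conf.set("dfs.client.failover.proxy.provider.ns1",
                "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        try (FileSystem fs = FileSystem.get(conf)) {
            System.out.println(fs.getUri()); // hdfs://ns1
        }
    }
}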

View File

@@ -1,19 +0,0 @@
#***************** Jedis connection settings *********************
# Redis server IP
redis.ip=192.168.40.153
# Redis server port
redis.port=19000
# Timeout (ms) for establishing a connection to the server
redis.timeout=3000
#************************ Jedis pool settings *******************
# Maximum number of active Jedis connections
redis.pool.maxActive=200
# Maximum number of idle Jedis connections
redis.pool.maxIdle=5
# Maximum time (ms) to wait for an available connection when the pool is exhausted;
# the default of -1 means wait forever. If the wait is exceeded, a JedisConnectionException is thrown.
redis.pool.maxWait=-1
# Validate connections when borrowing from the pool
redis.pool.testOnBorrow=true
# Validate connections when returning them to the pool
redis.pool.testOnReturn=true
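These properties map directly onto a JedisPoolConfig plus the JedisPool constructor. A minimal loading sketch for Jedis 2.x/3.x (the /redis.properties classpath location and the class name are assumptions):

import java.io.InputStream;
import java.util.Properties;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

public class RedisPoolSketch {
    public static void main(String[] args) throws Exception {
        Properties p = new Properties();
        try (InputStream in = RedisPoolSketch.class.getResourceAsStream("/redis.properties")) {
            p.load(in);
        }
        JedisPoolConfig config = new JedisPoolConfig();
        // commons-pool2 renamed maxActive to maxTotal; the property name is legacy
        config.setMaxTotal(Integer.parseInt(p.getProperty("redis.pool.maxActive")));
        config.setMaxIdle(Integer.parseInt(p.getProperty("redis.pool.maxIdle")));
        config.setMaxWaitMillis(Long.parseLong(p.getProperty("redis.pool.maxWait")));
        config.setTestOnBorrow(Boolean.parseBoolean(p.getProperty("redis.pool.testOnBorrow")));
        config.setTestOnReturn(Boolean.parseBoolean(p.getProperty("redis.pool.testOnReturn")));
        JedisPool pool = new JedisPool(config, p.getProperty("redis.ip"),
                Integer.parseInt(p.getProperty("redis.port")),
                Integer.parseInt(p.getProperty("redis.timeout")));
        try (Jedis jedis = pool.getResource()) {
            System.out.println(jedis.ping()); // PONG
        }
        pool.close();
    }
}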

View File

@@ -10,7 +10,7 @@ public class FlowWriteConfig {
public static final String LOG_STRING_SPLITTER = "\t";
public static final String SQL_STRING_SPLITTER = "#";
public static final String SEGMENTATION = ",";
public static final String DOMAIN_SPLITTER = ".";
/**
* System

View File

@@ -12,6 +12,9 @@ import com.zdjizhi.utils.IpLookup;
import com.zdjizhi.utils.StringUtil;
import org.apache.log4j.Logger;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -25,7 +28,10 @@ import java.util.regex.Pattern;
public class TransFormUtils {
private static Logger logger = Logger.getLogger(TransFormUtils.class);
private static Pattern WEB_PATTERN = Pattern.compile("[^\\\\.]+(\\.com\\.cn|\\.net\\.cn|\\.org\\.cn|\\.gov\\.cn|\\.com|\\.net|\\.cn|\\.org|\\.cc|\\.me|\\.tel|\\.mobi|\\.asia|\\.biz|\\.info|\\.name|\\.tv|\\.hk|\\.公司|\\.中国|\\.网络)");
private final static Set<String> PUBLIC_SUFFIX_SET = new HashSet<String>(
Arrays.asList("com|org|net|gov|edu|co|tv|mobi|info|asia|xxx|onion|cc|cn|com.cn|edu.cn|gov.cn|net.cn|org.cn|jp|kr|tw|com.hk|hk|org.hk|se|com.se|org.se"
.split("\\|")));
private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
private static IpLookup ipLookup = new IpLookup.Builder(false)
.loadDataFileV4(FlowWriteConfig.IP_LIBRARY + "Kazakhstan.mmdb")
.loadDataFileV6(FlowWriteConfig.IP_LIBRARY + "Kazakhstan.mmdb")
@@ -156,10 +162,10 @@ public class TransFormUtils {
* @return the top-level domain
*/
private static String getTopDomain(String sni, String host) {
if (StringUtil.isNotBlank(sni)) {
return getDomain(sni);
} else if (StringUtil.isNotBlank(host)) {
return getDomain(host);
if (StringUtil.isNotBlank(host)) {
return getDomainName(host);
} else if (StringUtil.isNotBlank(sni)) {
return getDomainName(sni);
} else {
return "";
}
@@ -169,19 +175,27 @@ public class TransFormUtils {
/**
* Extracts the top-level domain from the given host/URL
*
* @param url website URL
* @param host website host string
* @return the top-level domain
*/
private static String getDomain(String url) {
try {
Matcher matcher = WEB_PATTERN.matcher(url);
if (matcher.find()) {
return matcher.group();
}
} catch (Exception e) {
e.printStackTrace();
private static String getDomainName(String host) {
if (host.endsWith(FlowWriteConfig.DOMAIN_SPLITTER)) {
host = host.substring(0, host.length() - 1);
}
return "";
if (IP_PATTERN.matcher(host).matches()) {
return host;
}
int index = 0;
String candidate = host;
for (; index >= 0; ) {
index = candidate.indexOf(FlowWriteConfig.DOMAIN_SPLITTER);
String subCandidate = candidate.substring(index + 1);
if (PUBLIC_SUFFIX_SET.contains(subCandidate)) {
return candidate;
}
candidate = subCandidate;
}
return candidate;
}
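To make the new suffix-walk behavior concrete: the loop strips the leftmost label until the remainder is a known public suffix, then returns the previous candidate, so the result is the registrable domain (one label above the suffix). A standalone sketch of the patched logic with a trimmed suffix list (the class and sample hosts are illustrative, not from the commit):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

public class DomainParseSketch {
    private static final Set<String> PUBLIC_SUFFIX_SET = new HashSet<>(
            Arrays.asList("com|net|org|cn|com.cn|net.cn|org.cn|gov.cn".split("\\|")));
    private static final Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");

    static String getDomainName(String host) {
        if (host.endsWith(".")) {
            host = host.substring(0, host.length() - 1); // drop FQDN trailing dot
        }
        if (IP_PATTERN.matcher(host).matches()) {
            return host; // bare IPv4 literals pass through unchanged
        }
        String candidate = host;
        int index = 0;
        while (index >= 0) {
            index = candidate.indexOf('.');
            String subCandidate = candidate.substring(index + 1);
            if (PUBLIC_SUFFIX_SET.contains(subCandidate)) {
                return candidate; // one label above the public suffix
            }
            candidate = subCandidate;
        }
        return candidate; // no suffix matched: return the final label as-is
    }

    public static void main(String[] args) {
        System.out.println(getDomainName("www.news.example.com.cn")); // example.com.cn
        System.out.println(getDomainName("example.com."));            // example.com
        System.out.println(getDomainName("192.168.40.119"));          // 192.168.40.119
    }
}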

View File

@@ -48,16 +48,16 @@ public class HBaseTest {
@Test
public void change() {
Long begin = System.currentTimeMillis();
System.gc();
Long start = Runtime.getRuntime().freeMemory();
System.out.println("开始内存"+start);
getAll();
System.gc();
Long end = Runtime.getRuntime().freeMemory();
System.out.println("结束内存"+end);
System.out.println( "一个HashMap对象占内存: " + (end - start));
System.out.println(System.currentTimeMillis() - begin);
// Long begin = System.currentTimeMillis();
// System.gc();
// Long start = Runtime.getRuntime().freeMemory();
// System.out.println("开始内存"+start);
// getAll();
// System.gc();
// Long end = Runtime.getRuntime().freeMemory();
// System.out.println("结束内存"+end);
// System.out.println( "一个HashMap对象占内存: " + (end - start));
// System.out.println(System.currentTimeMillis() - begin);
}
/**