Refactor code based on the custom ArangoRdd

wanglihui
2020-11-10 16:59:39 +08:00
parent db5ca9db08
commit b62131dacd
14 changed files with 631 additions and 310 deletions
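For orientation before the per-file hunks: the refactor drops the in-memory historyVertex*/historyRelation* caches in BaseArangoData and instead loads the existing ArangoDB documents as an RDD through the project's ArangoSpark/ArangoRdd API, full-outer-joins them with the fresh ClickHouse aggregates, and writes the merged documents back with an overwrite insert. A minimal sketch of that flow, assuming the ArangoSpark.load, ReadOptions and CustomPartitioner signatures shown in the hunks below (the collection name and key extraction are illustrative):

```scala
import cn.ac.iie.config.ApplicationConfig
import cn.ac.iie.spark.ArangoSpark
import cn.ac.iie.spark.partition.CustomPartitioner
import cn.ac.iie.spark.rdd.ReadOptions
import cn.ac.iie.utils.SparkSessionUtil.sparkContext
import com.arangodb.entity.BaseDocument
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

object JoinFlowSketch {
  // Database name comes from application config, as in MergeDataFrame.
  private val options = ReadOptions(ApplicationConfig.ARANGODB_DB_NAME)

  // History documents from ArangoDB joined against keyed ClickHouse rows.
  def joinWithHistory(collection: String,
                      newRows: RDD[(String, Row)]): RDD[(String, (Option[BaseDocument], Option[Row]))] = {
    val partitioned = newRows.partitionBy(
      new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
    val history = ArangoSpark.load[BaseDocument](sparkContext, collection, options)
    // Left side: existing document (if any); right side: new aggregate row (if any).
    history.map(doc => (doc.getKey, doc)).fullOuterJoin(partitioned)
  }
}
```

Each updateDocument call in UpdateDocument then walks such a joined RDD partition by partition, builds or updates the document for each key, and batches the results into arangoManger.overwrite.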

View File

@@ -6,7 +6,6 @@ import cn.ac.iie.utils.ArangoDBConnect;
 import cn.ac.iie.utils.ExecutorThreadPool;
 import com.arangodb.ArangoCursor;
 import com.arangodb.entity.BaseDocument;
-import com.arangodb.entity.BaseEdgeDocument;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -20,15 +19,6 @@ import java.util.concurrent.CountDownLatch;
  */
 public class BaseArangoData {
     private static final Logger LOG = LoggerFactory.getLogger(BaseArangoData.class);
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseDocument>> historyVertexFqdnMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseDocument>> historyVertexIpMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseDocument>> historyVertexSubscriberMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseEdgeDocument>> historyRelationFqdnAddressIpMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseEdgeDocument>> historyRelationIpVisitFqdnMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseEdgeDocument>> historyRelationFqdnSameFqdnMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseEdgeDocument>> historyRelationSubsciberLocateIpMap = new ConcurrentHashMap<>();
     private static ArangoDBConnect arangoDBConnect = ArangoDBConnect.getInstance();
     private ExecutorThreadPool threadPool = ExecutorThreadPool.getInstance();
@@ -82,47 +72,4 @@ public class BaseArangoData {
         return "FOR doc IN " + table + " limit "+offsetNum+","+sepNum+" RETURN doc";
     }
-    private long[] getTimeRange(String table) {
-        long minTime = 0L;
-        long maxTime = 0L;
-        long startTime = System.currentTimeMillis();
-        String sql = "LET doc = (FOR doc IN " + table + " RETURN doc) return {max_time:MAX(doc[*].FIRST_FOUND_TIME),min_time:MIN(doc[*].FIRST_FOUND_TIME)}";
-        switch (ApplicationConfig.ARANGO_TIME_LIMIT_TYPE()) {
-            case 0:
-                ArangoCursor<BaseDocument> timeDoc = arangoDBConnect.executorQuery(sql, BaseDocument.class);
-                try {
-                    if (timeDoc != null) {
-                        while (timeDoc.hasNext()) {
-                            BaseDocument doc = timeDoc.next();
-                            maxTime = Long.parseLong(doc.getAttribute("max_time").toString()) + ApplicationConfig.THREAD_POOL_NUMBER();
-                            minTime = Long.parseLong(doc.getAttribute("min_time").toString());
-                        }
-                    } else {
-                        LOG.warn("获取ArangoDb时间范围为空");
-                    }
-                } catch (Exception e) {
-                    e.printStackTrace();
-                }
-                break;
-            case 1:
-                maxTime = ApplicationConfig.READ_ARANGO_MAX_TIME();
-                minTime = ApplicationConfig.READ_ARANGO_MIN_TIME();
-                break;
-            default:
-        }
-        long lastTime = System.currentTimeMillis();
-        LOG.warn(sql + "\n查询最大最小时间用时" + (lastTime - startTime));
-        return new long[]{minTime, maxTime};
-    }
-    private String getQuerySql(long[] timeRange, int threadNumber, String table) {
-        long minTime = timeRange[0];
-        long maxTime = timeRange[1];
-        long diffTime = (maxTime - minTime) / ApplicationConfig.THREAD_POOL_NUMBER();
-        long maxThreadTime = minTime + (threadNumber + 1) * diffTime;
-        long minThreadTime = minTime + threadNumber * diffTime;
-        return "FOR doc IN " + table + " filter doc.FIRST_FOUND_TIME >= " + minThreadTime + " and doc.FIRST_FOUND_TIME <= " + maxThreadTime + " " + ApplicationConfig.ARANGODB_READ_LIMIT() + " RETURN doc";
-    }
 }

View File

@@ -1,45 +1,39 @@
 #spark任务配置
-spark.sql.shuffle.partitions=5
+spark.sql.shuffle.partitions=10
 spark.executor.memory=4g
 spark.app.name=test
 spark.network.timeout=300s
-repartitionNumber=36
 spark.serializer=org.apache.spark.serializer.KryoSerializer
 master=local[*]
 #spark读取clickhouse配置
-spark.read.clickhouse.url=jdbc:clickhouse://192.168.40.186:8123/tsg_galaxy_v3
+spark.read.clickhouse.url=jdbc:clickhouse://192.168.44.67:8123/tsg_galaxy_v3
 spark.read.clickhouse.driver=ru.yandex.clickhouse.ClickHouseDriver
 spark.read.clickhouse.user=default
-spark.read.clickhouse.password=111111
+spark.read.clickhouse.password=ceiec2019
-spark.read.clickhouse.numPartitions=144
+spark.read.clickhouse.numPartitions=5
 spark.read.clickhouse.fetchsize=10000
-spark.read.clickhouse.partitionColumn=common_recv_time
+spark.read.clickhouse.partitionColumn=LAST_FOUND_TIME
 clickhouse.socket.timeout=300000
 #arangoDB配置
 arangoDB.host=192.168.40.182
 arangoDB.port=8529
 arangoDB.user=upsert
 arangoDB.password=ceiec2018
-#arangoDB.DB.name=insert_iplearn_index
+arangoDB.DB.name=ip-learning-test-0
-arangoDB.DB.name=iplearn_media_domain
+#arangoDB.DB.name=iplearn_media_domain
 arangoDB.ttl=3600
-thread.pool.number=5
+thread.pool.number=10
 #读取clickhouse时间范围方式0读取过去一小时1指定时间范围
-clickhouse.time.limit.type=0
+clickhouse.time.limit.type=1
-read.clickhouse.max.time=1571245220
+read.clickhouse.max.time=1603785961
-read.clickhouse.min.time=1571245210
+read.clickhouse.min.time=1603354682
-#读取arangoDB时间范围方式0正常读1指定时间范围
-arango.time.limit.type=0
-read.arango.max.time=1571245320
-read.arango.min.time=1571245200
-arangoDB.read.limit=
+arangoDB.read.limit=1
 update.arango.batch=10000
 distinct.client.ip.num=10000
 recent.count.hour=24
-update.interval=10800
+update.interval=3600

View File

@@ -36,12 +36,7 @@ object ApplicationConfig {
   val READ_CLICKHOUSE_MAX_TIME: Long = config.getLong("read.clickhouse.max.time")
   val READ_CLICKHOUSE_MIN_TIME: Long = config.getLong("read.clickhouse.min.time")
-  val ARANGO_TIME_LIMIT_TYPE: Int = config.getInt("arango.time.limit.type")
-  val READ_ARANGO_MAX_TIME: Long = config.getLong("read.arango.max.time")
-  val READ_ARANGO_MIN_TIME: Long = config.getLong("read.arango.min.time")
-  val ARANGODB_READ_LIMIT: String = config.getString("arangoDB.read.limit")
+  val ARANGODB_READ_LIMIT: Int = config.getInt("arangoDB.read.limit")
   val UPDATE_ARANGO_BATCH: Int = config.getInt("update.arango.batch")
   val RECENT_COUNT_HOUR: Int = config.getInt("recent.count.hour")
   val DISTINCT_CLIENT_IP_NUM: Int = config.getInt("distinct.client.ip.num")

View File

@@ -11,7 +11,7 @@ object BaseClickhouseData {
   val currentHour: Long = System.currentTimeMillis / (60 * 60 * 1000) * 60 * 60
   private val timeLimit: (Long, Long) = getTimeLimit
-  private def initClickhouseData(sql:String): Unit ={
+  private def initClickhouseData(sql:String): DataFrame ={
     val dataFrame: DataFrame = spark.read.format("jdbc")
       .option("url", ApplicationConfig.SPARK_READ_CLICKHOUSE_URL)
@@ -28,6 +28,8 @@ object BaseClickhouseData {
       .load()
     dataFrame.printSchema()
     dataFrame.createOrReplaceGlobalTempView("dbtable")
+    dataFrame
   }
   def loadConnectionDataFromCk(): Unit ={
@@ -68,41 +70,7 @@ object BaseClickhouseData {
     initClickhouseData(sql)
   }
-  def getVertexFqdnDf: DataFrame ={
-    loadConnectionDataFromCk()
-    val sql =
-      """
-        |SELECT
-        | FQDN,MAX( LAST_FOUND_TIME ) AS LAST_FOUND_TIME,MIN( FIRST_FOUND_TIME ) AS FIRST_FOUND_TIME
-        |FROM
-        | (
-        | (SELECT
-        | ssl_sni AS FQDN,MAX( common_recv_time ) AS LAST_FOUND_TIME,MIN( common_recv_time ) AS FIRST_FOUND_TIME
-        | FROM
-        | global_temp.dbtable
-        | WHERE
-        | common_schema_type = 'SSL' GROUP BY ssl_sni
-        | )
-        | UNION ALL
-        | (SELECT
-        | http_host AS FQDN,MAX( common_recv_time ) AS LAST_FOUND_TIME,MIN( common_recv_time ) AS FIRST_FOUND_TIME
-        | FROM
-        | global_temp.dbtable
-        | WHERE
-        | common_schema_type = 'HTTP' GROUP BY http_host
-        | )
-        | )
-        |GROUP BY
-        | FQDN
-        |HAVING
-        | FQDN != ''
-      """.stripMargin
-    LOG.warn(sql)
-    val vertexFqdnDf = spark.sql(sql)
-    vertexFqdnDf.printSchema()
-    vertexFqdnDf
-  }
+  /*
   def getVertexIpDf: DataFrame ={
     loadConnectionDataFromCk()
     val sql =
@@ -190,6 +158,149 @@ object BaseClickhouseData {
     relationFqdnLocateIpDf.printSchema()
     relationFqdnLocateIpDf
   }
+  */
+  def getVertexFqdnDf: DataFrame ={
+    val sql =
+      """
+        |(SELECT
+        | FQDN,MAX( LAST_FOUND_TIME ) AS LAST_FOUND_TIME,MIN( FIRST_FOUND_TIME ) AS FIRST_FOUND_TIME
+        |FROM
+        | ((SELECT
+        | ssl_sni AS FQDN,MAX( common_recv_time ) AS LAST_FOUND_TIME,MIN( common_recv_time ) AS FIRST_FOUND_TIME
+        | FROM tsg_galaxy_v3.connection_record_log
+        | WHERE common_schema_type = 'SSL' GROUP BY ssl_sni
+        | )UNION ALL
+        | (SELECT
+        | http_host AS FQDN,MAX( common_recv_time ) AS LAST_FOUND_TIME,MIN( common_recv_time ) AS FIRST_FOUND_TIME
+        | FROM tsg_galaxy_v3.connection_record_log
+        | WHERE common_schema_type = 'HTTP' GROUP BY http_host))
+        |GROUP BY FQDN HAVING FQDN != '') as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
+  def getVertexIpDf: DataFrame ={
+    val where = "common_recv_time >= " + timeLimit._2 + " AND common_recv_time < " + timeLimit._1
+    val sql =
+      s"""
+        |(SELECT * FROM
+        |((SELECT common_client_ip AS IP,MIN(common_recv_time) AS FIRST_FOUND_TIME,
+        |MAX(common_recv_time) AS LAST_FOUND_TIME,
+        |count(*) as SESSION_COUNT,
+        |SUM(common_c2s_byte_num+common_s2c_byte_num) as BYTES_SUM,
+        |groupUniqArray(2)(common_link_info_c2s)[2] as common_link_info,
+        |'client' as ip_type
+        |FROM tsg_galaxy_v3.connection_record_log
+        |where $where
+        |group by common_client_ip)
+        |UNION ALL
+        |(SELECT common_server_ip AS IP,
+        |MIN(common_recv_time) AS FIRST_FOUND_TIME,
+        |MAX(common_recv_time) AS LAST_FOUND_TIME,
+        |count(*) as SESSION_COUNT,
+        |SUM(common_c2s_byte_num+common_s2c_byte_num) as BYTES_SUM,
+        |groupUniqArray(2)(common_link_info_s2c)[2] as common_link_info,
+        |'server' as ip_type
+        |FROM tsg_galaxy_v3.connection_record_log
+        |where $where
+        |group by common_server_ip))) as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
+  def getRelationFqdnLocateIpDf: DataFrame ={
+    val where = "common_recv_time >= " + timeLimit._2 + " AND common_recv_time < " + timeLimit._1
+    val sql =
+      s"""
+        |(SELECT * FROM
+        |((SELECT ssl_sni AS FQDN,common_server_ip,MAX(common_recv_time) AS LAST_FOUND_TIME,MIN(common_recv_time) AS FIRST_FOUND_TIME,COUNT(*) AS COUNT_TOTAL,
+        |toString(groupUniqArray(${ApplicationConfig.DISTINCT_CLIENT_IP_NUM})(common_client_ip)) AS DIST_CIP_RECENT,'TLS' AS schema_type
+        |FROM tsg_galaxy_v3.connection_record_log
+        |WHERE $where and common_schema_type = 'SSL' GROUP BY ssl_sni,common_server_ip)
+        |UNION ALL
+        |(SELECT http_host AS FQDN,common_server_ip,MAX(common_recv_time) AS LAST_FOUND_TIME,MIN(common_recv_time) AS FIRST_FOUND_TIME,COUNT(*) AS COUNT_TOTAL,
+        |toString(groupUniqArray(${ApplicationConfig.DISTINCT_CLIENT_IP_NUM})(common_client_ip)) AS DIST_CIP_RECENT,'HTTP' AS schema_type
+        |FROM tsg_galaxy_v3.connection_record_log
+        |WHERE $where and common_schema_type = 'HTTP' GROUP BY http_host,common_server_ip))
+        |WHERE FQDN != '') as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
+  def getRelationSubidLocateIpDf: DataFrame ={
+    val where =
+      s"""
+        | common_recv_time >= ${timeLimit._2}
+        | AND common_recv_time < ${timeLimit._1}
+        | AND common_subscriber_id != ''
+        | AND radius_framed_ip != ''
+      """.stripMargin
+    val sql =
+      s"""
+        |(
+        |SELECT common_subscriber_id,radius_framed_ip,MAX(common_recv_time) as LAST_FOUND_TIME,MIN(common_recv_time) as FIRST_FOUND_TIME
+        |FROM radius_record_log
+        |WHERE $where GROUP BY common_subscriber_id,radius_framed_ip
+        |) as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
+  def getVertexSubidDf: DataFrame ={
+    val where =
+      s"""
+        | common_recv_time >= ${timeLimit._2}
+        | AND common_recv_time < ${timeLimit._1}
+        | AND common_subscriber_id != ''
+        | AND radius_framed_ip != ''
+      """.stripMargin
+    val sql =
+      s"""
+        |(
+        |SELECT common_subscriber_id,MAX(common_recv_time) as LAST_FOUND_TIME,MIN(common_recv_time) as FIRST_FOUND_TIME FROM radius_record_log
+        |WHERE $where GROUP BY common_subscriber_id
+        |)as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
+  def getVertexFramedIpDf: DataFrame ={
+    val where =
+      s"""
+        | common_recv_time >= ${timeLimit._2}
+        | AND common_recv_time < ${timeLimit._1}
+        | AND common_subscriber_id != ''
+        | AND radius_framed_ip != ''
+      """.stripMargin
+    val sql =
+      s"""
+        |(
+        |SELECT DISTINCT radius_framed_ip,common_recv_time as LAST_FOUND_TIME FROM radius_record_log WHERE $where
+        |)as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
   private def getTimeLimit: (Long,Long) ={
     var maxTime = 0L

View File

@@ -4,37 +4,57 @@ import java.util.regex.Pattern
 import cn.ac.iie.config.ApplicationConfig
 import cn.ac.iie.dao.BaseClickhouseData
+import cn.ac.iie.spark.ArangoSpark
 import cn.ac.iie.spark.partition.CustomPartitioner
+import cn.ac.iie.spark.rdd.ReadOptions
+import com.arangodb.entity.{BaseDocument, BaseEdgeDocument}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.functions._
 import org.slf4j.LoggerFactory
+import cn.ac.iie.utils.SparkSessionUtil._
 object MergeDataFrame {
   private val LOG = LoggerFactory.getLogger(MergeDataFrame.getClass)
   private val pattern = Pattern.compile("^[\\d]*$")
+  private val options = ReadOptions(ApplicationConfig.ARANGODB_DB_NAME)
-  def mergeVertexFqdn(): RDD[Row] ={
-    BaseClickhouseData.getVertexFqdnDf
-      .rdd.filter(row => isDomain(row.getAs[String](0))).map(row => (row.get(0),row))
-      .partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)).values
+  def mergeVertexFqdn(): RDD[(String, (Option[BaseDocument], Option[Row]))] ={
+    val fqdnAccmu = getLongAccumulator("FQDN Accumulator")
+    val fqdnRddRow = BaseClickhouseData.getVertexFqdnDf
+      .rdd.filter(row => isDomain(row.getAs[String](0))).map(row => {
+        fqdnAccmu.add(1)
+        (row.getAs[String]("FQDN"), row)
+      }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    fqdnRddRow.cache()
+    val fqdnRddDoc = ArangoSpark.load[BaseDocument](sparkContext,"FQDN",options)
+    fqdnRddDoc.map(doc => (doc.getKey, doc)).fullOuterJoin(fqdnRddRow)
   }
-  def mergeVertexIp(): RDD[Row]={
+  def mergeVertexIp(): RDD[(String, (Option[BaseDocument], Option[Row]))]={
+    val ipAccum = getLongAccumulator("IP Accumulator")
     val vertexIpDf = BaseClickhouseData.getVertexIpDf
     val frame = vertexIpDf.groupBy("IP").agg(
       min("FIRST_FOUND_TIME").alias("FIRST_FOUND_TIME"),
       max("LAST_FOUND_TIME").alias("LAST_FOUND_TIME"),
       collect_list("SESSION_COUNT").alias("SESSION_COUNT_LIST"),
       collect_list("BYTES_SUM").alias("BYTES_SUM_LIST"),
-      collect_list("ip_type").alias("ip_type_list")
+      collect_list("ip_type").alias("ip_type_list"),
+      last("common_link_info").alias("common_link_info")
     )
-    val values = frame.rdd.map(row => (row.get(0), row))
-      .partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)).values
-    values
+    val ipRddRow = frame.rdd.map(row => {
+      ipAccum.add(1)
+      (row.getAs[String]("IP"), row)
+    }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    val ipRddDoc = ArangoSpark.load[BaseDocument](sparkContext,"IP",options)
+    ipRddDoc.map(doc => (doc.getKey, doc)).fullOuterJoin(ipRddRow)
   }
-  def mergeRelationFqdnLocateIp(): RDD[Row] ={
+  def mergeRelationFqdnLocateIp(): RDD[(String, (Option[BaseEdgeDocument], Option[Row]))] ={
+    val fqdnLocIpAccum = getLongAccumulator("R_LOCATE_FQDN2IP Accumulator")
     val frame = BaseClickhouseData.getRelationFqdnLocateIpDf.filter(row => isDomain(row.getAs[String]("FQDN")))
       .groupBy("FQDN", "common_server_ip")
       .agg(
@@ -44,13 +64,61 @@
         collect_list("schema_type").alias("schema_type_list"),
         collect_set("DIST_CIP_RECENT").alias("DIST_CIP_RECENT")
       )
-    frame.rdd.map(row => {
+    val fqdnLocIpRddRow = frame.rdd.map(row => {
       val fqdn = row.getAs[String]("FQDN")
       val serverIp = row.getAs[String]("common_server_ip")
       val key = fqdn.concat("-" + serverIp)
+      fqdnLocIpAccum.add(1)
       (key, row)
-    }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)).values
+    }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    val fqdnLocIpRddDoc = ArangoSpark.load[BaseEdgeDocument](sparkContext,"R_LOCATE_FQDN2IP",options)
+    fqdnLocIpRddDoc.map(doc => (doc.getKey, doc)).fullOuterJoin(fqdnLocIpRddRow)
+  }
+  def mergeRelationSubidLocateIp(): RDD[(String, (Option[BaseEdgeDocument], Option[Row]))] ={
+    val subidLocIpAccum = getLongAccumulator("R_LOCATE_SUBSCRIBER2IP Accumulator")
+    val subidLocIpRddRow = BaseClickhouseData.getRelationSubidLocateIpDf
+      .repartition(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)
+      .rdd.map(row => {
+        val commonSubscriberId = row.getAs[String]("common_subscriber_id")
+        val ip = row.getAs[String]("radius_framed_ip")
+        val key = commonSubscriberId.concat("-" + ip)
+        subidLocIpAccum.add(1)
+        (key, row)
+      }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    val subidLocIpRddDoc = ArangoSpark.load[BaseEdgeDocument](sparkContext,"R_LOCATE_SUBSCRIBER2IP",options)
+    subidLocIpRddDoc.map(doc => (doc.getKey, doc)).fullOuterJoin(subidLocIpRddRow)
+  }
+  def mergeVertexSubid(): RDD[(String, (Option[BaseDocument], Option[Row]))] ={
+    val subidAccum = getLongAccumulator("SUBSCRIBER Accumulator")
+    val subidRddRow = BaseClickhouseData.getVertexSubidDf
+      .repartition(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)
+      .rdd.map(row => {
+        val commonSubscriberId = row.getAs[String]("common_subscriber_id")
+        subidAccum.add(1)
+        (commonSubscriberId, row)
+      }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    val subidRddDoc = ArangoSpark.load[BaseDocument](sparkContext,"SUBSCRIBER",options)
+    subidRddDoc.map(doc => (doc.getKey, doc)).fullOuterJoin(subidRddRow)
+  }
+  def mergeVertexFrameIp: RDD[Row] ={
+    val framedIpAccum = getLongAccumulator("framed ip Accumulator")
+    val values = BaseClickhouseData.getVertexFramedIpDf
+      .repartition(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)
+      .rdd.map(row => {
+        val ip = row.getAs[String]("radius_framed_ip")
+        framedIpAccum.add(1)
+        (ip, row)
+      }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)).values
+    values
   }
   private def isDomain(fqdn: String): Boolean = {
@@ -58,14 +126,10 @@ object MergeDataFrame {
     if (fqdn == null || fqdn.length == 0) {
       return false
     }
-    if (fqdn.contains(":")) {
-      val s = fqdn.split(":")(0)
-      if (s.contains(":")){
-        return false
-      }
-    }
-    val fqdnArr = fqdn.split("\\.")
-    if (fqdnArr.length < 4 || fqdnArr.length > 4){
+    val fqdnArr = fqdn.split(":")(0).split("\\.")
+    if (fqdnArr.length != 4){
       return true
     }
     for (f <- fqdnArr) {
@@ -83,6 +147,7 @@
       LOG.error("解析域名 " + fqdn + " 失败:\n" + e.toString)
     }
     false
   }
 }

View File

@@ -1,7 +1,8 @@
 package cn.ac.iie.service.update
-import java.lang
+import java.util
+import scala.collection.JavaConversions._
 import cn.ac.iie.config.ApplicationConfig
 import cn.ac.iie.service.read.ReadHistoryArangoData
@@ -14,17 +15,25 @@ object UpdateDocHandler {
   val PROTOCOL_SET: Set[String] = Set("HTTP","TLS","DNS")
   def updateMaxAttribute(hisDoc: BaseDocument,newAttribute:Long,attributeName:String): Unit ={
+    if(hisDoc.getProperties.containsKey(attributeName)){
       var hisAttritube = hisDoc.getAttribute(attributeName).toString.toLong
       if (newAttribute > hisAttritube){
        hisAttritube = newAttribute
      }
      hisDoc.addAttribute(attributeName,hisAttritube)
    }
+  }
   def updateSumAttribute(hisDoc: BaseDocument,newAttribute:Long,attributeName:String): Unit ={
+    if (hisDoc.getProperties.containsKey(attributeName)){
      val hisAttritube = hisDoc.getAttribute(attributeName).toString.toLong
      hisDoc.addAttribute(attributeName,newAttribute+hisAttritube)
    }
+  }
+  def replaceAttribute(hisDoc: BaseDocument,newAttribute:String,attributeName:String): Unit ={
+    hisDoc.addAttribute(attributeName,newAttribute)
+  }
   def separateAttributeByIpType(ipTypeList:ofRef[String],
                                 sessionCountList:ofRef[AnyRef],
@@ -62,20 +71,22 @@
   }
   def updateProtocolAttritube(hisDoc:BaseEdgeDocument, protocolMap: Map[String, Long]): Unit ={
+    if (hisDoc.getProperties.containsKey("PROTOCOL_TYPE")){
      var protocolType = hisDoc.getAttribute("PROTOCOL_TYPE").toString
-      protocolMap.foreach(t => {
+      protocolMap.foreach((t: (String, Long)) => {
        if (t._2 > 0 && !protocolType.contains(t._1)){
          protocolType = protocolType.concat(","+ t._1)
        }
        val cntTotalName = t._1.concat("_CNT_TOTAL")
        val cntRecentName = t._1.concat("_CNT_RECENT")
-        val cntRecent: Array[lang.Long] = hisDoc.getAttribute(cntRecentName).asInstanceOf[Array[java.lang.Long]]
+        val cntRecent = hisDoc.getAttribute(cntRecentName).asInstanceOf[Array[Long]]
        cntRecent.update(0,t._2)
        updateSumAttribute(hisDoc,t._2,cntTotalName)
        hisDoc.addAttribute(cntRecentName,cntRecent)
      })
      hisDoc.addAttribute("PROTOCOL_TYPE",protocolType)
    }
+  }
   def putProtocolAttritube(doc:BaseEdgeDocument, protocolMap: Map[String, Long]): Unit ={
     val protocolTypeBuilder = new mutable.StringBuilder()
@@ -93,9 +104,29 @@ object UpdateDocHandler {
     doc.addAttribute("PROTOCOL_TYPE",protocolTypeBuilder.toString().replaceFirst(",",""))
   }
-  def mergeDistinctIp(distCipRecent:ofRef[ofRef[String]]): Array[String] ={
-    distCipRecent.flatten.distinct.take(ApplicationConfig.DISTINCT_CLIENT_IP_NUM).toArray
-  }
+  def updateProtocolDocument(doc: BaseEdgeDocument): Unit = {
+    if (doc.getProperties.containsKey("PROTOCOL_TYPE")) {
+      for (protocol <- PROTOCOL_SET) {
+        val protocolRecent = protocol + "_CNT_RECENT"
+        val cntRecent: util.ArrayList[Long] = doc.getAttribute(protocolRecent).asInstanceOf[util.ArrayList[Long]]
+        val cntRecentsSrc = cntRecent.toArray().map(_.toString.toLong)
+        val cntRecentsDst = new Array[Long](24)
+        System.arraycopy(cntRecentsSrc, 0, cntRecentsDst, 1, cntRecentsSrc.length - 1)
+        cntRecentsDst(0) = 0L
+        doc.addAttribute(protocolRecent, cntRecentsDst)
+      }
+    }
+  }
+  def mergeDistinctIp(distCipRecent:ofRef[String]): Array[String] ={
+    distCipRecent.flatMap(str => {
+      str.replaceAll("\\[","")
+        .replaceAll("\\]","")
+        .replaceAll("\\'","")
+        .split(",")
+    }).distinct.take(ApplicationConfig.DISTINCT_CLIENT_IP_NUM).toArray
+  }
   def putDistinctIp(doc:BaseEdgeDocument,newDistinctIp:Array[String]): Unit ={
     val map = newDistinctIp.map(ip => {
@@ -106,8 +137,9 @@
   }
   def updateDistinctIp(hisDoc:BaseEdgeDocument,newDistinctIp:Array[String]): Unit ={
-    val hisDistCip = hisDoc.getAttribute("DIST_CIP").asInstanceOf[Array[String]]
-    val hisDistCipTs = hisDoc.getAttribute("DIST_CIP_TS").asInstanceOf[Array[Long]]
+    if (hisDoc.getProperties.containsKey("DIST_CIP") && hisDoc.getProperties.containsKey("DIST_CIP_TS")){
+      val hisDistCip = hisDoc.getAttribute("DIST_CIP").asInstanceOf[util.ArrayList[String]]
+      val hisDistCipTs = hisDoc.getAttribute("DIST_CIP_TS").asInstanceOf[util.ArrayList[Long]]
      if (hisDistCip.length == hisDistCipTs.length){
        val distCipToTsMap: Map[String, Long] = hisDistCip.zip(hisDistCipTs).toMap
        val muDistCipToTsMap: mutable.Map[String, Long] = mutable.Map(distCipToTsMap.toSeq:_*)
@@ -119,5 +151,6 @@
      hisDoc.addAttribute("DIST_CIP_TS",resultMap.values.toArray)
      }
    }
+  }
 }

View File

@@ -1,17 +1,12 @@
 package cn.ac.iie.service.update
 import java.util
-import java.util.concurrent.ConcurrentHashMap
 import cn.ac.iie.config.ApplicationConfig
-import cn.ac.iie.dao.BaseArangoData
-import cn.ac.iie.dao.BaseArangoData._
 import cn.ac.iie.service.transform.MergeDataFrame._
 import cn.ac.iie.service.update.UpdateDocHandler._
-import cn.ac.iie.utils.{ArangoDBConnect, ExecutorThreadPool, SparkSessionUtil}
-import cn.ac.iie.utils.SparkSessionUtil.spark
+import cn.ac.iie.utils.{ArangoDBConnect, SparkSessionUtil}
 import com.arangodb.entity.{BaseDocument, BaseEdgeDocument}
-import org.apache.spark.TaskContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.Row
 import org.slf4j.LoggerFactory
@@ -19,44 +14,47 @@ import org.slf4j.LoggerFactory
 import scala.collection.mutable.WrappedArray.ofRef
 object UpdateDocument {
-  private val pool = ExecutorThreadPool.getInstance
   private val arangoManger: ArangoDBConnect = ArangoDBConnect.getInstance()
   private val LOG = LoggerFactory.getLogger(UpdateDocument.getClass)
-  private val baseArangoData = new BaseArangoData()
   def update(): Unit = {
     try {
-      updateDocument("FQDN", historyVertexFqdnMap, getVertexFqdnRow, classOf[BaseDocument], mergeVertexFqdn)
-      updateDocument("IP", historyVertexIpMap, getVertexIpRow, classOf[BaseDocument], mergeVertexIp)
-      updateDocument("R_LOCATE_FQDN2IP", historyRelationFqdnAddressIpMap, getRelationFqdnLocateIpRow, classOf[BaseEdgeDocument], mergeRelationFqdnLocateIp)
+      updateDocument("FQDN", getVertexFqdnRow, mergeVertexFqdn)
+      updateDocument("SUBSCRIBER",getVertexSubidRow,mergeVertexSubid)
+      insertFrameIp()
+      updateDocument("R_LOCATE_SUBSCRIBER2IP",getRelationSubidLocateIpRow,mergeRelationSubidLocateIp)
+      updateDocument("R_LOCATE_FQDN2IP", getRelationFqdnLocateIpRow, mergeRelationFqdnLocateIp)
+      updateDocument("IP", getVertexIpRow, mergeVertexIp)
     } catch {
       case e: Exception => e.printStackTrace()
     } finally {
-      pool.shutdown()
       arangoManger.clean()
      SparkSessionUtil.closeSpark()
+      System.exit(0)
    }
  }
   private def updateDocument[T <: BaseDocument](collName: String,
-                                                historyMap: ConcurrentHashMap[Integer, ConcurrentHashMap[String, T]],
-                                                getDocumentRow: (Row, ConcurrentHashMap[String, T]) => T,
-                                                clazz: Class[T],
-                                                getNewDataRdd: () => RDD[Row]
+                                                getDocumentRow: ((String, (Option[T], Option[Row]))) => T,
+                                                getJoinRdd: () => RDD[(String, (Option[T], Option[Row]))]
                                                ): Unit = {
-    baseArangoData.readHistoryData(collName, historyMap, clazz)
-    val hisBc = spark.sparkContext.broadcast(historyMap)
     try {
       val start = System.currentTimeMillis()
-      val newDataRdd = getNewDataRdd()
-      newDataRdd.foreachPartition(iter => {
-        val partitionId: Int = TaskContext.get.partitionId
-        val dictionaryMap: ConcurrentHashMap[String, T] = hisBc.value.get(partitionId)
+      val joinRdd = getJoinRdd()
+      joinRdd.foreachPartition(iter => {
        val resultDocumentList = new util.ArrayList[T]
        var i = 0
        iter.foreach(row => {
-          val document = getDocumentRow(row, dictionaryMap)
+          val document = getDocumentRow(row)
+          if (document != null){
            resultDocumentList.add(document)
+          }
          i += 1
          if (i >= ApplicationConfig.UPDATE_ARANGO_BATCH) {
            arangoManger.overwrite(resultDocumentList, collName)
@@ -73,88 +71,238 @@ object UpdateDocument {
       LOG.warn(s"更新$collName 时间:${last - start}")
     } catch {
       case e: Exception => e.printStackTrace()
-    } finally {
-      hisBc.destroy()
     }
   }
-  private def getVertexFqdnRow(row: Row, dictionaryMap: ConcurrentHashMap[String, BaseDocument]): BaseDocument = {
-    val fqdn = row.getAs[String]("FQDN")
-    val lastFoundTime = row.getAs[Long]("LAST_FOUND_TIME")
-    val firstFoundTime = row.getAs[Long]("FIRST_FOUND_TIME")
-    var document: BaseDocument = dictionaryMap.getOrDefault(fqdn, null)
-    if (document != null) {
-      updateMaxAttribute(document, lastFoundTime, "LAST_FOUND_TIME")
-    } else {
-      document = new BaseDocument
-      document.setKey(fqdn)
-      document.addAttribute("FQDN_NAME", fqdn)
-      document.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
-      document.addAttribute("LAST_FOUND_TIME", lastFoundTime)
-    }
-    document
-  }
-  private def getVertexIpRow(row: Row, dictionaryMap: ConcurrentHashMap[String, BaseDocument]): BaseDocument = {
-    val ip = row.getAs[String]("IP")
-    val firstFoundTime = row.getAs[Long]("FIRST_FOUND_TIME")
-    val lastFoundTime = row.getAs[Long]("LAST_FOUND_TIME")
-    val sessionCountList = row.getAs[ofRef[AnyRef]]("SESSION_COUNT_LIST")
-    val bytesSumList = row.getAs[ofRef[AnyRef]]("BYTES_SUM_LIST")
-    val ipTypeList = row.getAs[ofRef[String]]("ip_type_list")
-    val sepAttributeTuple = separateAttributeByIpType(ipTypeList, sessionCountList, bytesSumList)
-    var document = dictionaryMap.getOrDefault(ip, null)
-    if (document != null) {
-      updateMaxAttribute(document, lastFoundTime, "LAST_FOUND_TIME")
-      updateSumAttribute(document, sepAttributeTuple._1, "SERVER_SESSION_COUNT")
-      updateSumAttribute(document, sepAttributeTuple._2, "SERVER_BYTES_SUM")
-      updateSumAttribute(document, sepAttributeTuple._3, "CLIENT_SESSION_COUNT")
-      updateSumAttribute(document, sepAttributeTuple._4, "CLIENT_BYTES_SUM")
-    } else {
-      document = new BaseDocument
-      document.setKey(ip)
-      document.addAttribute("IP",ip)
-      document.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
-      document.addAttribute("LAST_FOUND_TIME", lastFoundTime)
-      document.addAttribute("SERVER_SESSION_COUNT", sepAttributeTuple._1)
-      document.addAttribute("SERVER_BYTES_SUM", sepAttributeTuple._2)
-      document.addAttribute("CLIENT_SESSION_COUNT", sepAttributeTuple._3)
-      document.addAttribute("CLIENT_BYTES_SUM", sepAttributeTuple._4)
-      document.addAttribute("COMMON_LINK_INFO", "")
-    }
-    document
-  }
-  private def getRelationFqdnLocateIpRow(row: Row, dictionaryMap: ConcurrentHashMap[String, BaseEdgeDocument]): BaseEdgeDocument = {
-    val fqdn = row.getAs[String]("FQDN")
-    val serverIp = row.getAs[String]("common_server_ip")
-    val firstFoundTime = row.getAs[Long]("FIRST_FOUND_TIME")
-    val lastFoundTime = row.getAs[Long]("LAST_FOUND_TIME")
-    val countTotalList = row.getAs[ofRef[AnyRef]]("COUNT_TOTAL_LIST")
-    val schemaTypeList = row.getAs[ofRef[AnyRef]]("schema_type_list")
-    val distCipRecent = row.getAs[ofRef[ofRef[String]]]("DIST_CIP_RECENT")
-    val sepAttritubeMap: Map[String, Long] = separateAttributeByProtocol(schemaTypeList, countTotalList)
-    val distinctIp: Array[String] = mergeDistinctIp(distCipRecent)
-    val key = fqdn.concat("-" + serverIp)
-    var document = dictionaryMap.getOrDefault(key, null)
-    if (document != null) {
-      updateMaxAttribute(document, lastFoundTime, "LAST_FOUND_TIME")
-      updateProtocolAttritube(document, sepAttritubeMap)
-      updateDistinctIp(document, distinctIp)
-    } else {
-      document = new BaseEdgeDocument()
-      document.setKey(key)
-      document.setFrom("FQDN/" + fqdn)
-      document.setTo("IP/" + serverIp)
-      document.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
-      document.addAttribute("LAST_FOUND_TIME", lastFoundTime)
-      putProtocolAttritube(document, sepAttritubeMap)
-      putDistinctIp(document, distinctIp)
-    }
-    document
-  }
+  private def insertFrameIp(): Unit ={
+    mergeVertexFrameIp.foreachPartition(iter => {
+      val resultDocumentList = new util.ArrayList[BaseDocument]
+      var i = 0
+      iter.foreach(row => {
+        val document = getVertexFrameipRow(row)
+        resultDocumentList.add(document)
+        i += 1
+        if (i >= ApplicationConfig.UPDATE_ARANGO_BATCH) {
+          arangoManger.overwrite(resultDocumentList, "IP")
+          LOG.warn(s"更新:IP" + i)
+          i = 0
+        }
+      })
+      if (i != 0) {
+        arangoManger.overwrite(resultDocumentList, "IP")
+        LOG.warn(s"更新IP:" + i)
+      }
+    })
+  }
+  private def getVertexFrameipRow(row: Row): BaseDocument ={
+    val ip = row.getAs[String]("radius_framed_ip")
+    val document = new BaseDocument()
+    document.setKey(ip)
+    document.addAttribute("IP",ip)
+    document
+  }
+  private def getRelationSubidLocateIpRow(joinRow: (String, (Option[BaseEdgeDocument], Option[Row]))): BaseEdgeDocument ={
+    val subidLocIpDocOpt = joinRow._2._1
+    var subidLocIpDoc = subidLocIpDocOpt match {
+      case Some(doc) => doc
+      case None => null
+    }
+    val subidLocIpRowOpt = joinRow._2._2
+    val subidLocIpRow = subidLocIpRowOpt match {
+      case Some(r) => r
+      case None => null
+    }
+    if (subidLocIpRow != null){
+      val subId = subidLocIpRow.getAs[String]("common_subscriber_id")
+      val ip = subidLocIpRow.getAs[String]("radius_framed_ip")
+      val lastFoundTime = subidLocIpRow.getAs[Long]("LAST_FOUND_TIME")
+      val firstFoundTime = subidLocIpRow.getAs[Long]("FIRST_FOUND_TIME")
+      val key = subId.concat("-"+ip)
+      if (subidLocIpDoc != null){
+        updateMaxAttribute(subidLocIpDoc,lastFoundTime,"LAST_FOUND_TIME")
+      } else {
+        subidLocIpDoc = new BaseEdgeDocument()
+        subidLocIpDoc.setKey(key)
+        subidLocIpDoc.setFrom("SUBSCRIBER/" + subId)
+        subidLocIpDoc.setTo("IP/" + ip)
+        subidLocIpDoc.addAttribute("SUBSCRIBER",subId)
+        subidLocIpDoc.addAttribute("IP",ip)
+        subidLocIpDoc.addAttribute("FIRST_FOUND_TIME",firstFoundTime)
+        subidLocIpDoc.addAttribute("LAST_FOUND_TIME",lastFoundTime)
+      }
+    }
+    subidLocIpDoc
+  }
+  private def getVertexSubidRow(joinRow: (String, (Option[BaseDocument], Option[Row]))): BaseDocument ={
+    val subidDocOpt = joinRow._2._1
+    var subidDoc = subidDocOpt match {
+      case Some(doc) => doc
+      case None => null
+    }
+    val subidRowOpt = joinRow._2._2
+    val subidRow = subidRowOpt match {
+      case Some(r) => r
+      case None => null
+    }
+    if (subidRow != null){
+      val subId = subidRow.getAs[String]("common_subscriber_id")
+      val subLastFoundTime = subidRow.getAs[Long]("LAST_FOUND_TIME")
+      val subFirstFoundTime = subidRow.getAs[Long]("FIRST_FOUND_TIME")
+      if (subidDoc != null){
+        updateMaxAttribute(subidDoc,subLastFoundTime,"LAST_FOUND_TIME")
+      } else {
+        subidDoc = new BaseDocument()
+        subidDoc.setKey(subId)
+        subidDoc.addAttribute("SUBSCRIBER",subId)
+        subidDoc.addAttribute("FIRST_FOUND_TIME",subFirstFoundTime)
+        subidDoc.addAttribute("LAST_FOUND_TIME",subLastFoundTime)
+      }
+    }
+    subidDoc
+  }
+  private def getVertexFqdnRow(joinRow: (String, (Option[BaseDocument], Option[Row]))): BaseDocument = {
+    val fqdnDocOpt = joinRow._2._1
+    var fqdnDoc = fqdnDocOpt match {
+      case Some(doc) => doc
+      case None => null
+    }
+    val fqdnRowOpt = joinRow._2._2
+    val fqdnRow = fqdnRowOpt match {
+      case Some(r) => r
+      case None => null
+    }
+    if (fqdnRow != null){
+      val fqdn = fqdnRow.getAs[String]("FQDN")
+      val lastFoundTime = fqdnRow.getAs[Long]("LAST_FOUND_TIME")
+      val firstFoundTime = fqdnRow.getAs[Long]("FIRST_FOUND_TIME")
+      if (fqdnDoc != null) {
+        updateMaxAttribute(fqdnDoc, lastFoundTime, "LAST_FOUND_TIME")
+      } else {
+        fqdnDoc = new BaseDocument
+        fqdnDoc.setKey(fqdn)
+        fqdnDoc.addAttribute("FQDN_NAME", fqdn)
+        fqdnDoc.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
+        fqdnDoc.addAttribute("LAST_FOUND_TIME", lastFoundTime)
+      }
+    }
+    fqdnDoc
+  }
+  private def getVertexIpRow(joinRow: (String, (Option[BaseDocument], Option[Row]))): BaseDocument = {
+    val ipDocOpt = joinRow._2._1
+    var ipDoc = ipDocOpt match {
+      case Some(doc) => doc
+      case None => null
+    }
+    val ipRowOpt = joinRow._2._2
+    val ipRow = ipRowOpt match {
+      case Some(r) => r
+      case None => null
+    }
+    if (ipRow != null){
+      val ip = ipRow.getAs[String]("IP")
+      val firstFoundTime = ipRow.getAs[Long]("FIRST_FOUND_TIME")
+      val lastFoundTime = ipRow.getAs[Long]("LAST_FOUND_TIME")
+      val sessionCountList = ipRow.getAs[ofRef[AnyRef]]("SESSION_COUNT_LIST")
+      val bytesSumList = ipRow.getAs[ofRef[AnyRef]]("BYTES_SUM_LIST")
+      val ipTypeList = ipRow.getAs[ofRef[String]]("ip_type_list")
+      val linkInfo = ipRow.getAs[String]("common_link_info")
+      val sepAttributeTuple = separateAttributeByIpType(ipTypeList, sessionCountList, bytesSumList)
+      if (ipDoc != null) {
+        updateMaxAttribute(ipDoc, lastFoundTime, "LAST_FOUND_TIME")
+        updateSumAttribute(ipDoc, sepAttributeTuple._1, "SERVER_SESSION_COUNT")
+        updateSumAttribute(ipDoc, sepAttributeTuple._2, "SERVER_BYTES_SUM")
+        updateSumAttribute(ipDoc, sepAttributeTuple._3, "CLIENT_SESSION_COUNT")
+        updateSumAttribute(ipDoc, sepAttributeTuple._4, "CLIENT_BYTES_SUM")
+        replaceAttribute(ipDoc,linkInfo,"COMMON_LINK_INFO")
+      } else {
+        ipDoc = new BaseDocument
+        ipDoc.setKey(ip)
+        ipDoc.addAttribute("IP", ip)
+        ipDoc.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
+        ipDoc.addAttribute("LAST_FOUND_TIME", lastFoundTime)
+        ipDoc.addAttribute("SERVER_SESSION_COUNT", sepAttributeTuple._1)
+        ipDoc.addAttribute("SERVER_BYTES_SUM", sepAttributeTuple._2)
+        ipDoc.addAttribute("CLIENT_SESSION_COUNT", sepAttributeTuple._3)
+        ipDoc.addAttribute("CLIENT_BYTES_SUM", sepAttributeTuple._4)
+        ipDoc.addAttribute("COMMON_LINK_INFO", "")
+      }
+    }
+    ipDoc
+  }
+  private def getRelationFqdnLocateIpRow(joinRow: (String, (Option[BaseEdgeDocument], Option[Row]))): BaseEdgeDocument = {
+    val fqdnLocIpDocOpt = joinRow._2._1
+    var fqdnLocIpDoc = fqdnLocIpDocOpt match {
+      case Some(doc) => doc
+      case None => null
+    }
+    val fqdnLocIpRowOpt = joinRow._2._2
+    val fqdnLocIpRow = fqdnLocIpRowOpt match {
+      case Some(r) => r
+      case None => null
+    }
+    if (fqdnLocIpDoc != null){
+      updateProtocolDocument(fqdnLocIpDoc)
+    }
+    if (fqdnLocIpRow != null){
+      val fqdn = fqdnLocIpRow.getAs[String]("FQDN")
+      val serverIp = fqdnLocIpRow.getAs[String]("common_server_ip")
+      val firstFoundTime = fqdnLocIpRow.getAs[Long]("FIRST_FOUND_TIME")
+      val lastFoundTime = fqdnLocIpRow.getAs[Long]("LAST_FOUND_TIME")
+      val countTotalList = fqdnLocIpRow.getAs[ofRef[AnyRef]]("COUNT_TOTAL_LIST")
+      val schemaTypeList = fqdnLocIpRow.getAs[ofRef[AnyRef]]("schema_type_list")
+      val distCipRecent = fqdnLocIpRow.getAs[ofRef[String]]("DIST_CIP_RECENT")
+      val sepAttritubeMap: Map[String, Long] = separateAttributeByProtocol(schemaTypeList, countTotalList)
+      val distinctIp: Array[String] = mergeDistinctIp(distCipRecent)
+      val key = fqdn.concat("-" + serverIp)
+      if (fqdnLocIpDoc != null) {
+        updateMaxAttribute(fqdnLocIpDoc, lastFoundTime, "LAST_FOUND_TIME")
+        updateProtocolAttritube(fqdnLocIpDoc, sepAttritubeMap)
+        updateDistinctIp(fqdnLocIpDoc, distinctIp)
+      } else {
+        fqdnLocIpDoc = new BaseEdgeDocument()
+        fqdnLocIpDoc.setKey(key)
+        fqdnLocIpDoc.setFrom("FQDN/" + fqdn)
+        fqdnLocIpDoc.setTo("IP/" + serverIp)
+        fqdnLocIpDoc.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
+        fqdnLocIpDoc.addAttribute("LAST_FOUND_TIME", lastFoundTime)
+        putProtocolAttritube(fqdnLocIpDoc, sepAttritubeMap)
+        putDistinctIp(fqdnLocIpDoc, distinctIp)
+      }
+    }
+    fqdnLocIpDoc
+  }
 }

View File

@@ -24,6 +24,7 @@ package cn.ac.iie.spark
 import cn.ac.iie.spark.rdd.{ArangoRdd, ReadOptions, WriteOptions}
 import cn.ac.iie.spark.vpack.VPackUtils
+import com.arangodb.model.DocumentCreateOptions
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
@@ -76,7 +77,6 @@
   *
   * @param dataframe the dataframe with data to save
   * @param collection the collection to save in
-  * @param options additional write options
   */
 def saveDF(dataframe: DataFrame, collection: String): Unit =
   saveRDD[Row](dataframe.rdd, collection, WriteOptions(), (x: Iterator[Row]) => x.map { y => VPackUtils.rowToVPack(y) })
@@ -102,6 +102,11 @@ object ArangoSpark {
     case WriteOptions.INSERT => col.insertDocuments(docs)
     case WriteOptions.UPDATE => col.updateDocuments(docs)
     case WriteOptions.REPLACE => col.replaceDocuments(docs)
+    case WriteOptions.OVERWRITE =>
+      val documentCreateOptions = new DocumentCreateOptions
+      documentCreateOptions.overwrite(true)
+      documentCreateOptions.silent(true)
+      col.insertDocuments(docs, documentCreateOptions)
   }
   arangoDB.shutdown()
@@ -123,7 +128,7 @@
   *
   * @param sparkContext the sparkContext containing the ArangoDB configuration
   * @param collection the collection to load data from
-  * @param additional read options
+  * @param options read options
   */
 def load[T: ClassTag](sparkContext: SparkContext, collection: String, options: ReadOptions): ArangoRdd[T] =
   new ArangoRdd[T](sparkContext, createReadOptions(options, sparkContext.getConf).copy(collection = collection))
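The new WriteOptions.OVERWRITE branch maps to the ArangoDB Java driver's insert-with-overwrite. A standalone sketch of the same call, assuming a driver version that still exposes DocumentCreateOptions.overwrite(true); the host, credentials and sample key below are placeholders:

```scala
import com.arangodb.ArangoDB
import com.arangodb.entity.BaseDocument
import com.arangodb.model.DocumentCreateOptions

import scala.collection.JavaConverters._

object OverwriteSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder connection settings; the real values live in application.properties.
    val arango = new ArangoDB.Builder().host("127.0.0.1", 8529).user("upsert").password("***").build()
    val col = arango.db("ip-learning-test-0").collection("IP")

    val doc = new BaseDocument()
    doc.setKey("198.51.100.7")
    doc.addAttribute("IP", "198.51.100.7")

    // overwrite(true): insert-or-replace on key collision; silent(true): skip returning metadata.
    val opts = new DocumentCreateOptions().overwrite(true).silent(true)
    col.insertDocuments(Seq(doc).asJava, opts)

    arango.shutdown()
  }
}
```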

View File

@@ -56,7 +56,7 @@ class ArangoRdd[T: ClassTag](@transient override val sparkContext: SparkContext,
   override def repartition(numPartitions: Int)(implicit ord: Ordering[T]): RDD[T] = super.repartition(numPartitions)
   private def getPartition(idx: Int, countTotal: Long): QueryArangoPartition = {
-    val sepNum = countTotal / ApplicationConfig.THREAD_POOL_NUMBER + 1
+    val sepNum = countTotal / ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS + 1
     val offsetNum = idx * sepNum
     new QueryArangoPartition(idx, offsetNum, sepNum)
   }
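After this change each ArangoRdd partition covers countTotal / spark.sql.shuffle.partitions (+1) documents, read through the offset/limit AQL that BaseArangoData builds. A small sketch of the resulting per-partition windows, with the document count and partition number as assumed example values:

```scala
object PartitionWindowSketch {
  def main(args: Array[String]): Unit = {
    val countTotal = 1000L // assumed collection size
    val partitions = 10    // assumed ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS
    val sepNum = countTotal / partitions + 1
    (0 until partitions).foreach { idx =>
      val offsetNum = idx * sepNum
      // Mirrors the query string built in BaseArangoData: one LIMIT window per partition.
      println(s"FOR doc IN IP limit $offsetNum,$sepNum RETURN doc")
    }
  }
}
```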

View File

@@ -116,4 +116,9 @@ object WriteOptions {
     */
   case object REPLACE extends Method
+  /**
+    * save documents by overwrite
+    */
+  case object OVERWRITE extends Method
 }

View File

@@ -1,7 +1,9 @@
 package cn.ac.iie.utils
 import cn.ac.iie.config.ApplicationConfig
+import org.apache.spark.SparkContext
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.util.LongAccumulator
 import org.slf4j.LoggerFactory
 object SparkSessionUtil {
@@ -9,6 +11,8 @@ object SparkSessionUtil {
   val spark: SparkSession = getSparkSession
+  var sparkContext: SparkContext = getContext
   private def getSparkSession: SparkSession ={
     val spark: SparkSession = SparkSession
       .builder()
@@ -26,10 +30,27 @@
     spark
   }
+  def getContext: SparkContext = {
+    @transient var sc: SparkContext = null
+    if (sparkContext == null) sc = spark.sparkContext
+    sc
+  }
+  def getLongAccumulator(name: String): LongAccumulator ={
+    if (sparkContext == null){
+      sparkContext = getContext
+    }
+    sparkContext.longAccumulator(name)
+  }
   def closeSpark(): Unit ={
     if (spark != null){
       spark.stop()
     }
+    if (sparkContext != null){
+      sparkContext.stop()
+    }
   }
 }
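getLongAccumulator backs the per-collection row counters that MergeDataFrame registers; accumulator values are only dependable on the driver after an action has executed. A minimal, self-contained usage sketch with plain Spark APIs:

```scala
import org.apache.spark.sql.SparkSession

object AccumulatorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("acc-sketch").getOrCreate()
    val counter = spark.sparkContext.longAccumulator("FQDN Accumulator")

    // Updated on the executors inside the transformation...
    val rdd = spark.sparkContext.parallelize(1 to 100).map { x => counter.add(1); x }
    rdd.count() // ...but only reliable after an action has run.

    println(s"rows seen: ${counter.value}")
    spark.stop()
  }
}
```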

View File

@@ -7,7 +7,7 @@ import org.apache.spark.sql.SparkSession
 object BaseClickhouseDataTest {
   private val spark: SparkSession = SparkSessionUtil.spark
   def main(args: Array[String]): Unit = {
-    BaseClickhouseData loadConnectionDataFromCk()
+    // BaseClickhouseData loadConnectionDataFromCk()
     val sql =
       """
         |SELECT

View File

@@ -1,35 +0,0 @@
-package cn.ac.iie.service.update
-import java.util
-import java.util.ArrayList
-import java.util.concurrent.ConcurrentHashMap
-import cn.ac.iie.dao.BaseArangoData
-import cn.ac.iie.dao.BaseArangoData._
-import com.arangodb.entity.{BaseDocument, BaseEdgeDocument}
-import scala.collection.mutable.WrappedArray.ofRef
-object UpdateDocumentTest {
-  def main(args: Array[String]): Unit = {
-    val baseArangoData = new BaseArangoData()
-    baseArangoData.readHistoryData("R_LOCATE_FQDN2IP", historyRelationFqdnAddressIpMap, classOf[BaseEdgeDocument])
-    val value = BaseArangoData.historyRelationFqdnAddressIpMap.keys()
-    while (value.hasMoreElements) {
-      val integer: Integer = value.nextElement()
-      val map: ConcurrentHashMap[String, BaseEdgeDocument] = historyRelationFqdnAddressIpMap.get(integer)
-      val unit = map.keys()
-      while (unit.hasMoreElements) {
-        val key = unit.nextElement()
-        val edgeDocument = map.get(key)
-        // val longs = edgeDocument.getAttribute("DNS_CNT_RECENT").asInstanceOf[util.ArrayList[Long]]
-        // val strings = edgeDocument.getAttribute("DIST_CIP").asInstanceOf[util.ArrayList[String]]
-        val strings = edgeDocument.getAttribute("DIST_CIP").asInstanceOf[Array[String]]
-        val longs = edgeDocument.getAttribute("DNS_CNT_RECENT").asInstanceOf[Array[java.lang.Long]]
-        println(longs.toString + "---" + strings.toString)
-      }
-    }
-  }
-}

View File

@@ -1,9 +1,14 @@
 package cn.ac.iie.spark
-import cn.ac.iie.spark.rdd.ReadOptions
+import cn.ac.iie.config.ApplicationConfig
+import cn.ac.iie.dao.BaseClickhouseData
+import cn.ac.iie.spark.partition.CustomPartitioner
+import cn.ac.iie.spark.rdd.{ArangoRdd, ReadOptions}
 import cn.ac.iie.utils.SparkSessionUtil
 import com.arangodb.entity.BaseDocument
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.functions.{collect_list, max, min}
 import org.apache.spark.storage.StorageLevel
 object RDDTest {
@@ -14,30 +19,57 @@ object RDDTest {
     println(sparkContext.getConf.get("arangodb.hosts"))
     // val options = ReadOptions("iplearn_media_domain").copy(collection = "R_LOCATE_FQDN2IP")
-    val options = ReadOptions("ip-learning-test-0")
+    val options = ReadOptions(ApplicationConfig.ARANGODB_DB_NAME)
     val ipOptions = options.copy(collection = "IP")
-    val rdd = ArangoSpark.load[BaseDocument](sparkContext,"IP",options)
+    val rdd: ArangoRdd[BaseDocument] = ArangoSpark.load[BaseDocument](sparkContext,"IP",options)
     println(rdd.count())
     println(rdd.getNumPartitions)
+    val ipRDD = mergeVertexIp()
+    val value: RDD[(String, (Option[BaseDocument], Option[Row]))] = rdd.map(doc => {
+      (doc.getKey, doc)
+    }).fullOuterJoin(ipRDD)
+    value.foreach((row: (String, (Option[BaseDocument], Option[Row]))) => {
+      val value = row._2._2
+      val str: String = value match {
+        case Some(r) => r.getAs[String]("IP")
+        // case None => null
+        case _ => null
+      }
+      println(str)
+    })
+    /*
     val value: RDD[BaseDocument] = rdd.filter(doc => doc.getAttribute("CLIENT_SESSION_COUNT").asInstanceOf[Long] > 100).map(doc => {
       doc.addAttribute("abc", 1)
       doc
     })
     value.map(doc => {(doc.getKey,doc)})
     value.persist(StorageLevel.MEMORY_AND_DISK)
-    value.foreach(fqdnRow => println(fqdnRow.toString))
+    value.foreach(row => println(row.toString))
     println(value.count())
+    */
     SparkSessionUtil.spark.close()
     System.exit(0)
   }
+  def mergeVertexIp(): RDD[(String,Row)]={
+    val vertexIpDf = BaseClickhouseData.getVertexIpDf
+    val frame = vertexIpDf.groupBy("IP").agg(
+      min("FIRST_FOUND_TIME").alias("FIRST_FOUND_TIME"),
+      max("LAST_FOUND_TIME").alias("LAST_FOUND_TIME"),
+      collect_list("SESSION_COUNT").alias("SESSION_COUNT_LIST"),
+      collect_list("BYTES_SUM").alias("BYTES_SUM_LIST"),
+      collect_list("ip_type").alias("ip_type_list")
+    )
+    val values = frame.rdd.map(row => (row.getAs[String]("IP"), row))
+      .partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    values
+  }
 }