Refactor code based on the custom ArangoRdd

wanglihui
2020-11-10 16:59:39 +08:00
parent db5ca9db08
commit b62131dacd
14 changed files with 631 additions and 310 deletions
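For orientation before the per-file hunks: the refactor drops the in-memory historyVertex*/historyRelation* caches in BaseArangoData and instead loads the existing ArangoDB documents as an RDD through the project's ArangoSpark/ArangoRdd API, full-outer-joins them with the fresh ClickHouse aggregates, and writes the merged documents back with an overwrite insert. A minimal sketch of that flow, assuming the ArangoSpark.load, ReadOptions and CustomPartitioner signatures shown in the hunks below (the collection name and key extraction are illustrative):

```scala
import cn.ac.iie.config.ApplicationConfig
import cn.ac.iie.spark.ArangoSpark
import cn.ac.iie.spark.partition.CustomPartitioner
import cn.ac.iie.spark.rdd.ReadOptions
import cn.ac.iie.utils.SparkSessionUtil.sparkContext
import com.arangodb.entity.BaseDocument
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

object JoinFlowSketch {
  // Database name comes from application config, as in MergeDataFrame.
  private val options = ReadOptions(ApplicationConfig.ARANGODB_DB_NAME)

  // History documents from ArangoDB joined against keyed ClickHouse rows.
  def joinWithHistory(collection: String,
                      newRows: RDD[(String, Row)]): RDD[(String, (Option[BaseDocument], Option[Row]))] = {
    val partitioned = newRows.partitionBy(
      new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
    val history = ArangoSpark.load[BaseDocument](sparkContext, collection, options)
    // Left side: existing document (if any); right side: new aggregate row (if any).
    history.map(doc => (doc.getKey, doc)).fullOuterJoin(partitioned)
  }
}
```

Each updateDocument call in UpdateDocument then walks such a joined RDD partition by partition, builds or updates the document for each key, and batches the results into arangoManger.overwrite.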

View File

@@ -6,7 +6,6 @@ import cn.ac.iie.utils.ArangoDBConnect;
 import cn.ac.iie.utils.ExecutorThreadPool;
 import com.arangodb.ArangoCursor;
 import com.arangodb.entity.BaseDocument;
-import com.arangodb.entity.BaseEdgeDocument;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -20,15 +19,6 @@ import java.util.concurrent.CountDownLatch;
  */
 public class BaseArangoData {
     private static final Logger LOG = LoggerFactory.getLogger(BaseArangoData.class);
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseDocument>> historyVertexFqdnMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseDocument>> historyVertexIpMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseDocument>> historyVertexSubscriberMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseEdgeDocument>> historyRelationFqdnAddressIpMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseEdgeDocument>> historyRelationIpVisitFqdnMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseEdgeDocument>> historyRelationFqdnSameFqdnMap = new ConcurrentHashMap<>();
-    public static ConcurrentHashMap<Integer, ConcurrentHashMap<String, BaseEdgeDocument>> historyRelationSubsciberLocateIpMap = new ConcurrentHashMap<>();
     private static ArangoDBConnect arangoDBConnect = ArangoDBConnect.getInstance();
     private ExecutorThreadPool threadPool = ExecutorThreadPool.getInstance();
@@ -82,47 +72,4 @@ public class BaseArangoData {
         return "FOR doc IN " + table + " limit "+offsetNum+","+sepNum+" RETURN doc";
     }
-    private long[] getTimeRange(String table) {
-        long minTime = 0L;
-        long maxTime = 0L;
-        long startTime = System.currentTimeMillis();
-        String sql = "LET doc = (FOR doc IN " + table + " RETURN doc) return {max_time:MAX(doc[*].FIRST_FOUND_TIME),min_time:MIN(doc[*].FIRST_FOUND_TIME)}";
-        switch (ApplicationConfig.ARANGO_TIME_LIMIT_TYPE()) {
-            case 0:
-                ArangoCursor<BaseDocument> timeDoc = arangoDBConnect.executorQuery(sql, BaseDocument.class);
-                try {
-                    if (timeDoc != null) {
-                        while (timeDoc.hasNext()) {
-                            BaseDocument doc = timeDoc.next();
-                            maxTime = Long.parseLong(doc.getAttribute("max_time").toString()) + ApplicationConfig.THREAD_POOL_NUMBER();
-                            minTime = Long.parseLong(doc.getAttribute("min_time").toString());
-                        }
-                    } else {
-                        LOG.warn("获取ArangoDb时间范围为空");
-                    }
-                } catch (Exception e) {
-                    e.printStackTrace();
-                }
-                break;
-            case 1:
-                maxTime = ApplicationConfig.READ_ARANGO_MAX_TIME();
-                minTime = ApplicationConfig.READ_ARANGO_MIN_TIME();
-                break;
-            default:
-        }
-        long lastTime = System.currentTimeMillis();
-        LOG.warn(sql + "\n查询最大最小时间用时" + (lastTime - startTime));
-        return new long[]{minTime, maxTime};
-    }
-    private String getQuerySql(long[] timeRange, int threadNumber, String table) {
-        long minTime = timeRange[0];
-        long maxTime = timeRange[1];
-        long diffTime = (maxTime - minTime) / ApplicationConfig.THREAD_POOL_NUMBER();
-        long maxThreadTime = minTime + (threadNumber + 1) * diffTime;
-        long minThreadTime = minTime + threadNumber * diffTime;
-        return "FOR doc IN " + table + " filter doc.FIRST_FOUND_TIME >= " + minThreadTime + " and doc.FIRST_FOUND_TIME <= " + maxThreadTime + " " + ApplicationConfig.ARANGODB_READ_LIMIT() + " RETURN doc";
-    }
 }

View File

@@ -1,45 +1,39 @@
 #spark任务配置
-spark.sql.shuffle.partitions=5
+spark.sql.shuffle.partitions=10
 spark.executor.memory=4g
 spark.app.name=test
 spark.network.timeout=300s
-repartitionNumber=36
 spark.serializer=org.apache.spark.serializer.KryoSerializer
 master=local[*]
 #spark读取clickhouse配置
-spark.read.clickhouse.url=jdbc:clickhouse://192.168.40.186:8123/tsg_galaxy_v3
+spark.read.clickhouse.url=jdbc:clickhouse://192.168.44.67:8123/tsg_galaxy_v3
 spark.read.clickhouse.driver=ru.yandex.clickhouse.ClickHouseDriver
 spark.read.clickhouse.user=default
-spark.read.clickhouse.password=111111
+spark.read.clickhouse.password=ceiec2019
-spark.read.clickhouse.numPartitions=144
+spark.read.clickhouse.numPartitions=5
 spark.read.clickhouse.fetchsize=10000
-spark.read.clickhouse.partitionColumn=common_recv_time
+spark.read.clickhouse.partitionColumn=LAST_FOUND_TIME
 clickhouse.socket.timeout=300000
 #arangoDB配置
 arangoDB.host=192.168.40.182
 arangoDB.port=8529
 arangoDB.user=upsert
 arangoDB.password=ceiec2018
-#arangoDB.DB.name=insert_iplearn_index
+arangoDB.DB.name=ip-learning-test-0
-arangoDB.DB.name=iplearn_media_domain
+#arangoDB.DB.name=iplearn_media_domain
 arangoDB.ttl=3600
-thread.pool.number=5
+thread.pool.number=10
 #读取clickhouse时间范围方式0读取过去一小时1指定时间范围
-clickhouse.time.limit.type=0
+clickhouse.time.limit.type=1
-read.clickhouse.max.time=1571245220
+read.clickhouse.max.time=1603785961
-read.clickhouse.min.time=1571245210
+read.clickhouse.min.time=1603354682
-#读取arangoDB时间范围方式0正常读1指定时间范围
-arango.time.limit.type=0
-read.arango.max.time=1571245320
-read.arango.min.time=1571245200
-arangoDB.read.limit=
+arangoDB.read.limit=1
 update.arango.batch=10000
 distinct.client.ip.num=10000
 recent.count.hour=24
-update.interval=10800
+update.interval=3600

View File

@@ -36,12 +36,7 @@ object ApplicationConfig {
   val READ_CLICKHOUSE_MAX_TIME: Long = config.getLong("read.clickhouse.max.time")
   val READ_CLICKHOUSE_MIN_TIME: Long = config.getLong("read.clickhouse.min.time")
-  val ARANGO_TIME_LIMIT_TYPE: Int = config.getInt("arango.time.limit.type")
-  val READ_ARANGO_MAX_TIME: Long = config.getLong("read.arango.max.time")
-  val READ_ARANGO_MIN_TIME: Long = config.getLong("read.arango.min.time")
-  val ARANGODB_READ_LIMIT: String = config.getString("arangoDB.read.limit")
+  val ARANGODB_READ_LIMIT: Int = config.getInt("arangoDB.read.limit")
   val UPDATE_ARANGO_BATCH: Int = config.getInt("update.arango.batch")
   val RECENT_COUNT_HOUR: Int = config.getInt("recent.count.hour")
   val DISTINCT_CLIENT_IP_NUM: Int = config.getInt("distinct.client.ip.num")

View File

@@ -11,7 +11,7 @@ object BaseClickhouseData {
   val currentHour: Long = System.currentTimeMillis / (60 * 60 * 1000) * 60 * 60
   private val timeLimit: (Long, Long) = getTimeLimit
-  private def initClickhouseData(sql:String): Unit ={
+  private def initClickhouseData(sql:String): DataFrame ={
     val dataFrame: DataFrame = spark.read.format("jdbc")
       .option("url", ApplicationConfig.SPARK_READ_CLICKHOUSE_URL)
@@ -28,6 +28,8 @@ object BaseClickhouseData {
       .load()
     dataFrame.printSchema()
     dataFrame.createOrReplaceGlobalTempView("dbtable")
+    dataFrame
   }
   def loadConnectionDataFromCk(): Unit ={
@@ -68,41 +70,7 @@ object BaseClickhouseData {
     initClickhouseData(sql)
   }
-  def getVertexFqdnDf: DataFrame ={
-    loadConnectionDataFromCk()
-    val sql =
-      """
-        |SELECT
-        | FQDN,MAX( LAST_FOUND_TIME ) AS LAST_FOUND_TIME,MIN( FIRST_FOUND_TIME ) AS FIRST_FOUND_TIME
-        |FROM
-        | (
-        | (SELECT
-        | ssl_sni AS FQDN,MAX( common_recv_time ) AS LAST_FOUND_TIME,MIN( common_recv_time ) AS FIRST_FOUND_TIME
-        | FROM
-        | global_temp.dbtable
-        | WHERE
-        | common_schema_type = 'SSL' GROUP BY ssl_sni
-        | )
-        | UNION ALL
-        | (SELECT
-        | http_host AS FQDN,MAX( common_recv_time ) AS LAST_FOUND_TIME,MIN( common_recv_time ) AS FIRST_FOUND_TIME
-        | FROM
-        | global_temp.dbtable
-        | WHERE
-        | common_schema_type = 'HTTP' GROUP BY http_host
-        | )
-        | )
-        |GROUP BY
-        | FQDN
-        |HAVING
-        | FQDN != ''
-      """.stripMargin
-    LOG.warn(sql)
-    val vertexFqdnDf = spark.sql(sql)
-    vertexFqdnDf.printSchema()
-    vertexFqdnDf
-  }
+  /*
   def getVertexIpDf: DataFrame ={
     loadConnectionDataFromCk()
     val sql =
@@ -190,6 +158,149 @@ object BaseClickhouseData {
     relationFqdnLocateIpDf.printSchema()
     relationFqdnLocateIpDf
   }
+  */
+  def getVertexFqdnDf: DataFrame ={
+    val sql =
+      """
+        |(SELECT
+        | FQDN,MAX( LAST_FOUND_TIME ) AS LAST_FOUND_TIME,MIN( FIRST_FOUND_TIME ) AS FIRST_FOUND_TIME
+        |FROM
+        | ((SELECT
+        | ssl_sni AS FQDN,MAX( common_recv_time ) AS LAST_FOUND_TIME,MIN( common_recv_time ) AS FIRST_FOUND_TIME
+        | FROM tsg_galaxy_v3.connection_record_log
+        | WHERE common_schema_type = 'SSL' GROUP BY ssl_sni
+        | )UNION ALL
+        | (SELECT
+        | http_host AS FQDN,MAX( common_recv_time ) AS LAST_FOUND_TIME,MIN( common_recv_time ) AS FIRST_FOUND_TIME
+        | FROM tsg_galaxy_v3.connection_record_log
+        | WHERE common_schema_type = 'HTTP' GROUP BY http_host))
+        |GROUP BY FQDN HAVING FQDN != '') as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
+  def getVertexIpDf: DataFrame ={
+    val where = "common_recv_time >= " + timeLimit._2 + " AND common_recv_time < " + timeLimit._1
+    val sql =
+      s"""
+        |(SELECT * FROM
+        |((SELECT common_client_ip AS IP,MIN(common_recv_time) AS FIRST_FOUND_TIME,
+        |MAX(common_recv_time) AS LAST_FOUND_TIME,
+        |count(*) as SESSION_COUNT,
+        |SUM(common_c2s_byte_num+common_s2c_byte_num) as BYTES_SUM,
+        |groupUniqArray(2)(common_link_info_c2s)[2] as common_link_info,
+        |'client' as ip_type
+        |FROM tsg_galaxy_v3.connection_record_log
+        |where $where
+        |group by common_client_ip)
+        |UNION ALL
+        |(SELECT common_server_ip AS IP,
+        |MIN(common_recv_time) AS FIRST_FOUND_TIME,
+        |MAX(common_recv_time) AS LAST_FOUND_TIME,
+        |count(*) as SESSION_COUNT,
+        |SUM(common_c2s_byte_num+common_s2c_byte_num) as BYTES_SUM,
+        |groupUniqArray(2)(common_link_info_s2c)[2] as common_link_info,
+        |'server' as ip_type
+        |FROM tsg_galaxy_v3.connection_record_log
+        |where $where
+        |group by common_server_ip))) as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
+  def getRelationFqdnLocateIpDf: DataFrame ={
+    val where = "common_recv_time >= " + timeLimit._2 + " AND common_recv_time < " + timeLimit._1
+    val sql =
+      s"""
+        |(SELECT * FROM
+        |((SELECT ssl_sni AS FQDN,common_server_ip,MAX(common_recv_time) AS LAST_FOUND_TIME,MIN(common_recv_time) AS FIRST_FOUND_TIME,COUNT(*) AS COUNT_TOTAL,
+        |toString(groupUniqArray(${ApplicationConfig.DISTINCT_CLIENT_IP_NUM})(common_client_ip)) AS DIST_CIP_RECENT,'TLS' AS schema_type
+        |FROM tsg_galaxy_v3.connection_record_log
+        |WHERE $where and common_schema_type = 'SSL' GROUP BY ssl_sni,common_server_ip)
+        |UNION ALL
+        |(SELECT http_host AS FQDN,common_server_ip,MAX(common_recv_time) AS LAST_FOUND_TIME,MIN(common_recv_time) AS FIRST_FOUND_TIME,COUNT(*) AS COUNT_TOTAL,
+        |toString(groupUniqArray(${ApplicationConfig.DISTINCT_CLIENT_IP_NUM})(common_client_ip)) AS DIST_CIP_RECENT,'HTTP' AS schema_type
+        |FROM tsg_galaxy_v3.connection_record_log
+        |WHERE $where and common_schema_type = 'HTTP' GROUP BY http_host,common_server_ip))
+        |WHERE FQDN != '') as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
+  def getRelationSubidLocateIpDf: DataFrame ={
+    val where =
+      s"""
+        | common_recv_time >= ${timeLimit._2}
+        | AND common_recv_time < ${timeLimit._1}
+        | AND common_subscriber_id != ''
+        | AND radius_framed_ip != ''
+      """.stripMargin
+    val sql =
+      s"""
+        |(
+        |SELECT common_subscriber_id,radius_framed_ip,MAX(common_recv_time) as LAST_FOUND_TIME,MIN(common_recv_time) as FIRST_FOUND_TIME
+        |FROM radius_record_log
+        |WHERE $where GROUP BY common_subscriber_id,radius_framed_ip
+        |) as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
+  def getVertexSubidDf: DataFrame ={
+    val where =
+      s"""
+        | common_recv_time >= ${timeLimit._2}
+        | AND common_recv_time < ${timeLimit._1}
+        | AND common_subscriber_id != ''
+        | AND radius_framed_ip != ''
+      """.stripMargin
+    val sql =
+      s"""
+        |(
+        |SELECT common_subscriber_id,MAX(common_recv_time) as LAST_FOUND_TIME,MIN(common_recv_time) as FIRST_FOUND_TIME FROM radius_record_log
+        |WHERE $where GROUP BY common_subscriber_id
+        |)as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
+  def getVertexFramedIpDf: DataFrame ={
+    val where =
+      s"""
+        | common_recv_time >= ${timeLimit._2}
+        | AND common_recv_time < ${timeLimit._1}
+        | AND common_subscriber_id != ''
+        | AND radius_framed_ip != ''
+      """.stripMargin
+    val sql =
+      s"""
+        |(
+        |SELECT DISTINCT radius_framed_ip,common_recv_time as LAST_FOUND_TIME FROM radius_record_log WHERE $where
+        |)as dbtable
+      """.stripMargin
+    LOG.warn(sql)
+    val frame = initClickhouseData(sql)
+    frame.printSchema()
+    frame
+  }
   private def getTimeLimit: (Long,Long) ={
     var maxTime = 0L

View File

@@ -4,37 +4,57 @@ import java.util.regex.Pattern
 import cn.ac.iie.config.ApplicationConfig
 import cn.ac.iie.dao.BaseClickhouseData
+import cn.ac.iie.spark.ArangoSpark
 import cn.ac.iie.spark.partition.CustomPartitioner
+import cn.ac.iie.spark.rdd.ReadOptions
+import com.arangodb.entity.{BaseDocument, BaseEdgeDocument}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.functions._
 import org.slf4j.LoggerFactory
+import cn.ac.iie.utils.SparkSessionUtil._
 object MergeDataFrame {
   private val LOG = LoggerFactory.getLogger(MergeDataFrame.getClass)
   private val pattern = Pattern.compile("^[\\d]*$")
+  private val options = ReadOptions(ApplicationConfig.ARANGODB_DB_NAME)
-  def mergeVertexFqdn(): RDD[Row] ={
-    BaseClickhouseData.getVertexFqdnDf
-      .rdd.filter(row => isDomain(row.getAs[String](0))).map(row => (row.get(0),row))
-      .partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)).values
+  def mergeVertexFqdn(): RDD[(String, (Option[BaseDocument], Option[Row]))] ={
+    val fqdnAccmu = getLongAccumulator("FQDN Accumulator")
+    val fqdnRddRow = BaseClickhouseData.getVertexFqdnDf
+      .rdd.filter(row => isDomain(row.getAs[String](0))).map(row => {
+        fqdnAccmu.add(1)
+        (row.getAs[String]("FQDN"), row)
+      }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    fqdnRddRow.cache()
+    val fqdnRddDoc = ArangoSpark.load[BaseDocument](sparkContext,"FQDN",options)
+    fqdnRddDoc.map(doc => (doc.getKey, doc)).fullOuterJoin(fqdnRddRow)
   }
-  def mergeVertexIp(): RDD[Row]={
+  def mergeVertexIp(): RDD[(String, (Option[BaseDocument], Option[Row]))]={
+    val ipAccum = getLongAccumulator("IP Accumulator")
     val vertexIpDf = BaseClickhouseData.getVertexIpDf
     val frame = vertexIpDf.groupBy("IP").agg(
       min("FIRST_FOUND_TIME").alias("FIRST_FOUND_TIME"),
       max("LAST_FOUND_TIME").alias("LAST_FOUND_TIME"),
       collect_list("SESSION_COUNT").alias("SESSION_COUNT_LIST"),
       collect_list("BYTES_SUM").alias("BYTES_SUM_LIST"),
-      collect_list("ip_type").alias("ip_type_list")
+      collect_list("ip_type").alias("ip_type_list"),
+      last("common_link_info").alias("common_link_info")
     )
-    val values = frame.rdd.map(row => (row.get(0), row))
-      .partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)).values
-    values
+    val ipRddRow = frame.rdd.map(row => {
+      ipAccum.add(1)
+      (row.getAs[String]("IP"), row)
+    }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    val ipRddDoc = ArangoSpark.load[BaseDocument](sparkContext,"IP",options)
+    ipRddDoc.map(doc => (doc.getKey, doc)).fullOuterJoin(ipRddRow)
   }
-  def mergeRelationFqdnLocateIp(): RDD[Row] ={
+  def mergeRelationFqdnLocateIp(): RDD[(String, (Option[BaseEdgeDocument], Option[Row]))] ={
+    val fqdnLocIpAccum = getLongAccumulator("R_LOCATE_FQDN2IP Accumulator")
     val frame = BaseClickhouseData.getRelationFqdnLocateIpDf.filter(row => isDomain(row.getAs[String]("FQDN")))
       .groupBy("FQDN", "common_server_ip")
       .agg(
@@ -44,13 +64,61 @@
         collect_list("schema_type").alias("schema_type_list"),
         collect_set("DIST_CIP_RECENT").alias("DIST_CIP_RECENT")
       )
-    frame.rdd.map(row => {
+    val fqdnLocIpRddRow = frame.rdd.map(row => {
       val fqdn = row.getAs[String]("FQDN")
       val serverIp = row.getAs[String]("common_server_ip")
       val key = fqdn.concat("-" + serverIp)
+      fqdnLocIpAccum.add(1)
       (key, row)
-    }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)).values
+    }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    val fqdnLocIpRddDoc = ArangoSpark.load[BaseEdgeDocument](sparkContext,"R_LOCATE_FQDN2IP",options)
+    fqdnLocIpRddDoc.map(doc => (doc.getKey, doc)).fullOuterJoin(fqdnLocIpRddRow)
+  }
+  def mergeRelationSubidLocateIp(): RDD[(String, (Option[BaseEdgeDocument], Option[Row]))] ={
+    val subidLocIpAccum = getLongAccumulator("R_LOCATE_SUBSCRIBER2IP Accumulator")
+    val subidLocIpRddRow = BaseClickhouseData.getRelationSubidLocateIpDf
+      .repartition(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)
+      .rdd.map(row => {
+        val commonSubscriberId = row.getAs[String]("common_subscriber_id")
+        val ip = row.getAs[String]("radius_framed_ip")
+        val key = commonSubscriberId.concat("-" + ip)
+        subidLocIpAccum.add(1)
+        (key, row)
+      }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    val subidLocIpRddDoc = ArangoSpark.load[BaseEdgeDocument](sparkContext,"R_LOCATE_SUBSCRIBER2IP",options)
+    subidLocIpRddDoc.map(doc => (doc.getKey, doc)).fullOuterJoin(subidLocIpRddRow)
+  }
+  def mergeVertexSubid(): RDD[(String, (Option[BaseDocument], Option[Row]))] ={
+    val subidAccum = getLongAccumulator("SUBSCRIBER Accumulator")
+    val subidRddRow = BaseClickhouseData.getVertexSubidDf
+      .repartition(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)
+      .rdd.map(row => {
+        val commonSubscriberId = row.getAs[String]("common_subscriber_id")
+        subidAccum.add(1)
+        (commonSubscriberId, row)
+      }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    val subidRddDoc = ArangoSpark.load[BaseDocument](sparkContext,"SUBSCRIBER",options)
+    subidRddDoc.map(doc => (doc.getKey, doc)).fullOuterJoin(subidRddRow)
+  }
+  def mergeVertexFrameIp: RDD[Row] ={
+    val framedIpAccum = getLongAccumulator("framed ip Accumulator")
+    val values = BaseClickhouseData.getVertexFramedIpDf
+      .repartition(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)
+      .rdd.map(row => {
+        val ip = row.getAs[String]("radius_framed_ip")
+        framedIpAccum.add(1)
+        (ip, row)
+      }).partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS)).values
+    values
   }
   private def isDomain(fqdn: String): Boolean = {
@@ -58,14 +126,10 @@ object MergeDataFrame {
     if (fqdn == null || fqdn.length == 0) {
       return false
     }
-    if (fqdn.contains(":")) {
-      val s = fqdn.split(":")(0)
-      if (s.contains(":")){
-        return false
-      }
-    }
-    val fqdnArr = fqdn.split("\\.")
-    if (fqdnArr.length < 4 || fqdnArr.length > 4){
+    val fqdnArr = fqdn.split(":")(0).split("\\.")
+    if (fqdnArr.length != 4){
       return true
     }
     for (f <- fqdnArr) {
@@ -83,6 +147,7 @@
       LOG.error("解析域名 " + fqdn + " 失败:\n" + e.toString)
     }
     false
   }
 }

View File

@@ -1,7 +1,8 @@
 package cn.ac.iie.service.update
-import java.lang
+import java.util
+import scala.collection.JavaConversions._
 import cn.ac.iie.config.ApplicationConfig
 import cn.ac.iie.service.read.ReadHistoryArangoData
@@ -14,17 +15,25 @@ object UpdateDocHandler {
   val PROTOCOL_SET: Set[String] = Set("HTTP","TLS","DNS")
   def updateMaxAttribute(hisDoc: BaseDocument,newAttribute:Long,attributeName:String): Unit ={
+    if(hisDoc.getProperties.containsKey(attributeName)){
       var hisAttritube = hisDoc.getAttribute(attributeName).toString.toLong
       if (newAttribute > hisAttritube){
        hisAttritube = newAttribute
      }
      hisDoc.addAttribute(attributeName,hisAttritube)
    }
+  }
   def updateSumAttribute(hisDoc: BaseDocument,newAttribute:Long,attributeName:String): Unit ={
+    if (hisDoc.getProperties.containsKey(attributeName)){
      val hisAttritube = hisDoc.getAttribute(attributeName).toString.toLong
      hisDoc.addAttribute(attributeName,newAttribute+hisAttritube)
    }
+  }
+  def replaceAttribute(hisDoc: BaseDocument,newAttribute:String,attributeName:String): Unit ={
+    hisDoc.addAttribute(attributeName,newAttribute)
+  }
   def separateAttributeByIpType(ipTypeList:ofRef[String],
                                 sessionCountList:ofRef[AnyRef],
@@ -62,20 +71,22 @@
   }
   def updateProtocolAttritube(hisDoc:BaseEdgeDocument, protocolMap: Map[String, Long]): Unit ={
+    if (hisDoc.getProperties.containsKey("PROTOCOL_TYPE")){
      var protocolType = hisDoc.getAttribute("PROTOCOL_TYPE").toString
-      protocolMap.foreach(t => {
+      protocolMap.foreach((t: (String, Long)) => {
        if (t._2 > 0 && !protocolType.contains(t._1)){
          protocolType = protocolType.concat(","+ t._1)
        }
        val cntTotalName = t._1.concat("_CNT_TOTAL")
        val cntRecentName = t._1.concat("_CNT_RECENT")
-        val cntRecent: Array[lang.Long] = hisDoc.getAttribute(cntRecentName).asInstanceOf[Array[java.lang.Long]]
+        val cntRecent = hisDoc.getAttribute(cntRecentName).asInstanceOf[Array[Long]]
        cntRecent.update(0,t._2)
        updateSumAttribute(hisDoc,t._2,cntTotalName)
        hisDoc.addAttribute(cntRecentName,cntRecent)
      })
      hisDoc.addAttribute("PROTOCOL_TYPE",protocolType)
    }
+  }
   def putProtocolAttritube(doc:BaseEdgeDocument, protocolMap: Map[String, Long]): Unit ={
     val protocolTypeBuilder = new mutable.StringBuilder()
@@ -93,9 +104,29 @@ object UpdateDocHandler {
     doc.addAttribute("PROTOCOL_TYPE",protocolTypeBuilder.toString().replaceFirst(",",""))
   }
-  def mergeDistinctIp(distCipRecent:ofRef[ofRef[String]]): Array[String] ={
-    distCipRecent.flatten.distinct.take(ApplicationConfig.DISTINCT_CLIENT_IP_NUM).toArray
-  }
+  def updateProtocolDocument(doc: BaseEdgeDocument): Unit = {
+    if (doc.getProperties.containsKey("PROTOCOL_TYPE")) {
+      for (protocol <- PROTOCOL_SET) {
+        val protocolRecent = protocol + "_CNT_RECENT"
+        val cntRecent: util.ArrayList[Long] = doc.getAttribute(protocolRecent).asInstanceOf[util.ArrayList[Long]]
+        val cntRecentsSrc = cntRecent.toArray().map(_.toString.toLong)
+        val cntRecentsDst = new Array[Long](24)
+        System.arraycopy(cntRecentsSrc, 0, cntRecentsDst, 1, cntRecentsSrc.length - 1)
+        cntRecentsDst(0) = 0L
+        doc.addAttribute(protocolRecent, cntRecentsDst)
+      }
+    }
+  }
+  def mergeDistinctIp(distCipRecent:ofRef[String]): Array[String] ={
+    distCipRecent.flatMap(str => {
+      str.replaceAll("\\[","")
+        .replaceAll("\\]","")
+        .replaceAll("\\'","")
+        .split(",")
+    }).distinct.take(ApplicationConfig.DISTINCT_CLIENT_IP_NUM).toArray
+  }
   def putDistinctIp(doc:BaseEdgeDocument,newDistinctIp:Array[String]): Unit ={
     val map = newDistinctIp.map(ip => {
@@ -106,8 +137,9 @@
   }
   def updateDistinctIp(hisDoc:BaseEdgeDocument,newDistinctIp:Array[String]): Unit ={
-    val hisDistCip = hisDoc.getAttribute("DIST_CIP").asInstanceOf[Array[String]]
-    val hisDistCipTs = hisDoc.getAttribute("DIST_CIP_TS").asInstanceOf[Array[Long]]
+    if (hisDoc.getProperties.containsKey("DIST_CIP") && hisDoc.getProperties.containsKey("DIST_CIP_TS")){
+      val hisDistCip = hisDoc.getAttribute("DIST_CIP").asInstanceOf[util.ArrayList[String]]
+      val hisDistCipTs = hisDoc.getAttribute("DIST_CIP_TS").asInstanceOf[util.ArrayList[Long]]
      if (hisDistCip.length == hisDistCipTs.length){
        val distCipToTsMap: Map[String, Long] = hisDistCip.zip(hisDistCipTs).toMap
        val muDistCipToTsMap: mutable.Map[String, Long] = mutable.Map(distCipToTsMap.toSeq:_*)
@@ -119,5 +151,6 @@
      hisDoc.addAttribute("DIST_CIP_TS",resultMap.values.toArray)
      }
    }
+  }
 }

View File

@@ -1,17 +1,12 @@
 package cn.ac.iie.service.update
 import java.util
-import java.util.concurrent.ConcurrentHashMap
 import cn.ac.iie.config.ApplicationConfig
-import cn.ac.iie.dao.BaseArangoData
-import cn.ac.iie.dao.BaseArangoData._
 import cn.ac.iie.service.transform.MergeDataFrame._
 import cn.ac.iie.service.update.UpdateDocHandler._
-import cn.ac.iie.utils.{ArangoDBConnect, ExecutorThreadPool, SparkSessionUtil}
-import cn.ac.iie.utils.SparkSessionUtil.spark
+import cn.ac.iie.utils.{ArangoDBConnect, SparkSessionUtil}
 import com.arangodb.entity.{BaseDocument, BaseEdgeDocument}
-import org.apache.spark.TaskContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.Row
 import org.slf4j.LoggerFactory
@@ -19,44 +14,47 @@ import org.slf4j.LoggerFactory
 import scala.collection.mutable.WrappedArray.ofRef
 object UpdateDocument {
-  private val pool = ExecutorThreadPool.getInstance
   private val arangoManger: ArangoDBConnect = ArangoDBConnect.getInstance()
   private val LOG = LoggerFactory.getLogger(UpdateDocument.getClass)
-  private val baseArangoData = new BaseArangoData()
   def update(): Unit = {
     try {
-      updateDocument("FQDN", historyVertexFqdnMap, getVertexFqdnRow, classOf[BaseDocument], mergeVertexFqdn)
-      updateDocument("IP", historyVertexIpMap, getVertexIpRow, classOf[BaseDocument], mergeVertexIp)
-      updateDocument("R_LOCATE_FQDN2IP", historyRelationFqdnAddressIpMap, getRelationFqdnLocateIpRow, classOf[BaseEdgeDocument], mergeRelationFqdnLocateIp)
+      updateDocument("FQDN", getVertexFqdnRow, mergeVertexFqdn)
+      updateDocument("SUBSCRIBER",getVertexSubidRow,mergeVertexSubid)
+      insertFrameIp()
+      updateDocument("R_LOCATE_SUBSCRIBER2IP",getRelationSubidLocateIpRow,mergeRelationSubidLocateIp)
+      updateDocument("R_LOCATE_FQDN2IP", getRelationFqdnLocateIpRow, mergeRelationFqdnLocateIp)
+      updateDocument("IP", getVertexIpRow, mergeVertexIp)
     } catch {
       case e: Exception => e.printStackTrace()
     } finally {
-      pool.shutdown()
       arangoManger.clean()
      SparkSessionUtil.closeSpark()
+      System.exit(0)
    }
  }
   private def updateDocument[T <: BaseDocument](collName: String,
-                                                historyMap: ConcurrentHashMap[Integer, ConcurrentHashMap[String, T]],
-                                                getDocumentRow: (Row, ConcurrentHashMap[String, T]) => T,
-                                                clazz: Class[T],
-                                                getNewDataRdd: () => RDD[Row]
+                                                getDocumentRow: ((String, (Option[T], Option[Row]))) => T,
+                                                getJoinRdd: () => RDD[(String, (Option[T], Option[Row]))]
                                                ): Unit = {
-    baseArangoData.readHistoryData(collName, historyMap, clazz)
-    val hisBc = spark.sparkContext.broadcast(historyMap)
     try {
       val start = System.currentTimeMillis()
-      val newDataRdd = getNewDataRdd()
-      newDataRdd.foreachPartition(iter => {
-        val partitionId: Int = TaskContext.get.partitionId
-        val dictionaryMap: ConcurrentHashMap[String, T] = hisBc.value.get(partitionId)
+      val joinRdd = getJoinRdd()
+      joinRdd.foreachPartition(iter => {
        val resultDocumentList = new util.ArrayList[T]
        var i = 0
        iter.foreach(row => {
-          val document = getDocumentRow(row, dictionaryMap)
+          val document = getDocumentRow(row)
+          if (document != null){
            resultDocumentList.add(document)
+          }
          i += 1
          if (i >= ApplicationConfig.UPDATE_ARANGO_BATCH) {
            arangoManger.overwrite(resultDocumentList, collName)
@@ -73,88 +71,238 @@ object UpdateDocument {
       LOG.warn(s"更新$collName 时间:${last - start}")
     } catch {
       case e: Exception => e.printStackTrace()
-    } finally {
-      hisBc.destroy()
     }
   }
-  private def getVertexFqdnRow(row: Row, dictionaryMap: ConcurrentHashMap[String, BaseDocument]): BaseDocument = {
-    val fqdn = row.getAs[String]("FQDN")
-    val lastFoundTime = row.getAs[Long]("LAST_FOUND_TIME")
-    val firstFoundTime = row.getAs[Long]("FIRST_FOUND_TIME")
-    var document: BaseDocument = dictionaryMap.getOrDefault(fqdn, null)
-    if (document != null) {
-      updateMaxAttribute(document, lastFoundTime, "LAST_FOUND_TIME")
-    } else {
-      document = new BaseDocument
-      document.setKey(fqdn)
-      document.addAttribute("FQDN_NAME", fqdn)
-      document.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
-      document.addAttribute("LAST_FOUND_TIME", lastFoundTime)
-    }
-    document
-  }
-  private def getVertexIpRow(row: Row, dictionaryMap: ConcurrentHashMap[String, BaseDocument]): BaseDocument = {
-    val ip = row.getAs[String]("IP")
-    val firstFoundTime = row.getAs[Long]("FIRST_FOUND_TIME")
-    val lastFoundTime = row.getAs[Long]("LAST_FOUND_TIME")
-    val sessionCountList = row.getAs[ofRef[AnyRef]]("SESSION_COUNT_LIST")
-    val bytesSumList = row.getAs[ofRef[AnyRef]]("BYTES_SUM_LIST")
-    val ipTypeList = row.getAs[ofRef[String]]("ip_type_list")
-    val sepAttributeTuple = separateAttributeByIpType(ipTypeList, sessionCountList, bytesSumList)
-    var document = dictionaryMap.getOrDefault(ip, null)
-    if (document != null) {
-      updateMaxAttribute(document, lastFoundTime, "LAST_FOUND_TIME")
-      updateSumAttribute(document, sepAttributeTuple._1, "SERVER_SESSION_COUNT")
-      updateSumAttribute(document, sepAttributeTuple._2, "SERVER_BYTES_SUM")
-      updateSumAttribute(document, sepAttributeTuple._3, "CLIENT_SESSION_COUNT")
-      updateSumAttribute(document, sepAttributeTuple._4, "CLIENT_BYTES_SUM")
-    } else {
-      document = new BaseDocument
-      document.setKey(ip)
-      document.addAttribute("IP",ip)
-      document.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
-      document.addAttribute("LAST_FOUND_TIME", lastFoundTime)
-      document.addAttribute("SERVER_SESSION_COUNT", sepAttributeTuple._1)
-      document.addAttribute("SERVER_BYTES_SUM", sepAttributeTuple._2)
-      document.addAttribute("CLIENT_SESSION_COUNT", sepAttributeTuple._3)
-      document.addAttribute("CLIENT_BYTES_SUM", sepAttributeTuple._4)
-      document.addAttribute("COMMON_LINK_INFO", "")
-    }
-    document
-  }
-  private def getRelationFqdnLocateIpRow(row: Row, dictionaryMap: ConcurrentHashMap[String, BaseEdgeDocument]): BaseEdgeDocument = {
-    val fqdn = row.getAs[String]("FQDN")
-    val serverIp = row.getAs[String]("common_server_ip")
-    val firstFoundTime = row.getAs[Long]("FIRST_FOUND_TIME")
-    val lastFoundTime = row.getAs[Long]("LAST_FOUND_TIME")
-    val countTotalList = row.getAs[ofRef[AnyRef]]("COUNT_TOTAL_LIST")
-    val schemaTypeList = row.getAs[ofRef[AnyRef]]("schema_type_list")
-    val distCipRecent = row.getAs[ofRef[ofRef[String]]]("DIST_CIP_RECENT")
-    val sepAttritubeMap: Map[String, Long] = separateAttributeByProtocol(schemaTypeList, countTotalList)
-    val distinctIp: Array[String] = mergeDistinctIp(distCipRecent)
-    val key = fqdn.concat("-" + serverIp)
-    var document = dictionaryMap.getOrDefault(key, null)
-    if (document != null) {
-      updateMaxAttribute(document, lastFoundTime, "LAST_FOUND_TIME")
-      updateProtocolAttritube(document, sepAttritubeMap)
-      updateDistinctIp(document, distinctIp)
-    } else {
-      document = new BaseEdgeDocument()
-      document.setKey(key)
-      document.setFrom("FQDN/" + fqdn)
-      document.setTo("IP/" + serverIp)
-      document.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
-      document.addAttribute("LAST_FOUND_TIME", lastFoundTime)
-      putProtocolAttritube(document, sepAttritubeMap)
-      putDistinctIp(document, distinctIp)
-    }
-    document
-  }
+  private def insertFrameIp(): Unit ={
+    mergeVertexFrameIp.foreachPartition(iter => {
+      val resultDocumentList = new util.ArrayList[BaseDocument]
+      var i = 0
+      iter.foreach(row => {
+        val document = getVertexFrameipRow(row)
+        resultDocumentList.add(document)
+        i += 1
+        if (i >= ApplicationConfig.UPDATE_ARANGO_BATCH) {
+          arangoManger.overwrite(resultDocumentList, "IP")
+          LOG.warn(s"更新:IP" + i)
+          i = 0
+        }
+      })
+      if (i != 0) {
+        arangoManger.overwrite(resultDocumentList, "IP")
+        LOG.warn(s"更新IP:" + i)
+      }
+    })
+  }
+  private def getVertexFrameipRow(row: Row): BaseDocument ={
+    val ip = row.getAs[String]("radius_framed_ip")
+    val document = new BaseDocument()
+    document.setKey(ip)
+    document.addAttribute("IP",ip)
+    document
+  }
+  private def getRelationSubidLocateIpRow(joinRow: (String, (Option[BaseEdgeDocument], Option[Row]))): BaseEdgeDocument ={
+    val subidLocIpDocOpt = joinRow._2._1
+    var subidLocIpDoc = subidLocIpDocOpt match {
+      case Some(doc) => doc
+      case None => null
+    }
+    val subidLocIpRowOpt = joinRow._2._2
+    val subidLocIpRow = subidLocIpRowOpt match {
+      case Some(r) => r
+      case None => null
+    }
+    if (subidLocIpRow != null){
+      val subId = subidLocIpRow.getAs[String]("common_subscriber_id")
+      val ip = subidLocIpRow.getAs[String]("radius_framed_ip")
+      val lastFoundTime = subidLocIpRow.getAs[Long]("LAST_FOUND_TIME")
+      val firstFoundTime = subidLocIpRow.getAs[Long]("FIRST_FOUND_TIME")
+      val key = subId.concat("-"+ip)
+      if (subidLocIpDoc != null){
+        updateMaxAttribute(subidLocIpDoc,lastFoundTime,"LAST_FOUND_TIME")
+      } else {
+        subidLocIpDoc = new BaseEdgeDocument()
+        subidLocIpDoc.setKey(key)
+        subidLocIpDoc.setFrom("SUBSCRIBER/" + subId)
+        subidLocIpDoc.setTo("IP/" + ip)
+        subidLocIpDoc.addAttribute("SUBSCRIBER",subId)
+        subidLocIpDoc.addAttribute("IP",ip)
+        subidLocIpDoc.addAttribute("FIRST_FOUND_TIME",firstFoundTime)
+        subidLocIpDoc.addAttribute("LAST_FOUND_TIME",lastFoundTime)
+      }
+    }
+    subidLocIpDoc
+  }
+  private def getVertexSubidRow(joinRow: (String, (Option[BaseDocument], Option[Row]))): BaseDocument ={
+    val subidDocOpt = joinRow._2._1
+    var subidDoc = subidDocOpt match {
+      case Some(doc) => doc
+      case None => null
+    }
+    val subidRowOpt = joinRow._2._2
+    val subidRow = subidRowOpt match {
+      case Some(r) => r
+      case None => null
+    }
+    if (subidRow != null){
+      val subId = subidRow.getAs[String]("common_subscriber_id")
+      val subLastFoundTime = subidRow.getAs[Long]("LAST_FOUND_TIME")
+      val subFirstFoundTime = subidRow.getAs[Long]("FIRST_FOUND_TIME")
+      if (subidDoc != null){
+        updateMaxAttribute(subidDoc,subLastFoundTime,"LAST_FOUND_TIME")
+      } else {
+        subidDoc = new BaseDocument()
+        subidDoc.setKey(subId)
+        subidDoc.addAttribute("SUBSCRIBER",subId)
+        subidDoc.addAttribute("FIRST_FOUND_TIME",subFirstFoundTime)
+        subidDoc.addAttribute("LAST_FOUND_TIME",subLastFoundTime)
+      }
+    }
+    subidDoc
+  }
+  private def getVertexFqdnRow(joinRow: (String, (Option[BaseDocument], Option[Row]))): BaseDocument = {
+    val fqdnDocOpt = joinRow._2._1
+    var fqdnDoc = fqdnDocOpt match {
+      case Some(doc) => doc
+      case None => null
+    }
+    val fqdnRowOpt = joinRow._2._2
+    val fqdnRow = fqdnRowOpt match {
+      case Some(r) => r
+      case None => null
+    }
+    if (fqdnRow != null){
+      val fqdn = fqdnRow.getAs[String]("FQDN")
+      val lastFoundTime = fqdnRow.getAs[Long]("LAST_FOUND_TIME")
+      val firstFoundTime = fqdnRow.getAs[Long]("FIRST_FOUND_TIME")
+      if (fqdnDoc != null) {
+        updateMaxAttribute(fqdnDoc, lastFoundTime, "LAST_FOUND_TIME")
+      } else {
+        fqdnDoc = new BaseDocument
+        fqdnDoc.setKey(fqdn)
+        fqdnDoc.addAttribute("FQDN_NAME", fqdn)
+        fqdnDoc.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
+        fqdnDoc.addAttribute("LAST_FOUND_TIME", lastFoundTime)
+      }
+    }
+    fqdnDoc
+  }
+  private def getVertexIpRow(joinRow: (String, (Option[BaseDocument], Option[Row]))): BaseDocument = {
+    val ipDocOpt = joinRow._2._1
+    var ipDoc = ipDocOpt match {
+      case Some(doc) => doc
+      case None => null
+    }
+    val ipRowOpt = joinRow._2._2
+    val ipRow = ipRowOpt match {
+      case Some(r) => r
+      case None => null
+    }
+    if (ipRow != null){
+      val ip = ipRow.getAs[String]("IP")
+      val firstFoundTime = ipRow.getAs[Long]("FIRST_FOUND_TIME")
+      val lastFoundTime = ipRow.getAs[Long]("LAST_FOUND_TIME")
+      val sessionCountList = ipRow.getAs[ofRef[AnyRef]]("SESSION_COUNT_LIST")
+      val bytesSumList = ipRow.getAs[ofRef[AnyRef]]("BYTES_SUM_LIST")
+      val ipTypeList = ipRow.getAs[ofRef[String]]("ip_type_list")
+      val linkInfo = ipRow.getAs[String]("common_link_info")
+      val sepAttributeTuple = separateAttributeByIpType(ipTypeList, sessionCountList, bytesSumList)
+      if (ipDoc != null) {
+        updateMaxAttribute(ipDoc, lastFoundTime, "LAST_FOUND_TIME")
+        updateSumAttribute(ipDoc, sepAttributeTuple._1, "SERVER_SESSION_COUNT")
+        updateSumAttribute(ipDoc, sepAttributeTuple._2, "SERVER_BYTES_SUM")
+        updateSumAttribute(ipDoc, sepAttributeTuple._3, "CLIENT_SESSION_COUNT")
+        updateSumAttribute(ipDoc, sepAttributeTuple._4, "CLIENT_BYTES_SUM")
+        replaceAttribute(ipDoc,linkInfo,"COMMON_LINK_INFO")
+      } else {
+        ipDoc = new BaseDocument
+        ipDoc.setKey(ip)
+        ipDoc.addAttribute("IP", ip)
+        ipDoc.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
+        ipDoc.addAttribute("LAST_FOUND_TIME", lastFoundTime)
+        ipDoc.addAttribute("SERVER_SESSION_COUNT", sepAttributeTuple._1)
+        ipDoc.addAttribute("SERVER_BYTES_SUM", sepAttributeTuple._2)
+        ipDoc.addAttribute("CLIENT_SESSION_COUNT", sepAttributeTuple._3)
+        ipDoc.addAttribute("CLIENT_BYTES_SUM", sepAttributeTuple._4)
+        ipDoc.addAttribute("COMMON_LINK_INFO", "")
+      }
+    }
+    ipDoc
+  }
+  private def getRelationFqdnLocateIpRow(joinRow: (String, (Option[BaseEdgeDocument], Option[Row]))): BaseEdgeDocument = {
+    val fqdnLocIpDocOpt = joinRow._2._1
+    var fqdnLocIpDoc = fqdnLocIpDocOpt match {
+      case Some(doc) => doc
+      case None => null
+    }
+    val fqdnLocIpRowOpt = joinRow._2._2
+    val fqdnLocIpRow = fqdnLocIpRowOpt match {
+      case Some(r) => r
+      case None => null
+    }
+    if (fqdnLocIpDoc != null){
+      updateProtocolDocument(fqdnLocIpDoc)
+    }
+    if (fqdnLocIpRow != null){
+      val fqdn = fqdnLocIpRow.getAs[String]("FQDN")
+      val serverIp = fqdnLocIpRow.getAs[String]("common_server_ip")
+      val firstFoundTime = fqdnLocIpRow.getAs[Long]("FIRST_FOUND_TIME")
+      val lastFoundTime = fqdnLocIpRow.getAs[Long]("LAST_FOUND_TIME")
+      val countTotalList = fqdnLocIpRow.getAs[ofRef[AnyRef]]("COUNT_TOTAL_LIST")
+      val schemaTypeList = fqdnLocIpRow.getAs[ofRef[AnyRef]]("schema_type_list")
+      val distCipRecent = fqdnLocIpRow.getAs[ofRef[String]]("DIST_CIP_RECENT")
+      val sepAttritubeMap: Map[String, Long] = separateAttributeByProtocol(schemaTypeList, countTotalList)
+      val distinctIp: Array[String] = mergeDistinctIp(distCipRecent)
+      val key = fqdn.concat("-" + serverIp)
+      if (fqdnLocIpDoc != null) {
+        updateMaxAttribute(fqdnLocIpDoc, lastFoundTime, "LAST_FOUND_TIME")
+        updateProtocolAttritube(fqdnLocIpDoc, sepAttritubeMap)
+        updateDistinctIp(fqdnLocIpDoc, distinctIp)
+      } else {
+        fqdnLocIpDoc = new BaseEdgeDocument()
+        fqdnLocIpDoc.setKey(key)
+        fqdnLocIpDoc.setFrom("FQDN/" + fqdn)
+        fqdnLocIpDoc.setTo("IP/" + serverIp)
+        fqdnLocIpDoc.addAttribute("FIRST_FOUND_TIME", firstFoundTime)
+        fqdnLocIpDoc.addAttribute("LAST_FOUND_TIME", lastFoundTime)
+        putProtocolAttritube(fqdnLocIpDoc, sepAttritubeMap)
+        putDistinctIp(fqdnLocIpDoc, distinctIp)
+      }
+    }
+    fqdnLocIpDoc
+  }
 }

View File

@@ -24,6 +24,7 @@ package cn.ac.iie.spark
 import cn.ac.iie.spark.rdd.{ArangoRdd, ReadOptions, WriteOptions}
 import cn.ac.iie.spark.vpack.VPackUtils
+import com.arangodb.model.DocumentCreateOptions
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
@@ -76,7 +77,6 @@
   *
   * @param dataframe the dataframe with data to save
   * @param collection the collection to save in
-  * @param options additional write options
   */
 def saveDF(dataframe: DataFrame, collection: String): Unit =
   saveRDD[Row](dataframe.rdd, collection, WriteOptions(), (x: Iterator[Row]) => x.map { y => VPackUtils.rowToVPack(y) })
@@ -102,6 +102,11 @@ object ArangoSpark {
     case WriteOptions.INSERT => col.insertDocuments(docs)
     case WriteOptions.UPDATE => col.updateDocuments(docs)
     case WriteOptions.REPLACE => col.replaceDocuments(docs)
+    case WriteOptions.OVERWRITE =>
+      val documentCreateOptions = new DocumentCreateOptions
+      documentCreateOptions.overwrite(true)
+      documentCreateOptions.silent(true)
+      col.insertDocuments(docs, documentCreateOptions)
   }
   arangoDB.shutdown()
@@ -123,7 +128,7 @@
   *
   * @param sparkContext the sparkContext containing the ArangoDB configuration
   * @param collection the collection to load data from
-  * @param additional read options
+  * @param options read options
   */
 def load[T: ClassTag](sparkContext: SparkContext, collection: String, options: ReadOptions): ArangoRdd[T] =
   new ArangoRdd[T](sparkContext, createReadOptions(options, sparkContext.getConf).copy(collection = collection))
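The new WriteOptions.OVERWRITE branch maps to the ArangoDB Java driver's insert-with-overwrite. A standalone sketch of the same call, assuming a driver version that still exposes DocumentCreateOptions.overwrite(true); the host, credentials and sample key below are placeholders:

```scala
import com.arangodb.ArangoDB
import com.arangodb.entity.BaseDocument
import com.arangodb.model.DocumentCreateOptions

import scala.collection.JavaConverters._

object OverwriteSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder connection settings; the real values live in application.properties.
    val arango = new ArangoDB.Builder().host("127.0.0.1", 8529).user("upsert").password("***").build()
    val col = arango.db("ip-learning-test-0").collection("IP")

    val doc = new BaseDocument()
    doc.setKey("198.51.100.7")
    doc.addAttribute("IP", "198.51.100.7")

    // overwrite(true): insert-or-replace on key collision; silent(true): skip returning metadata.
    val opts = new DocumentCreateOptions().overwrite(true).silent(true)
    col.insertDocuments(Seq(doc).asJava, opts)

    arango.shutdown()
  }
}
```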

View File

@@ -56,7 +56,7 @@ class ArangoRdd[T: ClassTag](@transient override val sparkContext: SparkContext,
   override def repartition(numPartitions: Int)(implicit ord: Ordering[T]): RDD[T] = super.repartition(numPartitions)
   private def getPartition(idx: Int, countTotal: Long): QueryArangoPartition = {
-    val sepNum = countTotal / ApplicationConfig.THREAD_POOL_NUMBER + 1
+    val sepNum = countTotal / ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS + 1
     val offsetNum = idx * sepNum
     new QueryArangoPartition(idx, offsetNum, sepNum)
   }
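After this change each ArangoRdd partition covers countTotal / spark.sql.shuffle.partitions (+1) documents, read through the offset/limit AQL that BaseArangoData builds. A small sketch of the resulting per-partition windows, with the document count and partition number as assumed example values:

```scala
object PartitionWindowSketch {
  def main(args: Array[String]): Unit = {
    val countTotal = 1000L // assumed collection size
    val partitions = 10    // assumed ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS
    val sepNum = countTotal / partitions + 1
    (0 until partitions).foreach { idx =>
      val offsetNum = idx * sepNum
      // Mirrors the query string built in BaseArangoData: one LIMIT window per partition.
      println(s"FOR doc IN IP limit $offsetNum,$sepNum RETURN doc")
    }
  }
}
```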

View File

@@ -116,4 +116,9 @@ object WriteOptions {
     */
   case object REPLACE extends Method
+  /**
+    * save documents by overwrite
+    */
+  case object OVERWRITE extends Method
 }

View File

@@ -1,7 +1,9 @@
 package cn.ac.iie.utils
 import cn.ac.iie.config.ApplicationConfig
+import org.apache.spark.SparkContext
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.util.LongAccumulator
 import org.slf4j.LoggerFactory
 object SparkSessionUtil {
@@ -9,6 +11,8 @@ object SparkSessionUtil {
   val spark: SparkSession = getSparkSession
+  var sparkContext: SparkContext = getContext
   private def getSparkSession: SparkSession ={
     val spark: SparkSession = SparkSession
       .builder()
@@ -26,10 +30,27 @@
     spark
   }
+  def getContext: SparkContext = {
+    @transient var sc: SparkContext = null
+    if (sparkContext == null) sc = spark.sparkContext
+    sc
+  }
+  def getLongAccumulator(name: String): LongAccumulator ={
+    if (sparkContext == null){
+      sparkContext = getContext
+    }
+    sparkContext.longAccumulator(name)
+  }
   def closeSpark(): Unit ={
     if (spark != null){
       spark.stop()
     }
+    if (sparkContext != null){
+      sparkContext.stop()
+    }
   }
 }
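getLongAccumulator backs the per-collection row counters that MergeDataFrame registers; accumulator values are only dependable on the driver after an action has executed. A minimal, self-contained usage sketch with plain Spark APIs:

```scala
import org.apache.spark.sql.SparkSession

object AccumulatorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("acc-sketch").getOrCreate()
    val counter = spark.sparkContext.longAccumulator("FQDN Accumulator")

    // Updated on the executors inside the transformation...
    val rdd = spark.sparkContext.parallelize(1 to 100).map { x => counter.add(1); x }
    rdd.count() // ...but only reliable after an action has run.

    println(s"rows seen: ${counter.value}")
    spark.stop()
  }
}
```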

View File

@@ -7,7 +7,7 @@ import org.apache.spark.sql.SparkSession
 object BaseClickhouseDataTest {
   private val spark: SparkSession = SparkSessionUtil.spark
   def main(args: Array[String]): Unit = {
-    BaseClickhouseData loadConnectionDataFromCk()
+    // BaseClickhouseData loadConnectionDataFromCk()
     val sql =
       """
         |SELECT

View File

@@ -1,35 +0,0 @@
-package cn.ac.iie.service.update
-import java.util
-import java.util.ArrayList
-import java.util.concurrent.ConcurrentHashMap
-import cn.ac.iie.dao.BaseArangoData
-import cn.ac.iie.dao.BaseArangoData._
-import com.arangodb.entity.{BaseDocument, BaseEdgeDocument}
-import scala.collection.mutable.WrappedArray.ofRef
-object UpdateDocumentTest {
-  def main(args: Array[String]): Unit = {
-    val baseArangoData = new BaseArangoData()
-    baseArangoData.readHistoryData("R_LOCATE_FQDN2IP", historyRelationFqdnAddressIpMap, classOf[BaseEdgeDocument])
-    val value = BaseArangoData.historyRelationFqdnAddressIpMap.keys()
-    while (value.hasMoreElements) {
-      val integer: Integer = value.nextElement()
-      val map: ConcurrentHashMap[String, BaseEdgeDocument] = historyRelationFqdnAddressIpMap.get(integer)
-      val unit = map.keys()
-      while (unit.hasMoreElements) {
-        val key = unit.nextElement()
-        val edgeDocument = map.get(key)
-        // val longs = edgeDocument.getAttribute("DNS_CNT_RECENT").asInstanceOf[util.ArrayList[Long]]
-        // val strings = edgeDocument.getAttribute("DIST_CIP").asInstanceOf[util.ArrayList[String]]
-        val strings = edgeDocument.getAttribute("DIST_CIP").asInstanceOf[Array[String]]
-        val longs = edgeDocument.getAttribute("DNS_CNT_RECENT").asInstanceOf[Array[java.lang.Long]]
-        println(longs.toString + "---" + strings.toString)
-      }
-    }
-  }
-}

View File

@@ -1,9 +1,14 @@
 package cn.ac.iie.spark
-import cn.ac.iie.spark.rdd.ReadOptions
+import cn.ac.iie.config.ApplicationConfig
+import cn.ac.iie.dao.BaseClickhouseData
+import cn.ac.iie.spark.partition.CustomPartitioner
+import cn.ac.iie.spark.rdd.{ArangoRdd, ReadOptions}
 import cn.ac.iie.utils.SparkSessionUtil
 import com.arangodb.entity.BaseDocument
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.functions.{collect_list, max, min}
 import org.apache.spark.storage.StorageLevel
 object RDDTest {
@@ -14,30 +19,57 @@ object RDDTest {
     println(sparkContext.getConf.get("arangodb.hosts"))
     // val options = ReadOptions("iplearn_media_domain").copy(collection = "R_LOCATE_FQDN2IP")
-    val options = ReadOptions("ip-learning-test-0")
+    val options = ReadOptions(ApplicationConfig.ARANGODB_DB_NAME)
     val ipOptions = options.copy(collection = "IP")
-    val rdd = ArangoSpark.load[BaseDocument](sparkContext,"IP",options)
+    val rdd: ArangoRdd[BaseDocument] = ArangoSpark.load[BaseDocument](sparkContext,"IP",options)
     println(rdd.count())
     println(rdd.getNumPartitions)
+    val ipRDD = mergeVertexIp()
+    val value: RDD[(String, (Option[BaseDocument], Option[Row]))] = rdd.map(doc => {
+      (doc.getKey, doc)
+    }).fullOuterJoin(ipRDD)
+    value.foreach((row: (String, (Option[BaseDocument], Option[Row]))) => {
+      val value = row._2._2
+      val str: String = value match {
+        case Some(r) => r.getAs[String]("IP")
+        // case None => null
+        case _ => null
+      }
+      println(str)
+    })
+    /*
     val value: RDD[BaseDocument] = rdd.filter(doc => doc.getAttribute("CLIENT_SESSION_COUNT").asInstanceOf[Long] > 100).map(doc => {
       doc.addAttribute("abc", 1)
       doc
     })
     value.map(doc => {(doc.getKey,doc)})
     value.persist(StorageLevel.MEMORY_AND_DISK)
-    value.foreach(fqdnRow => println(fqdnRow.toString))
+    value.foreach(row => println(row.toString))
     println(value.count())
+    */
     SparkSessionUtil.spark.close()
     System.exit(0)
   }
+  def mergeVertexIp(): RDD[(String,Row)]={
+    val vertexIpDf = BaseClickhouseData.getVertexIpDf
+    val frame = vertexIpDf.groupBy("IP").agg(
+      min("FIRST_FOUND_TIME").alias("FIRST_FOUND_TIME"),
+      max("LAST_FOUND_TIME").alias("LAST_FOUND_TIME"),
+      collect_list("SESSION_COUNT").alias("SESSION_COUNT_LIST"),
+      collect_list("BYTES_SUM").alias("BYTES_SUM_LIST"),
+      collect_list("ip_type").alias("ip_type_list")
+    )
+    val values = frame.rdd.map(row => (row.getAs[String]("IP"), row))
+      .partitionBy(new CustomPartitioner(ApplicationConfig.SPARK_SQL_SHUFFLE_PARTITIONS))
+    values
+  }
 }