wanglihui-ip-learning-graph/ip-learning/src/main/scala/cn/ac/iie/test/TestIndices.scala

package cn.ac.iie.test

import com.arangodb.entity.{BaseDocument, BaseEdgeDocument}
import com.arangodb.util.MapBuilder
import com.arangodb.{ArangoCursor, ArangoDB}
import org.apache.spark.sql.{DataFrame, SparkSession}

import scala.util.Try

object TestIndices {

  @transient
  var arangoDB: ArangoDB = _

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("test")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.network.timeout", "300s")
      .config("spark.sql.shuffle.partitions", Config.SPARK_SQL_SHUFFLE_PARTITIONS)
      .config("spark.executor.memory", Config.SPARK_EXECUTOR_MEMORY)
      .master(Config.MASTER)
      .getOrCreate()

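    // Pull the raw media records from ClickHouse over JDBC, partitioned on recv_time
    // so the read is split across numPartitions parallel tasks.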
    val mediaDataFrame: DataFrame = spark.read.format("jdbc")
      .option("url", "jdbc:clickhouse://192.168.40.193:8123")
      .option("dbtable", s"(select media_domain,recv_time,s1_s_ip,s1_d_ip,s1_s_location_region,s1_d_location_region from av_miner.media_expire_patch where recv_time>=${Config.MINTIME} and recv_time<=${Config.MAXTIME})")
      .option("driver", "ru.yandex.clickhouse.ClickHouseDriver")
      .option("user", "default")
      .option("password", "111111")
      .option("numPartitions", Config.NUMPARTITIONS)
      .option("partitionColumn", "recv_time")
      .option("lowerBound", Config.MINTIME)
      .option("upperBound", Config.MAXTIME)
      .option("fetchsize", Config.SPARK_SQL_READ_FETCHSIZE)
      .load()
    mediaDataFrame.printSchema()
    mediaDataFrame.createOrReplaceGlobalTempView("media_expire_patch")

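    // Aggregate per domain: first and last time each FQDN was seen, plus how often it occurred.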
    val v_FQDN_DF = spark.sql(
      """
        |SELECT
        |  media_domain AS FQDN_NAME,
        |  MIN( recv_time ) AS FQDN_FIRST_FOUND_TIME,
        |  MAX( recv_time ) AS FQDN_LAST_FOUND_TIME,
        |  COUNT( * ) AS FQDN_COUNT_TOTAL
        |FROM
        |  global_temp.media_expire_patch
        |WHERE
        |  media_domain != ''
        |GROUP BY
        |  media_domain
      """.stripMargin
    )

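    // Connect to ArangoDB and preload every existing V_FQDN document into an
    // in-memory map keyed by _key; the elapsed preload time is printed in seconds.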
    val time1 = System.currentTimeMillis()
    arangoDB = new ArangoDB.Builder()
      .maxConnections(Config.MAXPOOLSIZE)
      .host("192.168.40.127", 8529)
      .user("root")
      .password("111111")
      .build
    val dbName = "insert_iplearn_index"
    val collectionName = "V_FQDN"
    val query = "FOR doc IN " + collectionName + " RETURN doc"
    val bindVars = new MapBuilder().get
    val cursor: ArangoCursor[BaseEdgeDocument] = arangoDB.db(dbName).query(query, bindVars, null, classOf[BaseEdgeDocument])
    val cursor_Map = scala.collection.mutable.HashMap[String, BaseEdgeDocument]()
    while (cursor.hasNext) {
      val document = cursor.next()
      cursor_Map += (document.getKey -> document)
    }
    val time2 = System.currentTimeMillis()
    println((time2 - time1) / 1000)

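    // Split the aggregated domains into two batches: documents to insert
    // (domain not yet in V_FQDN) and documents to update (domain already present).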
    val docs_Insert = new java.util.ArrayList[BaseDocument]()
    val docs_Update = new java.util.ArrayList[BaseDocument]()
    // collect() brings the aggregated rows back to the driver; a plain DataFrame.foreach
    // would run on the executors and never populate these driver-side ArrayLists.
    v_FQDN_DF.collect().foreach(row => {
      val fqdn = row.getAs[String]("FQDN_NAME")
      val v_Fqdn_First = row.getAs[Long]("FQDN_FIRST_FOUND_TIME")
      val v_Fqdn_Last = row.getAs[Long]("FQDN_LAST_FOUND_TIME")
      val v_Fqdn_Cnt = row.getAs[Long]("FQDN_COUNT_TOTAL")
      val doc = cursor_Map.getOrElse(fqdn, null)
      if (doc != null) {
        // Domain already exists: accumulate the count and refresh the last-seen time.
        val document: BaseDocument = doc
        val fqdn_Cnt = Option(document.getAttribute("FQDN_COUNT_TOTAL")).map(_.toString.toInt).getOrElse(0)
        document.addAttribute("FQDN_COUNT_TOTAL", fqdn_Cnt + v_Fqdn_Cnt)
        document.addAttribute("FQDN_LAST_FOUND_TIME", v_Fqdn_Last)
        docs_Update.add(document)
      } else {
        // New domain: create a fresh vertex document keyed by the FQDN.
        val baseDocument: BaseDocument = new BaseDocument()
        baseDocument.setKey(fqdn)
        baseDocument.addAttribute("FQDN_NAME", fqdn)
        baseDocument.addAttribute("FQDN_FIRST_FOUND_TIME", v_Fqdn_First)
        baseDocument.addAttribute("FQDN_LAST_FOUND_TIME", v_Fqdn_Last)
        baseDocument.addAttribute("FQDN_COUNT_TOTAL", v_Fqdn_Cnt)
        docs_Insert.add(baseDocument)
      }
    })
    // Try(v_FQDN_Coll.importDocuments(docs_Insert))
    // Try(v_FQDN_Coll.updateDocuments(docs_Update))
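    // A minimal sketch of the write-back step hinted at by the two commented calls
    // above, assuming the batches belong in the V_FQDN collection of the database
    // opened earlier; the collection wiring is an assumption, while importDocuments /
    // updateDocuments mirror the calls already referenced in this file.
    /*
    val v_FQDN_Coll = arangoDB.db(dbName).collection(collectionName)
    if (!docs_Insert.isEmpty) Try(v_FQDN_Coll.importDocuments(docs_Insert))
    if (!docs_Update.isEmpty) Try(v_FQDN_Coll.updateDocuments(docs_Update))
    */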
    /*
    val db = arangoDB.db("insert_iplearn_index")
    val coll = db.collection("E_ADDRESS_V_FQDN_TO_V_IP")
    val docs = new java.util.ArrayList[BaseEdgeDocument]
    val baseEdgeDocument2 = new BaseEdgeDocument
    baseEdgeDocument2.setKey("test_edge_2.com")
    baseEdgeDocument2.setFrom("V_FQDN/test_edge_2_from")
    baseEdgeDocument2.setTo("V_IP/test_edge_2_to")
    baseEdgeDocument2.addAttribute("e_add_test_str", "1Two3")
    baseEdgeDocument2.addAttribute("e_add_test_num", 4321)
    docs.add(baseEdgeDocument2)
    coll.importDocuments(docs)
    arangoDB.shutdown()
    */
    /*
    val uri: String = "remote:192.168.40.127/iplearning-insert"
    val pool = new OPartitionedDatabasePool(uri, "root", "111111", 5, 5)
    factory = new OrientGraphFactory(uri, "root", "111111", pool)
    val graph = factory.getNoTx
    val ip = "23.224.224.163"
    import scala.collection.JavaConversions._
    /*
    for (v: Vertex <- graph.getVertices("v_IP.IP", ip)) {
      val update_IP_Last = v.getProperty[Long]("LAST_FOUND_TIME")
      val update_IP_Cnt = v.getProperty[Long]("IP_APPEAR_COUNT")
      val sqlComm = new OCommandSQL(
        s"UPDATE v_IP SET LAST_FOUND_TIME = $update_IP_Last,FQDN_APPEAR_COUNT = 100 "
          + s"WHERE IP == '$ip'")
      Try(graph.command(sqlComm).execute())
      println("update ip:" + ip)
    }
    */
    val v_IP_Obj = graph.addVertex("class:v_IP", Nil: _*)
    v_IP_Obj.setProperty("IP", ip)
    v_IP_Obj.setProperty("IP_LOCATION", "fas")
    v_IP_Obj.setProperty("FIRST_FOUND_TIME", 1)
    v_IP_Obj.setProperty("LAST_FOUND_TIME", 1)
    v_IP_Obj.setProperty("IP_APPEAR_COUNT", 1)
    */
    /*
    val spark: SparkSession = SparkSession
      .builder()
      .appName("test")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.network.timeout", "300s")
      .config("spark.sql.shuffle.partitions", Config.SPARK_SQL_SHUFFLE_PARTITIONS)
      .config("spark.executor.memory", Config.SPARK_EXECUTOR_MEMORY)
      .config("arangodb.hosts", "192.168.40.127:8529")
      .config("arangodb.user", "root")
      .config("arangodb.password", "111111")
      .master(Config.MASTER)
      .getOrCreate()
    val value: ArangoRDD[BaseDocument] = ArangoSpark
      .load[BaseDocument](spark.sparkContext, "V_FQDN", ReadOptions("insert_iplearn_index"))
    // var stringToDocument: Map[String, BaseDocument] = Map[String,BaseDocument]()
    val lstBuffer: ListBuffer[(String, BaseDocument)] = ListBuffer[(String, BaseDocument)]()
    val map: Map[String, BaseDocument] = value.map(doc => (doc.getKey, doc)).collect().toMap
    println(map.size)
    spark.close()
    */
    /*
    arangoDB = new ArangoDB.Builder()
      .maxConnections(10)
      .host("192.168.40.127", 8529)
      .user("root")
      .password("111111")
      .build
    val db = arangoDB.db("insert_iplearn_index")
    // db.createCollection("V_FQDN")
    // db.createCollection("V_IP")
    // db.createCollection("E_ADDRESS_V_FQDN_TO_V_IP")
    // db.createCollection("E_VISIT_V_IP_TO_V_FQDN")
    val v_FQDN_Coll = db.collection("E_VISIT_V_IP_TO_V_FQDN")
    */
    // val coll: ArangoCollection = db.collection("V_FQDN")
    // val value = coll.getDocument("test1.com", classOf[BaseDocument])
    // val str = value.getAttribute("v_fqdn_test_str")
    // val num: Int = value.getAttribute("v_fqdn_test_num").toString.toInt
    // println(str + "-" + num)
    /*
    val docs = new java.util.ArrayList[BaseDocument]
    val baseDocument1 = new BaseDocument
    baseDocument1.setKey("test1.com")
    baseDocument1.addAttribute("v_fqdn_test_str", "one2three")
    baseDocument1.addAttribute("v_fqdn_test_num", 1234)
    docs.add(baseDocument1)
    val baseDocument2 = new BaseDocument
    baseDocument2.setKey("test2.com")
    baseDocument2.addAttribute("v_fqdn_test_str", "1Two3")
    baseDocument2.addAttribute("v_fqdn_test_num", 4321)
    docs.add(baseDocument2)
    coll.importDocuments(docs)
    */
    // arangoDB.shutdown()
  }
}