自定义ArangoRDD

This commit is contained in:
wanglihui
2020-10-23 10:02:28 +08:00
parent db8e764e00
commit c211d99c2e
12 changed files with 825 additions and 1 deletions

View File

@@ -0,0 +1,40 @@
package cn.ac.iie.spark
import cn.ac.iie.spark.rdd.ReadOptions
import cn.ac.iie.utils.SparkSessionUtil
import com.arangodb.entity.BaseDocument
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
object RDDTest {

  /** Demo entry point for the custom ArangoRDD.
    *
    * Loads the "IP" collection from ArangoDB into an RDD of [[BaseDocument]],
    * keeps documents whose CLIENT_SESSION_COUNT exceeds 100, tags each kept
    * document, and prints the results before shutting the session down.
    */
  def main(args: Array[String]): Unit = {
    val sparkContext = SparkSessionUtil.spark.sparkContext
    // Sanity check: confirm the ArangoDB endpoint is present on the Spark conf.
    println(sparkContext.getConf.get("arangodb.hosts"))

    // Read options for the target database; the collection name is supplied
    // directly to load() below. (The previously unused ipOptions copy was removed.)
    // val options = ReadOptions("iplearn_media_domain").copy(collection = "R_LOCATE_FQDN2IP")
    val options = ReadOptions("ip-learning-test-0")

    val rdd = ArangoSpark.load[BaseDocument](sparkContext, "IP", options)
    println(rdd.count())
    println(rdd.getNumPartitions)

    // Keep documents whose CLIENT_SESSION_COUNT > 100 and tag them with "abc".
    // Guard against a missing attribute (getAttribute returns null) and against
    // the driver deserializing the number as Integer/Double instead of Long:
    // go through Number.longValue() rather than asInstanceOf[Long], which would
    // throw NPE / ClassCastException respectively.
    val value: RDD[BaseDocument] = rdd
      .filter { doc =>
        Option(doc.getAttribute("CLIENT_SESSION_COUNT")) match {
          case Some(n: Number) => n.longValue() > 100
          case _               => false
        }
      }
      .map { doc =>
        doc.addAttribute("abc", 1)
        doc
      }

    // value is traversed twice (foreach + count); cache it so the Arango read
    // and the filter/map pipeline execute only once.
    value.persist(StorageLevel.MEMORY_AND_DISK)
    value.foreach(row => println(row.toString))
    println(value.count())

    SparkSessionUtil.spark.close()
    // Explicit exit to stop any lingering non-daemon driver threads.
    System.exit(0)
  }
}