package cn.ac.iie

import org.apache.spark.sql.{DataFrame, SparkSession}

object HiveUnionTest {

  def main(args: Array[String]): Unit = {
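    // Local SparkSession with Kryo serialization; the commented-out block in
    // the builder chain below holds the settings for running against the
    // standalone cluster instead of local mode.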
    val spark: SparkSession = SparkSession
      .builder()
      .appName("test")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.network.timeout", "300s")
      .config("spark.sql.shuffle.partitions", 50)
      .master("local[*]")
      /*
      .config("spark.executor.memory", "30g")
      .config("spark.driver.host", "192.168.41.79")
      .config("spark.jars", "D:\\GITREPO\\ip-learning\\target\\ip-learning-1.0-SNAPSHOT-jar-with-dependencies.jar")
      .master("spark://192.168.40.119:7077")
      */
      .getOrCreate()
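    // ClickHouse-side UNION of source and destination IPs, kept for the
    // commented-out "dbtable" pushdown option below; currently unused.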
    val sql =
      """
        |(SELECT s1_s_ip AS IP, s1_s_location_region AS location, recv_time FROM av_miner.media_expire_patch LIMIT 100
        |UNION ALL
        |SELECT s1_d_ip AS IP, s1_d_location_region AS location, recv_time FROM av_miner.media_expire_patch LIMIT 100)
      """.stripMargin
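    // Pull a 100-row sample of the ClickHouse table over JDBC. Note that for
    // reads, "numPartitions" only takes effect together with partitionColumn/
    // lowerBound/upperBound, so this load comes back as a single partition.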
    val mediaDataFrame: DataFrame = spark.read.format("jdbc")
      .option("url", "jdbc:clickhouse://192.168.40.193:8123")
      .option("dbtable", "(select * from av_miner.media_expire_patch limit 100) as media_expire_patch")
      // .option("dbtable", "av_miner.media_expire_patch")
      // .option("dbtable", sql + " as a")
      .option("driver", "ru.yandex.clickhouse.ClickHouseDriver")
      .option("user", "default")
      .option("password", "111111")
      .option("numPartitions", "40")
      .option("fetchsize", "1000000")
      .load()
    mediaDataFrame.printSchema()
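    // Register the sample as a global temp view; it lives in the reserved
    // global_temp database, hence the global_temp.media_expire_patch prefix below.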
    mediaDataFrame.createOrReplaceGlobalTempView("media_expire_patch")
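    // Union source and destination IPs, then aggregate first/last seen time
    // and hit count per (IP, location). No action is ever called on this
    // DataFrame, so the query is analyzed but never actually executed.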
    val frame = spark.sql(
      """
        |SELECT IP, location, MIN(recv_time) AS FIRST_FOUND_TIME, MAX(recv_time) AS LAST_FOUND_TIME, COUNT(*) AS COUNT_TOTAL FROM (
        |  (SELECT s1_s_ip AS IP, s1_s_location_region AS location, recv_time FROM global_temp.media_expire_patch LIMIT 100)
        |  UNION ALL
        |  (SELECT s1_d_ip AS IP, s1_d_location_region AS location, recv_time FROM global_temp.media_expire_patch LIMIT 100)
        |) GROUP BY IP, location
      """.stripMargin)
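    // FQDN -> IP edge aggregation: first/last seen time and hit count per
    // (media_domain, media_type, destination IP), skipping blank values.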
    val e_Address_v_FQDN_to_v_IP_DF = spark.sql(
      """
        |SELECT
        |  media_domain AS V_FQDN,
        |  media_type,
        |  s1_d_ip AS V_IP,
        |  MIN(recv_time) AS FIRST_FOUND_TIME,
        |  MAX(recv_time) AS LAST_FOUND_TIME,
        |  COUNT(*) AS COUNT_TOTAL
        |FROM
        |  global_temp.media_expire_patch
        |WHERE
        |  (media_domain != '')
        |  AND (s1_d_ip != '')
        |GROUP BY
        |  s1_d_ip,
        |  media_domain,
        |  media_type
      """.stripMargin)
    e_Address_v_FQDN_to_v_IP_DF.printSchema()
    e_Address_v_FQDN_to_v_IP_DF.show(200)
    // mediaDataFrame.show(20)
  }

}