1.过滤异常数据 2.优化sink写入代码 3.优化clickhouse配置
This commit is contained in:
@@ -2,14 +2,14 @@ package com.zdjizhi.etl.connection;
|
||||
|
||||
import cn.hutool.core.convert.Convert;
|
||||
import com.alibaba.fastjson.util.TypeUtils;
|
||||
import com.arangodb.entity.BaseEdgeDocument;
|
||||
import com.zdjizhi.etl.LogService;
|
||||
import com.zdjizhi.etl.dns.SketchTimeMapFunction;
|
||||
import com.zdjizhi.utils.arangodb.AGSink;
|
||||
import com.zdjizhi.utils.kafka.KafkaConsumer;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.streaming.api.datastream.DataStream;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
|
||||
import java.time.Duration;
|
||||
@@ -36,16 +36,21 @@ public class ConnLogService {
|
||||
//写入ck通联relation表
|
||||
LogService.getLogCKSink(connTransformStream, SINK_CK_TABLE_RELATION_CONNECTION);
|
||||
} else {
|
||||
LogService.getLogKafkaSink(connSource, SINK_CK_TABLE_CONNECTION);
|
||||
LogService.getLogKafkaSink(sketchSource, SINK_CK_TABLE_SKETCH);
|
||||
LogService.getLogKafkaSink(connTransformStream, SINK_KAFKA_TOPIC_RELATION_CONNECTION);
|
||||
}
|
||||
|
||||
DataStream<Map<String, Object>> sketchTransformStream = getSketchTransformStream(sketchSource);
|
||||
if (SINK_ARANGODB_RAW_LOG_INSERT_OPEN == 1) {
|
||||
|
||||
//合并通联和通联sketch
|
||||
DataStream<Map<String, Object>> ip2ipGraph = getConnUnion(connTransformStream, sketchTransformStream);
|
||||
DataStream<Map<String, Object>> sketchTransformStream = getSketchTransformStream(sketchSource);
|
||||
|
||||
//写入arangodb
|
||||
LogService.getLogArangoSink(ip2ipGraph, R_VISIT_IP2IP);
|
||||
//合并通联和通联sketch
|
||||
DataStream<BaseEdgeDocument> ip2ipGraph = getConnUnion(connTransformStream, sketchTransformStream);
|
||||
|
||||
//写入arangodb
|
||||
ConnLogService.getLogArangoSink(ip2ipGraph, R_VISIT_IP2IP);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -59,21 +64,24 @@ public class ConnLogService {
|
||||
|
||||
String timeFilter = SOURCE_KAFKA_TOPIC_CONNECTION.equals(source) ? "conn_start_time" : "sketch_start_time";
|
||||
|
||||
SingleOutputStreamOperator<Map<String, Object>> filterStream = env.addSource(KafkaConsumer.myDeserializationConsumer(source))
|
||||
DataStream<Map<String, Object>> filterStream = env.addSource(KafkaConsumer.myDeserializationConsumer(source))
|
||||
.setParallelism(SOURCE_PARALLELISM)
|
||||
.filter(x -> {
|
||||
if (Objects.isNull(x) || Convert.toLong(x.get(timeFilter)) <= 0) {
|
||||
return false;
|
||||
}
|
||||
if (SOURCE_KAFKA_TOPIC_CONNECTION.equals(source)) {
|
||||
if (String.valueOf(x.get("total_cs_pkts")).length() >= AGGREGATE_MAX_VALUE_LENGTH || String.valueOf(x.get("total_sc_pkts")).length() >= AGGREGATE_MAX_VALUE_LENGTH ||
|
||||
String.valueOf(x.get("total_cs_bytes")).length() >= AGGREGATE_MAX_VALUE_LENGTH || String.valueOf(x.get("total_sc_bytes")).length() >= AGGREGATE_MAX_VALUE_LENGTH) {
|
||||
if (TypeUtils.castToLong(x.get("total_cs_pkts")) < 0 || TypeUtils.castToLong(x.get("total_cs_pkts")) == Long.MAX_VALUE
|
||||
|| TypeUtils.castToLong(x.get("total_sc_pkts")) < 0 || TypeUtils.castToLong(x.get("total_sc_pkts")) == Long.MAX_VALUE
|
||||
|| TypeUtils.castToLong(x.get("total_cs_bytes")) < 0 || TypeUtils.castToLong(x.get("total_cs_bytes")) == Long.MAX_VALUE
|
||||
|| TypeUtils.castToLong(x.get("total_sc_bytes")) < 0 || TypeUtils.castToLong(x.get("total_sc_bytes")) == Long.MAX_VALUE) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
} else if (SOURCE_KAFKA_TOPIC_SKETCH.equals(source)) {
|
||||
if (String.valueOf(x.get("sketch_sessions")).length() >= AGGREGATE_MAX_VALUE_LENGTH || String.valueOf(x.get("sketch_packets")).length() >= AGGREGATE_MAX_VALUE_LENGTH ||
|
||||
String.valueOf(x.get("sketch_bytes")).length() >= AGGREGATE_MAX_VALUE_LENGTH) {
|
||||
if (TypeUtils.castToLong(x.get("sketch_sessions")) < 0 || TypeUtils.castToLong(x.get("sketch_sessions")) == Long.MAX_VALUE
|
||||
|| TypeUtils.castToLong(x.get("sketch_packets")) < 0 || TypeUtils.castToLong(x.get("sketch_packets")) == Long.MAX_VALUE
|
||||
|| TypeUtils.castToLong(x.get("sketch_bytes")) < 0 || TypeUtils.castToLong(x.get("sketch_bytes")) == Long.MAX_VALUE) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@@ -96,8 +104,10 @@ public class ConnLogService {
|
||||
}))
|
||||
.setParallelism(TRANSFORM_PARALLELISM)
|
||||
.keyBy(new IpKeysSelector())
|
||||
.window(TumblingEventTimeWindows.of(Time.seconds(LOG_AGGREGATE_DURATION)))
|
||||
.window(TumblingProcessingTimeWindows.of(Time.seconds(LOG_AGGREGATE_DURATION)))
|
||||
.process(new ConnProcessFunction())
|
||||
.setParallelism(TRANSFORM_PARALLELISM)
|
||||
.filter(x -> Objects.nonNull(x) && TypeUtils.castToLong(x.get("sessions")) >= 0 && TypeUtils.castToLong(x.get("packets")) >= 0 && TypeUtils.castToLong(x.get("bytes")) >= 0)
|
||||
.setParallelism(TRANSFORM_PARALLELISM);
|
||||
return connTransformStream;
|
||||
}
|
||||
@@ -107,18 +117,28 @@ public class ConnLogService {
|
||||
.<Map<String, Object>>forBoundedOutOfOrderness(Duration.ofSeconds(FLINK_WATERMARK_MAX_DELAY_TIME))
|
||||
.withTimestampAssigner((event, timestamp) -> TypeUtils.castToLong(event.get("sketch_start_time")) * 1000))
|
||||
.keyBy(new IpKeysSelector())
|
||||
.window(TumblingEventTimeWindows.of(Time.seconds(LOG_AGGREGATE_DURATION)))
|
||||
.process(new SketchProcessFunction());
|
||||
.window(TumblingProcessingTimeWindows.of(Time.seconds(LOG_AGGREGATE_DURATION)))
|
||||
.process(new SketchProcessFunction())
|
||||
.setParallelism(TRANSFORM_PARALLELISM)
|
||||
.filter(x -> Objects.nonNull(x) && TypeUtils.castToLong(x.get("sessions")) >= 0 && TypeUtils.castToLong(x.get("packets")) >= 0 && TypeUtils.castToLong(x.get("bytes")) >= 0)
|
||||
.setParallelism(TRANSFORM_PARALLELISM);
|
||||
return sketchTransformStream;
|
||||
}
|
||||
|
||||
private static DataStream<Map<String, Object>> getConnUnion(DataStream<Map<String, Object>> connTransformStream, DataStream<Map<String, Object>> sketchTransformStream) throws Exception {
|
||||
DataStream<Map<String, Object>> ip2ipGraph = connTransformStream.union(sketchTransformStream)
|
||||
private static DataStream<BaseEdgeDocument> getConnUnion(DataStream<Map<String, Object>> connTransformStream, DataStream<Map<String, Object>> sketchTransformStream) throws Exception {
|
||||
DataStream<BaseEdgeDocument> ip2ipGraph = connTransformStream.union(sketchTransformStream)
|
||||
.keyBy(new IpKeysSelector())
|
||||
.window(TumblingEventTimeWindows.of(Time.seconds(LOG_AGGREGATE_DURATION_GRAPH)))
|
||||
.window(TumblingProcessingTimeWindows.of(Time.seconds(LOG_AGGREGATE_DURATION_GRAPH)))
|
||||
.process(new Ip2IpGraphProcessFunction())
|
||||
.setParallelism(TRANSFORM_PARALLELISM);
|
||||
return ip2ipGraph;
|
||||
}
|
||||
|
||||
public static void getLogArangoSink(DataStream<BaseEdgeDocument> sourceStream, String sink) throws Exception {
|
||||
sourceStream.addSink(new AGSink(sink))
|
||||
.setParallelism(SINK_PARALLELISM)
|
||||
.name(sink)
|
||||
.setParallelism(SINK_PARALLELISM);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user