[GAL-504] Optimize File Chunk Combiner performance and functionality

houjinchuan
2024-02-29 19:03:07 +08:00
parent a8c3277eba
commit 644ca7f35c
14 changed files with 888 additions and 134 deletions


@@ -3,12 +3,12 @@ package com.zdjizhi;
 import com.zdjizhi.config.Configs;
 import com.zdjizhi.function.*;
 import com.zdjizhi.pojo.*;
+import com.zdjizhi.sink.HBaseSink;
 import com.zdjizhi.sink.HosSink;
 import com.zdjizhi.kafka.KafkaConsumer;
 import com.zdjizhi.trigger.LastChunkOrNoDataInTimeTrigger;
 import com.zdjizhi.trigger.MultipleTrigger;
 import org.apache.flink.api.common.eventtime.WatermarkStrategy;
-import org.apache.flink.api.common.functions.FilterFunction;
 import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
 import org.apache.flink.api.java.utils.ParameterTool;
 import org.apache.flink.configuration.Configuration;
@@ -24,9 +24,8 @@ import org.apache.flink.util.OutputTag;
 import java.time.Duration;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Objects;
 
-public class FileChunkCombiner extends KafkaConsumer {
+public class FileChunkCombiner {
 
     public static void main(String[] args) throws Exception {
         final ParameterTool parameterTool = ParameterTool.fromPropertiesFile(args[0]);
@@ -43,7 +42,7 @@ public class FileChunkCombiner extends KafkaConsumer {
                 .name("Kafka Source")
                 .map(new ParseMessagePackMapFunction())
                 .name("Map: Parse Message Pack")
-                .filter((FilterFunction<FileChunk>) Objects::nonNull)
+                .filter(new FileChunkFilter(configuration.getLong(Configs.FILE_MAX_SIZE), configuration.getString(Configs.FILTER_EXPRESSION)))
                 .assignTimestampsAndWatermarks(watermarkStrategy);
 
         OutputTag<FileChunk> delayedChunkOutputTag = new OutputTag<FileChunk>("delayed-chunk") {
@@ -63,14 +62,23 @@ public class FileChunkCombiner extends KafkaConsumer {
                 .setParallelism(configuration.get(Configs.COMBINER_WINDOW_PARALLELISM))
                 .disableChaining();
 
-        HosSink hosSink = new HosSink(configuration);
-        windowStream.addSink(hosSink)
-                .name("Hos")
-                .setParallelism(configuration.get(Configs.SINK_HOS_PARALLELISM));
-        windowStream.getSideOutput(delayedChunkOutputTag)
-                .map(new SideOutputMapFunction())
-                .addSink(hosSink)
-                .name("Hos Delayed Chunk");
+        if ("hos".equals(configuration.get(Configs.SINK_TYPE))) {
+            windowStream.addSink(new HosSink(configuration))
+                    .name("Hos")
+                    .setParallelism(configuration.get(Configs.SINK_PARALLELISM));
+            windowStream.getSideOutput(delayedChunkOutputTag)
+                    .map(new SideOutputMapFunction())
+                    .addSink(new HosSink(configuration))
+                    .name("Hos Delayed Chunk");
+        } else {
+            windowStream.addSink(new HBaseSink(configuration))
+                    .name("HBase")
+                    .setParallelism(configuration.get(Configs.SINK_PARALLELISM));
+            windowStream.getSideOutput(delayedChunkOutputTag)
+                    .map(new SideOutputMapFunction())
+                    .addSink(new HBaseSink(configuration))
+                    .name("HBase Delayed Chunk");
+        }
 
         environment.execute(configuration.get(Configs.FLINK_JOB_NAME));
     }
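
The FileChunkFilter used above replaces the bare Objects::nonNull filter and moves both the size limit and the record-level filter expression into configuration (Configs.FILE_MAX_SIZE, Configs.FILTER_EXPRESSION). Its implementation lives in one of the other changed files and is not shown in this diff; the following is only a minimal sketch of such a filter, assuming a hypothetical getFileSize() accessor on FileChunk and stubbing out the expression evaluation:

import org.apache.flink.api.common.functions.FilterFunction;

// Hypothetical sketch only; the real FileChunkFilter is part of this commit
// but its body is not shown in the hunks above.
public class FileChunkFilter implements FilterFunction<FileChunk> {

    private final long maxFileSize;
    private final String filterExpression;

    public FileChunkFilter(long maxFileSize, String filterExpression) {
        this.maxFileSize = maxFileSize;
        this.filterExpression = filterExpression;
    }

    @Override
    public boolean filter(FileChunk chunk) {
        // Covers the previous Objects::nonNull filter: drop null records first.
        if (chunk == null) {
            return false;
        }
        // Drop chunks whose file exceeds the configured size limit
        // (getFileSize() is an assumed accessor on the FileChunk POJO).
        if (maxFileSize > 0 && chunk.getFileSize() > maxFileSize) {
            return false;
        }
        // An empty expression keeps everything; otherwise delegate to whatever
        // expression engine the project uses (stubbed here).
        return filterExpression == null || filterExpression.isEmpty()
                || evaluate(filterExpression, chunk);
    }

    // Placeholder for the project's expression evaluation.
    private boolean evaluate(String expression, FileChunk chunk) {
        return true;
    }
}

Packing the null check, size limit, and expression check into a single FilterFunction keeps the job graph shape unchanged while letting operators tune filtering per deployment through the properties file passed as args[0].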