Optimize configuration

This commit is contained in:
houjinchuan
2024-07-08 10:07:07 +08:00
parent caf3c7ff84
commit 942acad964
14 changed files with 240 additions and 378 deletions

View File

@@ -6,7 +6,7 @@
<groupId>com.zdjizhi</groupId> <groupId>com.zdjizhi</groupId>
<artifactId>file-chunk-combiner</artifactId> <artifactId>file-chunk-combiner</artifactId>
<version>1.3.1</version> <version>1.3.2</version>
<repositories> <repositories>
<repository> <repository>

View File

@@ -6,31 +6,22 @@ import com.zdjizhi.function.*;
import com.zdjizhi.function.map.ParseMessagePackMapFunction; import com.zdjizhi.function.map.ParseMessagePackMapFunction;
import com.zdjizhi.function.map.ParseProxyFileMetaFlatMapFunction; import com.zdjizhi.function.map.ParseProxyFileMetaFlatMapFunction;
import com.zdjizhi.function.map.ParseSessionFileMetaFlatMapFunction; import com.zdjizhi.function.map.ParseSessionFileMetaFlatMapFunction;
import com.zdjizhi.function.map.SideOutputMapFunction;
import com.zdjizhi.kafka.FileMetaKafkaConsumer; import com.zdjizhi.kafka.FileMetaKafkaConsumer;
import com.zdjizhi.pojo.*; import com.zdjizhi.pojo.*;
import com.zdjizhi.sink.*; import com.zdjizhi.sink.*;
import com.zdjizhi.kafka.KafkaConsumer; import com.zdjizhi.kafka.KafkaConsumer;
import com.zdjizhi.trigger.IdleTimeTrigger;
import com.zdjizhi.trigger.LastChunkOrNoDataInTimeTrigger;
import com.zdjizhi.trigger.LastChunkTrigger; import com.zdjizhi.trigger.LastChunkTrigger;
import com.zdjizhi.trigger.MultipleTrigger; import com.zdjizhi.trigger.MultipleTrigger;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo; import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.java.utils.ParameterTool; import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.*; import org.apache.flink.streaming.api.datastream.*;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows; import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time; import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger; import org.apache.flink.streaming.api.windowing.triggers.*;
import org.apache.flink.streaming.api.windowing.triggers.ProcessingTimeTrigger;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow; import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.OutputTag;
import java.time.Duration;
import java.util.*; import java.util.*;
public class FileChunkCombiner { public class FileChunkCombiner {
@@ -41,41 +32,6 @@ public class FileChunkCombiner {
final StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment(); final StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
environment.getConfig().setGlobalJobParameters(configuration); environment.getConfig().setGlobalJobParameters(configuration);
SingleOutputStreamOperator<FileChunk> windowStream;
OutputTag<FileChunk> delayedChunkOutputTag = new OutputTag<FileChunk>("delayed-chunk") {
};
if (configuration.getInteger(Configs.COMBINER_WINDOW_TYPE) == 0) {
WatermarkStrategy<FileChunk> watermarkStrategy = WatermarkStrategy
.<FileChunk>forBoundedOutOfOrderness(Duration.ofSeconds(configuration.get(Configs.COMBINER_WINDOW_ALLOWED_LATENESS)))
.withTimestampAssigner((FileChunk, timestamp) -> FileChunk.getTimestamp() / 1000);
SingleOutputStreamOperator<FileChunk> parseMessagePackStream = environment
.addSource(KafkaConsumer.byteArrayConsumer(configuration))
.name(configuration.get(Configs.KAFKA_TOPIC))
.map(new ParseMessagePackMapFunction())
.name("Map: Parse Message Pack")
.filter(new FileChunkFilterFunction(configuration.getString(Configs.MAP_FILTER_EXPRESSION), "map_parse_message_pack"))
.name("Filter: Map")
.assignTimestampsAndWatermarks(watermarkStrategy);
List<Trigger<Object, TimeWindow>> triggers = new ArrayList<>();
triggers.add(EventTimeTrigger.create());
if (configuration.get(Configs.COMBINER_WINDOW_ENABLE_LAST_CHUNK_TRIGGER)) {
triggers.add(LastChunkOrNoDataInTimeTrigger.of(configuration.get(Configs.COMBINER_WINDOW_IDLE_TIME) * 1000));
} else {
triggers.add(IdleTimeTrigger.of(configuration.get(Configs.COMBINER_WINDOW_IDLE_TIME) * 1000));
}
Trigger<Object, TimeWindow> trigger = MultipleTrigger.of(triggers);
windowStream = parseMessagePackStream
.keyBy(new FileChunkKeySelector(), BasicTypeInfo.STRING_TYPE_INFO)
.window(TumblingEventTimeWindows.of(Time.seconds(configuration.get(Configs.COMBINER_WINDOW_TIME))))
.trigger(trigger)
.sideOutputLateData(delayedChunkOutputTag)
.allowedLateness(Time.seconds(configuration.get(Configs.COMBINER_WINDOW_ALLOWED_LATENESS)))
.process(new CombineChunkProcessWindowFunction(configuration.get(Configs.FILE_MAX_CHUNK_COUNT)))
.name("Window: Combine Chunk")
.setParallelism(configuration.get(Configs.COMBINER_WINDOW_PARALLELISM));
} else {
SingleOutputStreamOperator<FileChunk> parseMessagePackStream = environment SingleOutputStreamOperator<FileChunk> parseMessagePackStream = environment
.addSource(KafkaConsumer.byteArrayConsumer(configuration)) .addSource(KafkaConsumer.byteArrayConsumer(configuration))
.name(configuration.get(Configs.KAFKA_TOPIC)) .name(configuration.get(Configs.KAFKA_TOPIC))
@@ -90,19 +46,19 @@ public class FileChunkCombiner {
triggers.add(LastChunkTrigger.create()); triggers.add(LastChunkTrigger.create());
} }
Trigger<Object, TimeWindow> trigger = MultipleTrigger.of(triggers); Trigger<Object, TimeWindow> trigger = MultipleTrigger.of(triggers);
windowStream = parseMessagePackStream SingleOutputStreamOperator<FileChunk> windowStream = parseMessagePackStream
.keyBy(new FileChunkKeySelector(), BasicTypeInfo.STRING_TYPE_INFO) .keyBy(new FileChunkKeySelector(), BasicTypeInfo.STRING_TYPE_INFO)
.window(TumblingProcessingTimeWindows.of(Time.seconds(configuration.get(Configs.COMBINER_WINDOW_TIME)))) .window(TumblingProcessingTimeWindows.of(Time.seconds(configuration.get(Configs.COMBINER_WINDOW_SIZE))))
.trigger(trigger) .trigger(trigger)
.process(new CombineChunkProcessWindowFunction(configuration.get(Configs.FILE_MAX_CHUNK_COUNT))) .process(new CombineChunkProcessWindowFunction())
.name("Window: Combine Chunk") .name("Window: Combine Chunk")
.setParallelism(configuration.get(Configs.COMBINER_WINDOW_PARALLELISM)); .setParallelism(configuration.get(Configs.COMBINER_WINDOW_PARALLELISM));
}
SingleOutputStreamOperator<FileChunk> fileMetaSessionSingleOutputStreamOperator;
SingleOutputStreamOperator<FileChunk> fileMetaProxySingleOutputStreamOperator;
for (String sinkType : configuration.get(Configs.SINK_TYPE).split(",")) { for (String sinkType : configuration.get(Configs.SINK_TYPE).split(",")) {
switch (sinkType) { switch (sinkType) {
case "hos": case "hos":
DataStream<FileChunk> sideOutput = windowStream.getSideOutput(delayedChunkOutputTag);
if (StrUtil.isNotEmpty(configuration.getString(Configs.SINK_FILTER_EXPRESSION))) { if (StrUtil.isNotEmpty(configuration.getString(Configs.SINK_FILTER_EXPRESSION))) {
windowStream windowStream
.filter(new FileChunkFilterFunction(configuration.getString(Configs.SINK_FILTER_EXPRESSION), "sink_hos")) .filter(new FileChunkFilterFunction(configuration.getString(Configs.SINK_FILTER_EXPRESSION), "sink_hos"))
@@ -111,24 +67,14 @@ public class FileChunkCombiner {
.addSink(new HosSink(configuration)) .addSink(new HosSink(configuration))
.name("Hos") .name("Hos")
.setParallelism(configuration.get(Configs.SINK_PARALLELISM)); .setParallelism(configuration.get(Configs.SINK_PARALLELISM));
sideOutput = sideOutput
.filter(new FileChunkFilterFunction(configuration.getString(Configs.SINK_FILTER_EXPRESSION), "side_out_put_hos"))
.name("Filter: Delayed Chunk")
.setParallelism(configuration.get(Configs.SINK_PARALLELISM));
} else { } else {
windowStream windowStream
.addSink(new HosSink(configuration)) .addSink(new HosSink(configuration))
.name("Hos") .name("Hos")
.setParallelism(configuration.get(Configs.SINK_PARALLELISM)); .setParallelism(configuration.get(Configs.SINK_PARALLELISM));
} }
sideOutput.map(new SideOutputMapFunction())
.setParallelism(configuration.get(Configs.SINK_PARALLELISM))
.addSink(new HosSink(configuration))
.setParallelism(configuration.get(Configs.SINK_PARALLELISM))
.name("Delayed Chunk");
break; break;
case "hbase": case "hbase":
sideOutput = windowStream.getSideOutput(delayedChunkOutputTag);
if (StrUtil.isNotEmpty(configuration.getString(Configs.SINK_FILTER_EXPRESSION))) { if (StrUtil.isNotEmpty(configuration.getString(Configs.SINK_FILTER_EXPRESSION))) {
windowStream windowStream
.filter(new FileChunkFilterFunction(configuration.getString(Configs.SINK_FILTER_EXPRESSION), "sink_hbase")) .filter(new FileChunkFilterFunction(configuration.getString(Configs.SINK_FILTER_EXPRESSION), "sink_hbase"))
@@ -137,42 +83,26 @@ public class FileChunkCombiner {
.addSink(new HBaseSink(configuration)) .addSink(new HBaseSink(configuration))
.name("HBase") .name("HBase")
.setParallelism(configuration.get(Configs.SINK_PARALLELISM)); .setParallelism(configuration.get(Configs.SINK_PARALLELISM));
sideOutput = sideOutput
.filter(new FileChunkFilterFunction(configuration.getString(Configs.SINK_FILTER_EXPRESSION), "side_out_put_hbase"))
.name("Filter: Delayed Chunk")
.setParallelism(configuration.get(Configs.SINK_PARALLELISM));
} else { } else {
windowStream windowStream
.addSink(new HBaseSink(configuration)) .addSink(new HBaseSink(configuration))
.name("HBase") .name("HBase")
.setParallelism(configuration.get(Configs.SINK_PARALLELISM)); .setParallelism(configuration.get(Configs.SINK_PARALLELISM));
} }
sideOutput
.map(new SideOutputMapFunction())
.setParallelism(configuration.get(Configs.SINK_PARALLELISM))
.addSink(new HBaseSink(configuration))
.setParallelism(configuration.get(Configs.SINK_PARALLELISM))
.name("Delayed Chunk");
break; break;
case "oss": case "oss":
SingleOutputStreamOperator<FileChunk> fileMetaSessionSingleOutputStreamOperator = environment.addSource(FileMetaKafkaConsumer.stringConsumer(configuration, configuration.get(Configs.KAFKA_FILE_META_SESSION_TOPIC))) fileMetaSessionSingleOutputStreamOperator = environment.addSource(FileMetaKafkaConsumer.stringConsumer(configuration, configuration.get(Configs.KAFKA_FILE_META_SESSION_TOPIC)))
.setParallelism(configuration.get(Configs.MAP_PARSE_FILE_META_PARALLELISM))
.name(configuration.get(Configs.KAFKA_FILE_META_SESSION_TOPIC)) .name(configuration.get(Configs.KAFKA_FILE_META_SESSION_TOPIC))
.flatMap(new ParseSessionFileMetaFlatMapFunction()) .flatMap(new ParseSessionFileMetaFlatMapFunction())
.setParallelism(configuration.get(Configs.MAP_PARSE_FILE_META_PARALLELISM))
.name("Map: Parse Session File Meta") .name("Map: Parse Session File Meta")
.filter(new FileChunkFilterFunction(configuration.getString(Configs.FILE_META_FILTER_EXPRESSION), "map_parse_session_file_meta")) .filter(new FileChunkFilterFunction(configuration.getString(Configs.FILE_META_FILTER_EXPRESSION), "map_parse_session_file_meta"))
.name("Filter: Map") .name("Filter: Map");
.setParallelism(configuration.get(Configs.MAP_PARSE_FILE_META_PARALLELISM)); fileMetaProxySingleOutputStreamOperator = environment.addSource(FileMetaKafkaConsumer.stringConsumer(configuration, configuration.get(Configs.KAFKA_FILE_META_PROXY_TOPIC)))
SingleOutputStreamOperator<FileChunk> fileMetaProxySingleOutputStreamOperator = environment.addSource(FileMetaKafkaConsumer.stringConsumer(configuration, configuration.get(Configs.KAFKA_FILE_META_PROXY_TOPIC)))
.setParallelism(configuration.get(Configs.MAP_PARSE_FILE_META_PARALLELISM))
.name(configuration.get(Configs.KAFKA_FILE_META_PROXY_TOPIC)) .name(configuration.get(Configs.KAFKA_FILE_META_PROXY_TOPIC))
.flatMap(new ParseProxyFileMetaFlatMapFunction()) .flatMap(new ParseProxyFileMetaFlatMapFunction())
.setParallelism(configuration.get(Configs.MAP_PARSE_FILE_META_PARALLELISM))
.name("Map: Parse Proxy File Meta") .name("Map: Parse Proxy File Meta")
.filter(new FileChunkFilterFunction(configuration.getString(Configs.FILE_META_FILTER_EXPRESSION), "map_parse_proxy_file_meta")) .filter(new FileChunkFilterFunction(configuration.getString(Configs.FILE_META_FILTER_EXPRESSION), "map_parse_proxy_file_meta"))
.name("Filter: Map") .name("Filter: Map");
.setParallelism(configuration.get(Configs.MAP_PARSE_FILE_META_PARALLELISM));
windowStream windowStream
.filter(new FileChunkFilterFunction(configuration.getString(Configs.SINK_OSS_FILTER_EXPRESSION), "sink_oss")) .filter(new FileChunkFilterFunction(configuration.getString(Configs.SINK_OSS_FILTER_EXPRESSION), "sink_oss"))
.name("Filter: Oss") .name("Filter: Oss")

View File

@@ -43,25 +43,16 @@ public class Configs {
.stringType() .stringType()
.noDefaultValue(); .noDefaultValue();
public static final ConfigOption<Integer> MAP_PARSE_FILE_META_PARALLELISM = ConfigOptions.key("map.parse.file.meta.parallelism") public static final ConfigOption<String> MAP_FILTER_EXPRESSION = ConfigOptions.key("map.filter.expression")
.intType() .stringType()
.defaultValue(1); .defaultValue("");
public static final ConfigOption<Integer> COMBINER_WINDOW_TYPE = ConfigOptions.key("combiner.window.type")
.intType()
.defaultValue(0);
public static final ConfigOption<Integer> COMBINER_WINDOW_PARALLELISM = ConfigOptions.key("combiner.window.parallelism") public static final ConfigOption<Integer> COMBINER_WINDOW_PARALLELISM = ConfigOptions.key("combiner.window.parallelism")
.intType() .intType()
.defaultValue(1); .defaultValue(1);
public static final ConfigOption<Long> COMBINER_WINDOW_TIME = ConfigOptions.key("combiner.window.time") public static final ConfigOption<Long> COMBINER_WINDOW_SIZE = ConfigOptions.key("combiner.window.size")
.longType() .longType()
.defaultValue(5L); .defaultValue(10L);
public static final ConfigOption<Long> COMBINER_WINDOW_ALLOWED_LATENESS = ConfigOptions.key("combiner.window.allowed.lateness")
.longType()
.defaultValue(0L);
public static final ConfigOption<Long> COMBINER_WINDOW_IDLE_TIME = ConfigOptions.key("combiner.window.idle.time")
.longType()
.defaultValue(5L);
public static final ConfigOption<Boolean> COMBINER_WINDOW_ENABLE_LAST_CHUNK_TRIGGER = ConfigOptions.key("combiner.window.enable.last.chunk.trigger") public static final ConfigOption<Boolean> COMBINER_WINDOW_ENABLE_LAST_CHUNK_TRIGGER = ConfigOptions.key("combiner.window.enable.last.chunk.trigger")
.booleanType() .booleanType()
.defaultValue(true); .defaultValue(true);
@@ -75,18 +66,6 @@ public class Configs {
public static final ConfigOption<Boolean> SINK_ASYNC = ConfigOptions.key("sink.async") public static final ConfigOption<Boolean> SINK_ASYNC = ConfigOptions.key("sink.async")
.booleanType() .booleanType()
.defaultValue(false); .defaultValue(false);
public static final ConfigOption<Boolean> SINK_BATCH = ConfigOptions.key("sink.batch")
.booleanType()
.defaultValue(false);
public static final ConfigOption<Integer> SINK_BATCH_COUNT = ConfigOptions.key("sink.batch.count")
.intType()
.defaultValue(1);
public static final ConfigOption<Long> SINK_BATCH_SIZE = ConfigOptions.key("sink.batch.size")
.longType()
.defaultValue(Long.MAX_VALUE);
public static final ConfigOption<Integer> SINK_BATCH_TIME = ConfigOptions.key("sink.batch.time")
.intType()
.defaultValue(5);
public static final ConfigOption<String> SINK_FILTER_EXPRESSION = ConfigOptions.key("sink.filter.expression") public static final ConfigOption<String> SINK_FILTER_EXPRESSION = ConfigOptions.key("sink.filter.expression")
.stringType() .stringType()
.defaultValue(""); .defaultValue("");
@@ -106,43 +85,39 @@ public class Configs {
public static final ConfigOption<String> SINK_HOS_TOKEN = ConfigOptions.key("sink.hos.token") public static final ConfigOption<String> SINK_HOS_TOKEN = ConfigOptions.key("sink.hos.token")
.stringType() .stringType()
.defaultValue(""); .defaultValue("");
public static final ConfigOption<Long> SINK_HOS_BATCH_SIZE = ConfigOptions.key("sink.hos.batch.size")
.longType()
.defaultValue(0L);
public static final ConfigOption<Integer> SINK_HOS_BATCH_INTERVAL_MS = ConfigOptions.key("sink.hos.batch.interval.ms")
.intType()
.defaultValue(0);
public static final ConfigOption<Integer> SINK_HTTP_MAX_TOTAL = ConfigOptions.key("sink.http.max.total") public static final ConfigOption<Integer> SINK_HTTP_CLIENT_MAX_TOTAL = ConfigOptions.key("sink.http.client.max.total")
.intType() .intType()
.defaultValue(2000); .defaultValue(100);
public static final ConfigOption<Integer> SINK_HTTP_MAX_PER_ROUTE = ConfigOptions.key("sink.http.max.per.route") public static final ConfigOption<Integer> SINK_HTTP_CLIENT_MAX_PER_ROUTE = ConfigOptions.key("sink.http.client.max.per.route")
.intType() .intType()
.defaultValue(1000); .defaultValue(100);
public static final ConfigOption<Integer> SINK_HTTP_ERROR_RETRY = ConfigOptions.key("sink.http.error.retry") public static final ConfigOption<Integer> SINK_HTTP_CLIENT_RETRIES_NUMBER = ConfigOptions.key("sink.http.client.retries.number")
.intType() .intType()
.defaultValue(3); .defaultValue(3);
public static final ConfigOption<Integer> SINK_HTTP_CONNECT_TIMEOUT = ConfigOptions.key("sink.http.connect.timeout") public static final ConfigOption<Integer> SINK_HTTP_CLIENT_CONNECT_TIMEOUT_MS = ConfigOptions.key("sink.http.client.connect.timeout.ms")
.intType() .intType()
.defaultValue(10000); .defaultValue(10000);
public static final ConfigOption<Integer> SINK_HTTP_REQUEST_TIMEOUT = ConfigOptions.key("sink.http.request.timeout") public static final ConfigOption<Integer> SINK_HTTP_CLIENT_REQUEST_TIMEOUT_MS = ConfigOptions.key("sink.http.client.request.timeout.ms")
.intType() .intType()
.defaultValue(10000); .defaultValue(10000);
public static final ConfigOption<Integer> SINK_HTTP_SOCKET_TIMEOUT = ConfigOptions.key("sink.http.socket.timeout") public static final ConfigOption<Integer> SINK_HTTP_CLIENT_SOCKET_TIMEOUT_MS = ConfigOptions.key("sink.http.client.socket.timeout.ms")
.intType() .intType()
.defaultValue(60000); .defaultValue(60000);
public static final ConfigOption<Boolean> SINK_OSS_ASYNC = ConfigOptions.key("sink.oss.async")
.booleanType()
.defaultValue(false);
public static final ConfigOption<String> SINK_OSS_ENDPOINT = ConfigOptions.key("sink.oss.endpoint")
.stringType()
.noDefaultValue();
public static final ConfigOption<String> SINK_OSS_FILTER_EXPRESSION = ConfigOptions.key("sink.oss.filter.expression")
.stringType()
.defaultValue("");
public static final ConfigOption<String> SINK_HBASE_ZOOKEEPER = ConfigOptions.key("sink.hbase.zookeeper") public static final ConfigOption<String> SINK_HBASE_ZOOKEEPER = ConfigOptions.key("sink.hbase.zookeeper")
.stringType() .stringType()
.defaultValue(""); .defaultValue("127.0.0.1");
public static final ConfigOption<Integer> SINK_HBASE_RETRIES_NUMBER = ConfigOptions.key("sink.hbase.retries.number") public static final ConfigOption<Integer> SINK_HBASE_RETRIES_NUMBER = ConfigOptions.key("sink.hbase.retries.number")
.intType() .intType()
.defaultValue(10); .defaultValue(10);
public static final ConfigOption<Integer> SINK_HBASE_RPC_TIMEOUT = ConfigOptions.key("sink.hbase.rpc.timeout") public static final ConfigOption<Integer> SINK_HBASE_RPC_TIMEOUT_MS = ConfigOptions.key("sink.hbase.rpc.timeout.ms")
.intType() .intType()
.defaultValue(600000); .defaultValue(600000);
public static final ConfigOption<Integer> SINK_HBASE_CLIENT_WRITE_BUFFER = ConfigOptions.key("sink.hbase.client.write.buffer") public static final ConfigOption<Integer> SINK_HBASE_CLIENT_WRITE_BUFFER = ConfigOptions.key("sink.hbase.client.write.buffer")
@@ -150,18 +125,29 @@ public class Configs {
.defaultValue(10485760); .defaultValue(10485760);
public static final ConfigOption<Integer> SINK_HBASE_CLIENT_IPC_POOL_SIZE = ConfigOptions.key("sink.hbase.client.ipc.pool.size") public static final ConfigOption<Integer> SINK_HBASE_CLIENT_IPC_POOL_SIZE = ConfigOptions.key("sink.hbase.client.ipc.pool.size")
.intType() .intType()
.defaultValue(1); .defaultValue(3);
public static final ConfigOption<Long> SINK_HBASE_BATCH_SIZE = ConfigOptions.key("sink.hbase.batch.size")
public static final ConfigOption<Integer> FILE_MAX_CHUNK_COUNT = ConfigOptions.key("file.max.chunk.count") .longType()
.defaultValue(0L);
public static final ConfigOption<Integer> SINK_HBASE_BATCH_INTERVAL_MS = ConfigOptions.key("sink.hbase.batch.interval.ms")
.intType() .intType()
.defaultValue(100000); .defaultValue(0);
public static final ConfigOption<String> MAP_FILTER_EXPRESSION = ConfigOptions.key("map.filter.expression")
public static final ConfigOption<String> SINK_OSS_ENDPOINT = ConfigOptions.key("sink.oss.endpoint")
.stringType()
.noDefaultValue();
public static final ConfigOption<String> SINK_OSS_FILTER_EXPRESSION = ConfigOptions.key("sink.oss.filter.expression")
.stringType() .stringType()
.defaultValue(""); .defaultValue("");
public static final ConfigOption<Long> SINK_OSS_CACHE_TIME_MS = ConfigOptions.key("sink.oss.cache.time.ms")
.longType()
.defaultValue(0L);
public static final ConfigOption<Long> SINK_OSS_CACHE_SIZE = ConfigOptions.key("sink.oss.cache.size")
.longType()
.defaultValue(0L);
public static final ConfigOption<String> FILE_META_FILTER_EXPRESSION = ConfigOptions.key("file.meta.filter.expression") public static final ConfigOption<String> FILE_META_FILTER_EXPRESSION = ConfigOptions.key("file.meta.filter.expression")
.stringType() .stringType()
.defaultValue(""); .defaultValue("");
public static final ConfigOption<String> KAFKA_FILE_META_SESSION_TOPIC = ConfigOptions.key("source.kafka.file.meta.session.topic") public static final ConfigOption<String> KAFKA_FILE_META_SESSION_TOPIC = ConfigOptions.key("source.kafka.file.meta.session.topic")
.stringType() .stringType()
.noDefaultValue(); .noDefaultValue();
@@ -171,10 +157,4 @@ public class Configs {
public static final ConfigOption<String> KAFKA_FILE_META_GROUP_ID = ConfigOptions.key("source.kafka.file.meta.group.id") public static final ConfigOption<String> KAFKA_FILE_META_GROUP_ID = ConfigOptions.key("source.kafka.file.meta.group.id")
.stringType() .stringType()
.defaultValue("file_chunk_combine_1"); .defaultValue("file_chunk_combine_1");
public static final ConfigOption<Long> FILE_META_CACHE_TIME = ConfigOptions.key("file.meta.cache.time")
.longType()
.defaultValue(0L);
public static final ConfigOption<Long> FILE_META_CACHE_SIZE = ConfigOptions.key("file.meta.cache.size")
.longType()
.defaultValue(0L);
} }
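
All options above use Flink's ConfigOption builder. As a small self-contained sketch (the wrapper class and main method are illustrative), this is how one of the new batch options is declared and read back from a Configuration; a value of 0 for either batch option disables batching in the sinks below:

import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;
import org.apache.flink.configuration.Configuration;

public class BatchConfigExample {
    // Same builder pattern as Configs above: key, type, default value.
    public static final ConfigOption<Long> SINK_HBASE_BATCH_SIZE = ConfigOptions.key("sink.hbase.batch.size")
            .longType()
            .defaultValue(0L);

    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        configuration.setLong(SINK_HBASE_BATCH_SIZE, 1048576L);
        long batchSize = configuration.getLong(SINK_HBASE_BATCH_SIZE);
        System.out.println("sink.hbase.batch.size = " + batchSize);
    }
}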

View File

@@ -36,11 +36,6 @@ public class CombineChunkProcessWindowFunction extends ProcessWindowFunction<Fil
public transient Counter nullChunksCounter; public transient Counter nullChunksCounter;
public transient Counter nullTxtChunksCounter; public transient Counter nullTxtChunksCounter;
public transient Counter nullEmlChunksCounter; public transient Counter nullEmlChunksCounter;
private final int fileMaxChunkCount;
public CombineChunkProcessWindowFunction(int fileMaxChunkCount) {
this.fileMaxChunkCount = fileMaxChunkCount;
}
@Override @Override
public void open(Configuration parameters) throws Exception { public void open(Configuration parameters) throws Exception {
@@ -166,9 +161,6 @@ public class CombineChunkProcessWindowFunction extends ProcessWindowFunction<Fil
waitingToCombineChunkList.add(chunk); waitingToCombineChunkList.add(chunk);
timestampAndSizes.append(originalFileChunk.getTimestamp()).append("-").append(chunk.length).append(";"); timestampAndSizes.append(originalFileChunk.getTimestamp()).append("-").append(chunk.length).append(";");
} }
if (waitingToCombineChunkList.size() > fileMaxChunkCount) {
break;
}
} }
if (waitingToCombineChunkList.size() > 0) { if (waitingToCombineChunkList.size() > 0) {
FileChunk fileChunk = combineChunk(waitingToCombineChunkList, originalFileChunkList.get(0).getUuid(), originalFileChunkList.get(0).getFileName(), originalFileChunkList.get(0).getFileType(), 0, "append", 0, originalFileChunkList.get(0).getMeta(), startTimestamp, timestampAndSizes.toString()); FileChunk fileChunk = combineChunk(waitingToCombineChunkList, originalFileChunkList.get(0).getUuid(), originalFileChunkList.get(0).getFileName(), originalFileChunkList.get(0).getFileType(), 0, "append", 0, originalFileChunkList.get(0).getMeta(), startTimestamp, timestampAndSizes.toString());

View File

@@ -188,9 +188,8 @@ public class ParseMessagePackMapFunction extends RichMapFunction<byte[], FileChu
chunksInCounter.inc(); chunksInCounter.inc();
bytesInCounter.inc(messagePackData.length); bytesInCounter.inc(messagePackData.length);
FileChunk fileChunk; FileChunk fileChunk;
try { try (MessageUnpacker messageUnpacker = MessagePack.newDefaultUnpacker(messagePackData)) {
fileChunk = new FileChunk(); fileChunk = new FileChunk();
MessageUnpacker messageUnpacker = MessagePack.newDefaultUnpacker(messagePackData);
int numFields = messageUnpacker.unpackMapHeader(); int numFields = messageUnpacker.unpackMapHeader();
Map<String, Object> metaMap = new HashMap<>(); Map<String, Object> metaMap = new HashMap<>();
for (int i = 0; i < numFields; i++) { for (int i = 0; i < numFields; i++) {
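
The change above moves the MessageUnpacker into try-with-resources so it is closed even if parsing fails. A standalone sketch of that pattern with msgpack-java, mirroring the map-header/field loop shown here but assuming string values only for brevity:

import org.msgpack.core.MessagePack;
import org.msgpack.core.MessageUnpacker;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class MessagePackExample {
    // Unpack a map-typed MessagePack payload; the unpacker is closed automatically.
    public static Map<String, Object> unpack(byte[] messagePackData) throws IOException {
        Map<String, Object> metaMap = new HashMap<>();
        try (MessageUnpacker messageUnpacker = MessagePack.newDefaultUnpacker(messagePackData)) {
            int numFields = messageUnpacker.unpackMapHeader();
            for (int i = 0; i < numFields; i++) {
                String key = messageUnpacker.unpackString();
                // Simplification for this sketch: real chunks carry mixed value types.
                metaMap.put(key, messageUnpacker.unpackString());
            }
        }
        return metaMap;
    }
}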

View File

@@ -1,33 +0,0 @@
package com.zdjizhi.function.map;
import com.zdjizhi.pojo.FileChunk;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.metrics.Counter;
import org.apache.flink.metrics.MeterView;
import org.apache.flink.metrics.MetricGroup;
import static com.zdjizhi.utils.PublicConstants.COMBINE_MODE_APPEND;
public class SideOutputMapFunction extends RichMapFunction<FileChunk, FileChunk> {
public transient Counter delayedChunksCounter;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
MetricGroup metricGroup = getRuntimeContext().getMetricGroup().addGroup("file_chunk_combiner", "side_out_put");
delayedChunksCounter = metricGroup.counter("delayedChunksCount");
metricGroup.meter("numChunksDelayPerSecond", new MeterView(delayedChunksCounter));
}
@Override
public FileChunk map(FileChunk fileChunk) {
delayedChunksCounter.inc();
fileChunk.setChunkCount(1);
if (COMBINE_MODE_APPEND.equals(fileChunk.getCombineMode())) {
fileChunk.setChunkNumbers(fileChunk.getTimestamp() + "-" + fileChunk.getChunk().length + ";");
}
return fileChunk;
}
}

View File

@@ -75,8 +75,8 @@ public class HBaseSink extends RichSinkFunction<FileChunk> {
private List<Put> indexTimePutList; private List<Put> indexTimePutList;
private List<Put> indexFilenamePutList; private List<Put> indexFilenamePutList;
private long chunkSize; private long chunkSize;
private long maxBatchSize; private long batchSize;
private long maxBatchCount; private long batchInterval;
private ScheduledExecutorService executorService; private ScheduledExecutorService executorService;
private long rateLimitThreshold; private long rateLimitThreshold;
private String rateLimitExpression; private String rateLimitExpression;
@@ -162,24 +162,21 @@ public class HBaseSink extends RichSinkFunction<FileChunk> {
indexFilenameTable = syncHBaseConnection.getTable(TableName.valueOf("default:index_filename_" + configuration.get(Configs.SINK_HOS_BUCKET))); indexFilenameTable = syncHBaseConnection.getTable(TableName.valueOf("default:index_filename_" + configuration.get(Configs.SINK_HOS_BUCKET)));
} }
timestamp = System.currentTimeMillis(); timestamp = System.currentTimeMillis();
if (configuration.get(Configs.SINK_BATCH)) { batchSize = configuration.getLong(Configs.SINK_HBASE_BATCH_SIZE);
maxBatchSize = configuration.getLong(Configs.SINK_BATCH_SIZE); batchInterval = configuration.getInteger(Configs.SINK_HBASE_BATCH_INTERVAL_MS);
maxBatchCount = configuration.getInteger(Configs.SINK_BATCH_COUNT);
dataPutList = new ArrayList<>(); dataPutList = new ArrayList<>();
indexTimePutList = new ArrayList<>(); indexTimePutList = new ArrayList<>();
indexFilenamePutList = new ArrayList<>(); indexFilenamePutList = new ArrayList<>();
if (batchSize > 0 && batchInterval > 0) {
chunkSize = 0; chunkSize = 0;
executorService = Executors.newScheduledThreadPool(1); executorService = Executors.newScheduledThreadPool(1);
long period = configuration.getInteger(Configs.SINK_BATCH_TIME);
executorService.scheduleWithFixedDelay(() -> { executorService.scheduleWithFixedDelay(() -> {
if (System.currentTimeMillis() - timestamp > (period * 1000)) {
if (!dataPutList.isEmpty()) {
synchronized (this) { synchronized (this) {
sendBatchData(); if (!dataPutList.isEmpty()) {
sendDataToHbase();
} }
} }
} }, batchInterval, batchInterval, TimeUnit.MILLISECONDS);
}, period, period, TimeUnit.SECONDS);
} }
if (rateLimitThreshold > 0) { if (rateLimitThreshold > 0) {
rateLimitThreshold = configuration.getLong(Configs.SINK_RATE_LIMIT_THRESHOLD); rateLimitThreshold = configuration.getLong(Configs.SINK_RATE_LIMIT_THRESHOLD);
@@ -217,7 +214,6 @@ public class HBaseSink extends RichSinkFunction<FileChunk> {
sendFileChunk(fileChunk); sendFileChunk(fileChunk);
} }
} else { } else {
timestamp = currentTimeMillis;
sendFileChunk(fileChunk); sendFileChunk(fileChunk);
} }
} }
@@ -236,7 +232,7 @@ public class HBaseSink extends RichSinkFunction<FileChunk> {
} }
private void sendFileChunk(FileChunk fileChunk) { private void sendFileChunk(FileChunk fileChunk) {
if (COMBINE_MODE_SEEK.equals(fileChunk.getCombineMode()) && configuration.get(Configs.SINK_BATCH)) { if (COMBINE_MODE_SEEK.equals(fileChunk.getCombineMode())) {
byte[] data = "".getBytes(); byte[] data = "".getBytes();
if (fileChunk.getChunk() != null) { if (fileChunk.getChunk() != null) {
data = fileChunk.getChunk(); data = fileChunk.getChunk();
@@ -282,13 +278,17 @@ public class HBaseSink extends RichSinkFunction<FileChunk> {
chunksOutCounter.inc(); chunksOutCounter.inc();
bytesOutCounter.inc(chunkLength); bytesOutCounter.inc(chunkLength);
calculateFileChunkMetrics(fileChunk); calculateFileChunkMetrics(fileChunk);
if (chunkSize >= maxBatchSize || dataPutList.size() >= maxBatchCount) { if (batchSize > 0 && batchInterval > 0) {
sendBatchData(); if (chunkSize >= batchSize) {
sendDataToHbase();
}
} else {
sendDataToHbase();
} }
} }
} }
private void sendBatchData() { private void sendDataToHbase() {
if (isAsync) { if (isAsync) {
List<CompletableFuture<Object>> futures = asyncTable.batch(dataPutList); List<CompletableFuture<Object>> futures = asyncTable.batch(dataPutList);
CompletableFuture.supplyAsync(() -> { CompletableFuture.supplyAsync(() -> {
@@ -303,18 +303,27 @@ public class HBaseSink extends RichSinkFunction<FileChunk> {
return null; return null;
}); });
dataPutList.clear(); dataPutList.clear();
if (indexTimePutList.size() > 0) {
asyncIndexTimeTable.batch(indexTimePutList); asyncIndexTimeTable.batch(indexTimePutList);
indexTimePutList.clear(); indexTimePutList.clear();
}
if (indexFilenamePutList.size() > 0) {
asyncIndexFilenameTable.batch(indexFilenamePutList); asyncIndexFilenameTable.batch(indexFilenamePutList);
indexFilenamePutList.clear(); indexFilenamePutList.clear();
}
} else { } else {
try { try {
table.batch(dataPutList, null); table.batch(dataPutList, null);
if (indexTimePutList.size() > 0) {
indexTimeTable.batch(indexTimePutList, null); indexTimeTable.batch(indexTimePutList, null);
}
if (indexFilenamePutList.size() > 0) {
indexFilenameTable.batch(indexFilenamePutList, null); indexFilenameTable.batch(indexFilenamePutList, null);
}
} catch (IOException | InterruptedException e) { } catch (IOException | InterruptedException e) {
LOG.error("Put chunk to hbase error. ", e.getMessage()); LOG.error("Put chunk to hbase error. ", e.getMessage());
errorChunksCounter.inc(dataPutList.size()); errorChunksCounter.inc(dataPutList.size());
Thread.currentThread().interrupt();
} finally { } finally {
dataPutList.clear(); dataPutList.clear();
indexTimePutList.clear(); indexTimePutList.clear();
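
The sink now flushes either when the buffered bytes reach sink.hbase.batch.size or on the sink.hbase.batch.interval.ms timer, with both paths synchronized on the sink instance; setting either option to 0 keeps per-chunk writes. A stripped-down sketch of that size-or-interval buffering pattern, with a placeholder flush instead of the real HBase mutations:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class SizeOrIntervalBuffer {
    private final long batchSize;       // flush once this many bytes are buffered
    private final long batchIntervalMs; // flush at least this often
    private final List<byte[]> buffer = new ArrayList<>();
    private long bufferedBytes = 0;
    private final ScheduledExecutorService executorService = Executors.newScheduledThreadPool(1);

    public SizeOrIntervalBuffer(long batchSize, long batchIntervalMs) {
        this.batchSize = batchSize;
        this.batchIntervalMs = batchIntervalMs;
        if (batchSize > 0 && batchIntervalMs > 0) {
            // Interval path: flush whatever is buffered on a fixed delay.
            executorService.scheduleWithFixedDelay(() -> {
                synchronized (this) {
                    if (!buffer.isEmpty()) {
                        flush();
                    }
                }
            }, batchIntervalMs, batchIntervalMs, TimeUnit.MILLISECONDS);
        }
    }

    public synchronized void add(byte[] chunk) {
        buffer.add(chunk);
        bufferedBytes += chunk.length;
        if (batchSize > 0 && batchIntervalMs > 0) {
            // Size path: flush as soon as the threshold is reached.
            if (bufferedBytes >= batchSize) {
                flush();
            }
        } else {
            // Batching disabled: send every chunk immediately.
            flush();
        }
    }

    private void flush() {
        // Placeholder for the real sink call (e.g. a batched HBase Put).
        System.out.println("flushing " + buffer.size() + " chunks, " + bufferedBytes + " bytes");
        buffer.clear();
        bufferedBytes = 0;
    }
}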

View File

@@ -79,8 +79,8 @@ public class HosSink extends RichSinkFunction<FileChunk> {
private String objectsMeta; private String objectsMeta;
private String objectsOffset; private String objectsOffset;
private List<byte[]> byteList; private List<byte[]> byteList;
private long maxBatchSize; private long batchSize;
private long maxBatchCount; private long batchInterval;
private long chunkSize; private long chunkSize;
private ScheduledExecutorService executorService; private ScheduledExecutorService executorService;
private long rateLimitThreshold; private long rateLimitThreshold;
@@ -171,26 +171,23 @@ public class HosSink extends RichSinkFunction<FileChunk> {
syncHttpClient = HttpClientUtil.getInstance(configuration).getSyncHttpClient(); syncHttpClient = HttpClientUtil.getInstance(configuration).getSyncHttpClient();
} }
timestamp = System.currentTimeMillis(); timestamp = System.currentTimeMillis();
if (configuration.get(Configs.SINK_BATCH)) { batchSize = configuration.getLong(Configs.SINK_HOS_BATCH_SIZE);
batchInterval = configuration.getInteger(Configs.SINK_HOS_BATCH_INTERVAL_MS);
if (batchSize > 0 && batchInterval > 0) {
bathPutUrl = URLUtil.normalize(endpoint + "/hos/" + configuration.get(Configs.SINK_HOS_BUCKET) + "/" + PublicUtil.getUUID()) + "?multiFile"; bathPutUrl = URLUtil.normalize(endpoint + "/hos/" + configuration.get(Configs.SINK_HOS_BUCKET) + "/" + PublicUtil.getUUID()) + "?multiFile";
maxBatchSize = configuration.getLong(Configs.SINK_BATCH_SIZE);
maxBatchCount = configuration.getInteger(Configs.SINK_BATCH_COUNT);
hosMessage = new HashMap<>(); hosMessage = new HashMap<>();
byteList = new ArrayList<>(); byteList = new ArrayList<>();
objectsMeta = ""; objectsMeta = "";
objectsOffset = ""; objectsOffset = "";
chunkSize = 0; chunkSize = 0;
executorService = Executors.newScheduledThreadPool(1); executorService = Executors.newScheduledThreadPool(1);
long period = configuration.getInteger(Configs.SINK_BATCH_TIME);
executorService.scheduleWithFixedDelay(() -> { executorService.scheduleWithFixedDelay(() -> {
if (System.currentTimeMillis() - timestamp > (period * 1000)) {
if (!byteList.isEmpty()) {
synchronized (this) { synchronized (this) {
if (!byteList.isEmpty()) {
sendBatchData(); sendBatchData();
} }
} }
} }, batchInterval, batchInterval, TimeUnit.MILLISECONDS);
}, period, period, TimeUnit.SECONDS);
} }
if (rateLimitThreshold > 0) { if (rateLimitThreshold > 0) {
rateLimitThreshold = configuration.getLong(Configs.SINK_RATE_LIMIT_THRESHOLD); rateLimitThreshold = configuration.getLong(Configs.SINK_RATE_LIMIT_THRESHOLD);
@@ -228,7 +225,6 @@ public class HosSink extends RichSinkFunction<FileChunk> {
sendFileChunk(fileChunk); sendFileChunk(fileChunk);
} }
} else { } else {
timestamp = currentTimeMillis;
sendFileChunk(fileChunk); sendFileChunk(fileChunk);
} }
} }
@@ -250,7 +246,7 @@ public class HosSink extends RichSinkFunction<FileChunk> {
data = fileChunk.getChunk(); data = fileChunk.getChunk();
} }
long chunkLength = data.length; long chunkLength = data.length;
if (configuration.get(Configs.SINK_BATCH)) { if (batchSize > 0 && batchInterval > 0) {
hosMessage.put(HOS_META_FILE_TYPE, fileChunk.getFileType()); hosMessage.put(HOS_META_FILE_TYPE, fileChunk.getFileType());
hosMessage.put(HOS_META_FILENAME, fileChunk.getUuid()); hosMessage.put(HOS_META_FILENAME, fileChunk.getUuid());
if (COMBINE_MODE_SEEK.equals(fileChunk.getCombineMode())) { if (COMBINE_MODE_SEEK.equals(fileChunk.getCombineMode())) {
@@ -275,7 +271,7 @@ public class HosSink extends RichSinkFunction<FileChunk> {
chunksOutCounter.inc(); chunksOutCounter.inc();
bytesOutCounter.inc(chunkLength); bytesOutCounter.inc(chunkLength);
calculateFileChunkMetrics(fileChunk); calculateFileChunkMetrics(fileChunk);
if (chunkSize >= maxBatchSize || byteList.size() >= maxBatchCount) { if (chunkSize >= batchSize) {
sendBatchData(); sendBatchData();
} }
} else { } else {

View File

@@ -84,14 +84,14 @@ public class OssSinkByCaffeineCache extends RichSinkFunction<FileChunk> {
super.open(parameters); super.open(parameters);
MetricGroup metricGroup = getRuntimeContext().getMetricGroup().addGroup("file_chunk_combiner", "sink_oss"); MetricGroup metricGroup = getRuntimeContext().getMetricGroup().addGroup("file_chunk_combiner", "sink_oss");
endpointList = Arrays.asList(configuration.get(Configs.SINK_OSS_ENDPOINT).split(",")); endpointList = Arrays.asList(configuration.get(Configs.SINK_OSS_ENDPOINT).split(","));
isAsync = configuration.getBoolean(Configs.SINK_OSS_ASYNC); isAsync = configuration.getBoolean(Configs.SINK_ASYNC);
if (isAsync) { if (isAsync) {
asyncHttpClient = HttpClientUtil.getInstance(configuration).getAsyncHttpClient(); asyncHttpClient = HttpClientUtil.getInstance(configuration).getAsyncHttpClient();
asyncHttpClient.start(); asyncHttpClient.start();
} else { } else {
syncHttpClient = HttpClientUtil.getInstance(configuration).getSyncHttpClient(); syncHttpClient = HttpClientUtil.getInstance(configuration).getSyncHttpClient();
} }
cache = CaffeineCacheUtil.getInstance(configuration).getCaffeineCache(); cache = CaffeineCacheUtil.getInstance(configuration.getLong(Configs.SINK_OSS_CACHE_SIZE), configuration.getLong(Configs.SINK_OSS_CACHE_TIME_MS)).getCaffeineCache();
metricGroup.gauge("cacheLength", (Gauge<Long>) () -> cache.estimatedSize()); metricGroup.gauge("cacheLength", (Gauge<Long>) () -> cache.estimatedSize());
lessThan1KBChunksCounter = metricGroup.counter("lessThan1KBChunksCount"); lessThan1KBChunksCounter = metricGroup.counter("lessThan1KBChunksCount");
between1KBAnd5KBChunksCounter = metricGroup.counter("between1KBAnd5KBChunksCount"); between1KBAnd5KBChunksCounter = metricGroup.counter("between1KBAnd5KBChunksCount");

View File

@@ -3,9 +3,7 @@ package com.zdjizhi.utils;
import com.github.benmanes.caffeine.cache.Cache; import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine; import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.Weigher; import com.github.benmanes.caffeine.cache.Weigher;
import com.zdjizhi.config.Configs;
import com.zdjizhi.pojo.FileChunk; import com.zdjizhi.pojo.FileChunk;
import org.apache.flink.configuration.Configuration;
import org.checkerframework.checker.index.qual.NonNegative; import org.checkerframework.checker.index.qual.NonNegative;
import org.checkerframework.checker.nullness.qual.NonNull; import org.checkerframework.checker.nullness.qual.NonNull;
@@ -15,23 +13,23 @@ public class CaffeineCacheUtil {
private static CaffeineCacheUtil caffeineCacheUtil = null; private static CaffeineCacheUtil caffeineCacheUtil = null;
private static Cache<String, FileChunk> caffeineCache = null; private static Cache<String, FileChunk> caffeineCache = null;
private CaffeineCacheUtil(Configuration configuration) { private CaffeineCacheUtil(Long cacheSize, Long cacheTimeMs) {
caffeineCache = Caffeine.newBuilder() caffeineCache = Caffeine.newBuilder()
// .initialCapacity(configuration.getLong(Configs.FILE_META_CACHE_SIZE)) // .initialCapacity(configuration.getLong(Configs.SINK_OSS_CACHE_SIZE))
.maximumWeight(configuration.getLong(Configs.FILE_META_CACHE_SIZE)) .maximumWeight(cacheSize)
.weigher(new Weigher<String, FileChunk>() { .weigher(new Weigher<String, FileChunk>() {
@Override @Override
public @NonNegative int weigh(@NonNull String key, @NonNull FileChunk value) { public @NonNegative int weigh(@NonNull String key, @NonNull FileChunk value) {
return (int) value.getLength(); return (int) value.getLength();
} }
}) })
.expireAfterWrite(configuration.get(Configs.FILE_META_CACHE_TIME), TimeUnit.SECONDS) .expireAfterWrite(cacheTimeMs, TimeUnit.MILLISECONDS)
.build(); .build();
} }
public static synchronized CaffeineCacheUtil getInstance(Configuration configuration) { public static synchronized CaffeineCacheUtil getInstance(Long cacheSize, Long cacheTimeMs) {
if (null == caffeineCacheUtil) { if (null == caffeineCacheUtil) {
caffeineCacheUtil = new CaffeineCacheUtil(configuration); caffeineCacheUtil = new CaffeineCacheUtil(cacheSize, cacheTimeMs);
} }
return caffeineCacheUtil; return caffeineCacheUtil;
} }
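
CaffeineCacheUtil now takes the cache size and TTL directly instead of a full Configuration. A minimal sketch of the same Caffeine setup outside the singleton wrapper, with example values standing in for sink.oss.cache.size and sink.oss.cache.time.ms:

import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import java.util.concurrent.TimeUnit;

public class CaffeineCacheExample {
    public static void main(String[] args) {
        long cacheSize = 1073741824L; // maximum total weight in bytes
        long cacheTimeMs = 30000L;    // expire entries this long after write

        Cache<String, byte[]> cache = Caffeine.newBuilder()
                .maximumWeight(cacheSize)
                // Weigh each entry by its payload length, as the FileChunk weigher above does.
                .weigher((String key, byte[] value) -> value.length)
                .expireAfterWrite(cacheTimeMs, TimeUnit.MILLISECONDS)
                .build();

        cache.put("uuid-1", new byte[1024]);
        System.out.println("estimated size: " + cache.estimatedSize());
    }
}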

View File

@@ -22,8 +22,8 @@ public class HBaseConnectionUtil {
hbaseConfiguration.set(HConstants.ZOOKEEPER_QUORUM, configuration.getString(Configs.SINK_HBASE_ZOOKEEPER)); hbaseConfiguration.set(HConstants.ZOOKEEPER_QUORUM, configuration.getString(Configs.SINK_HBASE_ZOOKEEPER));
hbaseConfiguration.set(HConstants.CLIENT_ZOOKEEPER_CLIENT_PORT, "2181"); hbaseConfiguration.set(HConstants.CLIENT_ZOOKEEPER_CLIENT_PORT, "2181");
hbaseConfiguration.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/hbase"); hbaseConfiguration.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/hbase");
hbaseConfiguration.set(HConstants.BULKLOAD_MAX_RETRIES_NUMBER, configuration.get(Configs.SINK_HBASE_RETRIES_NUMBER) + ""); hbaseConfiguration.set(HConstants.HBASE_CLIENT_RETRIES_NUMBER, configuration.get(Configs.SINK_HBASE_RETRIES_NUMBER) + "");
hbaseConfiguration.set(HConstants.HBASE_RPC_TIMEOUT_KEY, configuration.get(Configs.SINK_HBASE_RPC_TIMEOUT) + ""); hbaseConfiguration.set(HConstants.HBASE_RPC_TIMEOUT_KEY, configuration.get(Configs.SINK_HBASE_RPC_TIMEOUT_MS) + "");
hbaseConfiguration.set(ConnectionConfiguration.MAX_KEYVALUE_SIZE_KEY, "1073741800"); hbaseConfiguration.set(ConnectionConfiguration.MAX_KEYVALUE_SIZE_KEY, "1073741800");
hbaseConfiguration.set(ConnectionConfiguration.WRITE_BUFFER_SIZE_KEY, configuration.get(Configs.SINK_HBASE_CLIENT_WRITE_BUFFER) + ""); hbaseConfiguration.set(ConnectionConfiguration.WRITE_BUFFER_SIZE_KEY, configuration.get(Configs.SINK_HBASE_CLIENT_WRITE_BUFFER) + "");
hbaseConfiguration.set(HConstants.HBASE_CLIENT_IPC_POOL_SIZE, configuration.get(Configs.SINK_HBASE_CLIENT_IPC_POOL_SIZE) + ""); hbaseConfiguration.set(HConstants.HBASE_CLIENT_IPC_POOL_SIZE, configuration.get(Configs.SINK_HBASE_CLIENT_IPC_POOL_SIZE) + "");
@@ -51,6 +51,7 @@ public class HBaseConnectionUtil {
try { try {
asyncHBaseConnection = ConnectionFactory.createAsyncConnection(hbaseConfiguration).get(); asyncHBaseConnection = ConnectionFactory.createAsyncConnection(hbaseConfiguration).get();
} catch (ExecutionException | InterruptedException e) { } catch (ExecutionException | InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException(e.getMessage()); throw new RuntimeException(e.getMessage());
} }
return asyncHBaseConnection; return asyncHBaseConnection;

View File

@@ -44,15 +44,15 @@ public class HttpClientUtil {
private RequestConfig getRequestConfig() { private RequestConfig getRequestConfig() {
return RequestConfig.custom() return RequestConfig.custom()
.setConnectTimeout(configuration.get(Configs.SINK_HTTP_CONNECT_TIMEOUT)) .setConnectTimeout(configuration.get(Configs.SINK_HTTP_CLIENT_CONNECT_TIMEOUT_MS))
.setConnectionRequestTimeout(configuration.get(Configs.SINK_HTTP_REQUEST_TIMEOUT)) .setConnectionRequestTimeout(configuration.get(Configs.SINK_HTTP_CLIENT_REQUEST_TIMEOUT_MS))
.setSocketTimeout(configuration.get(Configs.SINK_HTTP_SOCKET_TIMEOUT)) .setSocketTimeout(configuration.get(Configs.SINK_HTTP_CLIENT_SOCKET_TIMEOUT_MS))
.build(); .build();
} }
private HttpRequestRetryHandler getRetryHandler() { private HttpRequestRetryHandler getRetryHandler() {
return (exception, executionCount, context) -> { return (exception, executionCount, context) -> {
if (executionCount >= configuration.get(Configs.SINK_HTTP_ERROR_RETRY)) { if (executionCount >= configuration.get(Configs.SINK_HTTP_CLIENT_RETRIES_NUMBER)) {
return false; return false;
} }
if (exception instanceof NoHttpResponseException) {// retry if the server dropped the connection
@@ -106,9 +106,9 @@ public class HttpClientUtil {
// create the ConnectionManager and apply the connection configuration // create the ConnectionManager and apply the connection configuration
connManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry); connManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
// set the maximum total number of connections // set the maximum total number of connections
connManager.setMaxTotal(configuration.get(Configs.SINK_HTTP_MAX_TOTAL)); connManager.setMaxTotal(configuration.get(Configs.SINK_HTTP_CLIENT_MAX_TOTAL));
// set the maximum number of connections per route // set the maximum number of connections per route
connManager.setDefaultMaxPerRoute(configuration.get(Configs.SINK_HTTP_MAX_PER_ROUTE)); connManager.setDefaultMaxPerRoute(configuration.get(Configs.SINK_HTTP_CLIENT_MAX_PER_ROUTE));
} catch (KeyManagementException | NoSuchAlgorithmException e) { } catch (KeyManagementException | NoSuchAlgorithmException e) {
throw new RuntimeException(e.getMessage()); throw new RuntimeException(e.getMessage());
} }
@@ -146,8 +146,8 @@ public class HttpClientUtil {
.build(); .build();
ConnectingIOReactor ioReactor = new DefaultConnectingIOReactor(ioReactorConfig); ConnectingIOReactor ioReactor = new DefaultConnectingIOReactor(ioReactorConfig);
connManager = new PoolingNHttpClientConnectionManager(ioReactor); connManager = new PoolingNHttpClientConnectionManager(ioReactor);
connManager.setMaxTotal(configuration.get(Configs.SINK_HTTP_MAX_TOTAL)); connManager.setMaxTotal(configuration.get(Configs.SINK_HTTP_CLIENT_MAX_TOTAL));
connManager.setDefaultMaxPerRoute(configuration.get(Configs.SINK_HTTP_MAX_PER_ROUTE)); connManager.setDefaultMaxPerRoute(configuration.get(Configs.SINK_HTTP_CLIENT_MAX_PER_ROUTE));
} catch (IOReactorException e) { } catch (IOReactorException e) {
throw new RuntimeException(e.getMessage()); throw new RuntimeException(e.getMessage());
} }
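
The renamed sink.http.client.* options map straight onto Apache HttpClient's RequestConfig and pooling connection manager. A minimal sketch with the new defaults hard-coded, leaving out the SSL socket factory and retry handler used above:

import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

public class HttpClientExample {
    public static CloseableHttpClient build() {
        // sink.http.client.connect.timeout.ms / request.timeout.ms / socket.timeout.ms
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(10000)
                .setConnectionRequestTimeout(10000)
                .setSocketTimeout(60000)
                .build();

        // sink.http.client.max.total / sink.http.client.max.per.route
        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager();
        connManager.setMaxTotal(100);
        connManager.setDefaultMaxPerRoute(100);

        return HttpClients.custom()
                .setConnectionManager(connManager)
                .setDefaultRequestConfig(requestConfig)
                .build();
    }
}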

View File

@@ -1,70 +1,76 @@
flink.job.name=agg_traffic_file_chunk_combine flink.job.name=agg_traffic_file_chunk_combine
#9092 is unauthenticated, 9095 is ssl, 9094 is sasl
source.kafka.broker=192.168.41.29:9092 #kafka source settings
#9092 is unauthenticated, 9095 is ssl, 9094 is sasl
source.kafka.broker=192.168.44.12:9092
source.kafka.topic=TRAFFIC-FILE-STREAM-RECORD source.kafka.topic=TRAFFIC-FILE-STREAM-RECORD
source.kafka.group.id=test1 source.kafka.group.id=test
#earliest starts from the beginning, latest from the newest offset #earliest starts from the beginning, latest from the newest offset
source.kafka.auto.offset.reset=earliest source.kafka.auto.offset.reset=latest
source.kafka.session.timeout.ms=60000 source.kafka.session.timeout.ms=60000
#maximum number of records fetched from a partition per poll #maximum number of records fetched from a partition per poll
source.kafka.max.poll.records=1000 source.kafka.max.poll.records=1000
#maximum number of bytes the consumer fetches from a single partition at once #maximum number of bytes the consumer fetches from a single partition at once
source.kafka.max.partition.fetch.bytes=31457280 source.kafka.max.partition.fetch.bytes=31457280
source.kafka.enable.auto.commit=true source.kafka.enable.auto.commit=true
#kafka SASL username #kafka SASL username
source.kafka.user=admin source.kafka.user=admin
#kafka SASL/SSL password #kafka SASL/SSL password
source.kafka.pin=galaxy2019 source.kafka.pin=galaxy2019
#required for SSL #required for SSL
source.kafka.tools.library=D:\\K18-Phase2\\tsgSpace\\dat\\tsg\\ source.kafka.tools.library=/opt/tsg/olap/topology/data/
source.kafka.file.meta.session.topic=SESSION-RECORD
source.kafka.file.meta.proxy.topic=PROXY-EVENT map.filter.expression=FileChunk.offset <= 1073741824
source.kafka.file.meta.group.id=file_chunk_combine_1
map.filter.expression=FileChunk.fileType == "eml" || (FileChunk.offset <= 10737 && FileChunk.fileType != "eml") #window settings
map.parse.file.meta.parallelism=1
#window settings
combiner.window.type=0
combiner.window.parallelism=1 combiner.window.parallelism=1
combiner.window.time=10 #window size, in seconds
combiner.window.allowed.lateness=10 combiner.window.size=30
#fire the window if no data arrives within this many seconds #combiner.window.enable.last.chunk.trigger=true
combiner.window.idle.time=10
combiner.window.enable.last.chunk.trigger=true #sink settings
file.max.chunk.count=100000
file.meta.cache.time=30
file.meta.cache.size=1073741824
#file.meta.filter.expression=(FileChunk.meta.sled_ip == "172.18.10.168" && FileChunk.meta.fileId.contains("_9")) || (FileChunk.meta.sled_ip == "172.18.10.168" && FileChunk.meta.duration_ms < 60 && FileChunk.meta.contentLength < 1048576)
sink.parallelism=1 sink.parallelism=1
#options: hos, oss, hbase #options: hos, oss, hbase
sink.type=hos sink.type=hos
sink.async=false sink.async=true
sink.batch=true
sink.batch.count=1000
sink.batch.size=1048576
sink.batch.time=10
#sink.filter.expression= #sink.filter.expression=
#rate limiting settings
#sink.rate.limit.threshold=0 #sink.rate.limit.threshold=0
#sink.rate.limit.exclusion.expression=FileChunk.fileType == "eml" #sink.rate.limit.exclusion.expression=FileChunk.fileType == "eml"
#hos sink settings
#use ip:port for nginx or a single hos, ip1:port,ip2:port... for multiple hos instances #hos sink settings
sink.hos.endpoint=192.168.41.29:8186 #use ip:port for nginx or a single hos, ip1:port,ip2:port... for multiple hos instances
sink.hos.endpoint=192.168.44.12:8186
sink.hos.bucket=traffic_file_bucket sink.hos.bucket=traffic_file_bucket
sink.hos.token=c21f969b5f03d33d43e04f8f136e7682 sink.hos.token=c21f969b5f03d33d43e04f8f136e7682
#oss sink settings #sink.hos.batch.size=1048576
#multiple addresses allowed, ip1:port,ip2:port... #sink.hos.batch.interval.ms=10000
sink.oss.endpoint=192.168.41.29:8186
#sink.oss.filter.expression=FileChunk.offset == 0 && FileChunk.lastChunkFlag == 1 #http settings
sink.oss.async=false sink.http.client.retries.number=3
#http settings sink.http.client.max.total=100
sink.http.error.retry=3 sink.http.client.max.per.route=100
sink.http.max.total=10 sink.http.client.connect.timeout.ms=10000
sink.http.max.per.route=10 sink.http.client.request.timeout.ms=10000
sink.http.connect.timeout=1000 sink.http.client.socket.timeout.ms=60000
sink.http.request.timeout=5000
sink.http.socket.timeout=60000 #hbase sink settings
#hbase sink settings sink.hbase.zookeeper=192.168.44.12
sink.hbase.zookeeper=192.168.41.29
sink.hbase.retries.number=10 sink.hbase.retries.number=10
sink.hbase.rpc.timeout=600000 sink.hbase.rpc.timeout.ms=600000
sink.hbase.client.write.buffer=10971520 sink.hbase.client.write.buffer=10971520
sink.hbase.client.ipc.pool.size=3 sink.hbase.client.ipc.pool.size=3
sink.hbase.batch.size=1048576
sink.hbase.batch.interval.ms=10000
#oss sink settings
#multiple addresses allowed, ip1:port,ip2:port...
#sink.oss.endpoint=192.168.44.12:8186
#sink.oss.filter.expression=FileChunk.offset == 0 && FileChunk.lastChunkFlag == 1
#sink.oss.cache.time.ms=30000
#sink.oss.cache.size=1073741824
#file meta association settings
#source.kafka.file.meta.session.topic=SESSION-RECORD
#source.kafka.file.meta.proxy.topic=PROXY-EVENT
#source.kafka.file.meta.group.id=file_chunk_combine_1
#file.meta.filter.expression=FileChunk.meta.fileId.contains("_9")
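
A minimal sketch of loading a properties file like this one into the Flink Configuration the job reads; the file path is an assumption:

import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;

public class LoadConfigExample {
    public static void main(String[] args) throws Exception {
        // Read the properties file and expose it as a Flink Configuration.
        ParameterTool parameterTool = ParameterTool.fromPropertiesFile("file-chunk-combiner.properties");
        Configuration configuration = parameterTool.getConfiguration();
        System.out.println("flink.job.name = " + configuration.getString("flink.job.name", ""));
    }
}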

View File

@@ -6,18 +6,14 @@ import cn.hutool.core.util.RandomUtil;
import com.zdjizhi.config.Configs; import com.zdjizhi.config.Configs;
import com.zdjizhi.function.*; import com.zdjizhi.function.*;
import com.zdjizhi.function.map.ParseMessagePackMapFunction; import com.zdjizhi.function.map.ParseMessagePackMapFunction;
import com.zdjizhi.function.map.SideOutputMapFunction;
import com.zdjizhi.pojo.FileChunk; import com.zdjizhi.pojo.FileChunk;
import com.zdjizhi.sink.HBaseSink; import com.zdjizhi.sink.HBaseSink;
import com.zdjizhi.sink.HosSink; import com.zdjizhi.sink.HosSink;
import com.zdjizhi.trigger.LastChunkOrNoDataInTimeTrigger; import com.zdjizhi.trigger.LastChunkOrNoDataInTimeTrigger;
import com.zdjizhi.trigger.LastChunkTrigger;
import com.zdjizhi.trigger.MultipleTrigger; import com.zdjizhi.trigger.MultipleTrigger;
import org.apache.flink.api.common.ExecutionConfig; import com.zdjizhi.utils.PublicUtil;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo; import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.typeutils.PojoTypeInfo;
import org.apache.flink.api.java.utils.ParameterTool; import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration;
@@ -26,39 +22,30 @@ import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.SinkFunction; import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction; import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.operators.*; import org.apache.flink.streaming.api.operators.*;
import org.apache.flink.streaming.api.transformations.OneInputTransformation; import org.apache.flink.streaming.api.transformations.OneInputTransformation;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time; import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger; import org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger;
import org.apache.flink.streaming.api.windowing.triggers.ProcessingTimeTrigger;
import org.apache.flink.streaming.api.windowing.triggers.Trigger; import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow; import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.runtime.operators.windowing.WindowOperator; import org.apache.flink.streaming.runtime.operators.windowing.WindowOperator;
import org.apache.flink.streaming.runtime.operators.windowing.functions.InternalIterableProcessWindowFunction;
import org.apache.flink.streaming.runtime.operators.windowing.functions.InternalWindowFunction;
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
import org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness; import org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness;
import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
import org.apache.flink.streaming.util.TestHarnessUtil; import org.apache.flink.streaming.util.TestHarnessUtil;
import org.apache.flink.streaming.util.functions.StreamingFunctionUtils;
import org.apache.flink.test.util.MiniClusterWithClientResource; import org.apache.flink.test.util.MiniClusterWithClientResource;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag; import org.apache.flink.util.OutputTag;
import org.junit.*; import org.junit.*;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import java.io.*; import java.io.*;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.time.Duration;
import java.util.*; import java.util.*;
import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentLinkedQueue;
import static com.zdjizhi.utils.PublicConstants.COMBINE_MODE_APPEND;
public class FileChunkCombinerTests { public class FileChunkCombinerTests {
private File emlFile; private File emlFile;
private byte[] emlFileBytes; private byte[] emlFileBytes;
@@ -111,7 +98,7 @@ public class FileChunkCombinerTests {
triggers.add(EventTimeTrigger.create());
triggers.add(LastChunkOrNoDataInTimeTrigger.of(1000));
Trigger<Object, TimeWindow> trigger = MultipleTrigger.of(triggers);
processWindowFunction = new CombineChunkProcessWindowFunction(Integer.MAX_VALUE); processWindowFunction = new CombineChunkProcessWindowFunction();
delayedChunkOutputTag = new OutputTag<FileChunk>("delayed-chunk") {
};
DataStreamSource<FileChunk> source = env.fromCollection(inputFileChunks);
@@ -165,34 +152,6 @@ public class FileChunkCombinerTests {
testHarness.close();
}
@Test
public void testSideOutputMapFunction() throws Exception {
SideOutputMapFunction sideOutputMapFunction = new SideOutputMapFunction();
OneInputStreamOperatorTestHarness<FileChunk, FileChunk> testHarness = new OneInputStreamOperatorTestHarness<>(new StreamMap<>(sideOutputMapFunction));
testHarness.setup();
testHarness.open();
for (FileChunk fileChunk : inputFileChunks) {
testHarness.processElement(new StreamRecord<>(fileChunk));
}
ConcurrentLinkedQueue<Object> expectedOutput = new ConcurrentLinkedQueue<>();
for (FileChunk fileChunk : inputFileChunks) {
fileChunk.setChunkCount(1);
if (COMBINE_MODE_APPEND.equals(fileChunk.getCombineMode())) {
fileChunk.setChunkNumbers(fileChunk.getTimestamp() + "-" + fileChunk.getChunk().length + ";");
}
expectedOutput.add(new StreamRecord<>(fileChunk));
}
ConcurrentLinkedQueue<Object> actualOutput = testHarness.getOutput();
Assert.assertEquals(30, actualOutput.size());
TestHarnessUtil.assertOutputEqualsSorted("Output was not correct.", expectedOutput, actualOutput, (o1, o2) -> {
StreamRecord sr0 = (StreamRecord) o1;
StreamRecord sr1 = (StreamRecord) o2;
return ((FileChunk) sr0.getValue()).getUuid().compareTo(((FileChunk) sr1.getValue()).getUuid());
});
Assert.assertEquals(30, sideOutputMapFunction.delayedChunksCounter.getCount());
testHarness.close();
}
@Test
public void testFileChunkFilterFunction() throws Exception {
FileChunkFilterFunction fileChunkFilterFunction = new FileChunkFilterFunction("FileChunk.fileType == \"eml\"", "test");
@@ -331,11 +290,13 @@ public class FileChunkCombinerTests {
testHarness.close();
}
// Testing the hos sink requires an available hos address to be configured
@Test
public void testHosSink() throws Exception {
// Test single-chunk upload
configuration.setString(Configs.SINK_TYPE, "hos");
configuration.setBoolean(Configs.SINK_BATCH, false); configuration.setLong(Configs.SINK_HOS_BATCH_SIZE, 0L);
configuration.setInteger(Configs.SINK_HOS_BATCH_INTERVAL_MS, 0);
HosSink hosSink = new HosSink(configuration);
StreamSink<FileChunk> fileChunkStreamSink = new StreamSink<>(hosSink);
OneInputStreamOperatorTestHarness<FileChunk, Object> testHarness = new OneInputStreamOperatorTestHarness<>(fileChunkStreamSink);
@@ -343,7 +304,7 @@ public class FileChunkCombinerTests {
testHarness.open();
byte[] data = RandomUtil.randomString(1000).getBytes();
// seek file
FileChunk fileChunk = new FileChunk("0000000001", "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis() * 1000); FileChunk fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis() * 1000);
testHarness.processElement(new StreamRecord<>(fileChunk));
Assert.assertEquals(1, hosSink.chunksInCounter.getCount());
Assert.assertEquals(1, hosSink.chunksOutCounter.getCount());
@@ -356,7 +317,7 @@ public class FileChunkCombinerTests {
Assert.assertEquals(0, hosSink.between100KBAnd1MBChunksCounter.getCount());
Assert.assertEquals(0, hosSink.greaterThan1MBChunksCounter.getCount());
// append file
fileChunk = new FileChunk("0000000002", "pcapng", data.length, data, "append", 5, System.currentTimeMillis() * 1000, pcapngFileMeta, "1-200,2-200,3-200,4-200,5-200"); fileChunk = new FileChunk(PublicUtil.getUUID(), "pcapng", data.length, data, "append", 5, System.currentTimeMillis() * 1000, pcapngFileMeta, "1-200,2-200,3-200,4-200,5-200");
testHarness.processElement(new StreamRecord<>(fileChunk));
Assert.assertEquals(2, hosSink.chunksInCounter.getCount());
Assert.assertEquals(2, hosSink.chunksOutCounter.getCount());
@@ -373,19 +334,18 @@ public class FileChunkCombinerTests {
// Test batch upload
data = RandomUtil.randomString(10000).getBytes();
configuration.setString(Configs.SINK_TYPE, "hos");
configuration.setBoolean(Configs.SINK_BATCH, true); configuration.setLong(Configs.SINK_HOS_BATCH_SIZE, 1024*1024L);
configuration.setInteger(Configs.SINK_BATCH_COUNT, 10); configuration.setInteger(Configs.SINK_HOS_BATCH_INTERVAL_MS, 2000);
configuration.setInteger(Configs.SINK_BATCH_TIME, 2);
hosSink = new HosSink(configuration);
fileChunkStreamSink = new StreamSink<>(hosSink);
testHarness = new OneInputStreamOperatorTestHarness<>(fileChunkStreamSink);
testHarness.setup();
testHarness.open();
fileChunk = new FileChunk("0000000001", "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis() * 1000); fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis() * 1000);
testHarness.processElement(new StreamRecord<>(fileChunk));
fileChunk = new FileChunk("0000000002", "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis() * 1000); fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis() * 1000);
testHarness.processElement(new StreamRecord<>(fileChunk));
Thread.sleep(configuration.getInteger(Configs.SINK_BATCH_TIME) * 1000L + 1000); Thread.sleep(configuration.getInteger(Configs.SINK_HOS_BATCH_INTERVAL_MS) + 1000);
Assert.assertEquals(2, hosSink.chunksInCounter.getCount());
Assert.assertEquals(2, hosSink.chunksOutCounter.getCount());
Assert.assertEquals(0, hosSink.errorChunksCounter.getCount());
@@ -399,23 +359,48 @@ public class FileChunkCombinerTests {
testHarness.close();
}
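
Note: the hos sink tests above replace the old SINK_BATCH / SINK_BATCH_COUNT / SINK_BATCH_TIME switches with SINK_HOS_BATCH_SIZE (bytes) and SINK_HOS_BATCH_INTERVAL_MS (milliseconds); setting both to 0 yields per-chunk uploads, non-zero values yield batched uploads. Below is a minimal sketch of the flush policy those two knobs suggest, with hypothetical class and field names, since HosSink itself is not part of this diff; the real sink presumably also flushes from a background timer, which is why the test sleeps past the interval before asserting.

import java.util.ArrayList;
import java.util.List;

class BatchBuffer {

    private final long batchSizeBytes;   // e.g. 1024 * 1024; 0 flushes on every chunk
    private final long batchIntervalMs;  // e.g. 2000; 0 flushes on every chunk
    private final List<byte[]> pending = new ArrayList<>();
    private long pendingBytes = 0L;
    private long lastFlushMs = System.currentTimeMillis();

    BatchBuffer(long batchSizeBytes, long batchIntervalMs) {
        this.batchSizeBytes = batchSizeBytes;
        this.batchIntervalMs = batchIntervalMs;
    }

    // Buffers one chunk and returns the chunks to upload now,
    // or an empty list while the batch is still filling.
    List<byte[]> add(byte[] chunk) {
        pending.add(chunk);
        pendingBytes += chunk.length;
        boolean sizeReached = pendingBytes >= batchSizeBytes;
        boolean intervalElapsed = System.currentTimeMillis() - lastFlushMs >= batchIntervalMs;
        if (sizeReached || intervalElapsed) {
            List<byte[]> batch = new ArrayList<>(pending);
            pending.clear();
            pendingBytes = 0L;
            lastFlushMs = System.currentTimeMillis();
            return batch;
        }
        return new ArrayList<>();
    }
}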
// Testing the hbase sink requires an available hbase address to be configured
@Test
public void testHBaseSink() throws Exception {
// Test single-chunk upload
configuration.setString(Configs.SINK_TYPE, "hbase");
configuration.setBoolean(Configs.SINK_BATCH, true); configuration.setLong(Configs.SINK_HBASE_BATCH_SIZE, 0L);
configuration.setInteger(Configs.SINK_BATCH_COUNT, 10); configuration.setInteger(Configs.SINK_HBASE_BATCH_INTERVAL_MS, 0);
configuration.setInteger(Configs.SINK_BATCH_TIME, 2);
HBaseSink hBaseSink = new HBaseSink(configuration);
StreamSink<FileChunk> fileChunkStreamSink = new StreamSink<>(hBaseSink);
OneInputStreamOperatorTestHarness<FileChunk, Object> testHarness = new OneInputStreamOperatorTestHarness<>(fileChunkStreamSink);
testHarness.setup();
testHarness.open();
byte[] data = RandomUtil.randomString(1000).getBytes();
FileChunk fileChunk = new FileChunk("0000000001", "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis()); FileChunk fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis());
testHarness.processElement(new StreamRecord<>(fileChunk));
fileChunk = new FileChunk("0000000002", "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis()); Assert.assertEquals(1, hBaseSink.chunksInCounter.getCount());
Assert.assertEquals(1, hBaseSink.chunksOutCounter.getCount());
Assert.assertEquals(0, hBaseSink.errorChunksCounter.getCount());
Assert.assertEquals(1, hBaseSink.filesCounter.getCount());
Assert.assertEquals(1, hBaseSink.lessThan1KBChunksCounter.getCount());
Assert.assertEquals(0, hBaseSink.between1KBAnd5KBChunksCounter.getCount());
Assert.assertEquals(0, hBaseSink.between5KBAnd10KBChunksCounter.getCount());
Assert.assertEquals(0, hBaseSink.between10KBAnd100KBChunksCounter.getCount());
Assert.assertEquals(0, hBaseSink.between100KBAnd1MBChunksCounter.getCount());
Assert.assertEquals(0, hBaseSink.greaterThan1MBChunksCounter.getCount());
testHarness.close();
// Test batch upload
configuration.setString(Configs.SINK_TYPE, "hbase");
configuration.setLong(Configs.SINK_HBASE_BATCH_SIZE, 1024*1024L);
configuration.setInteger(Configs.SINK_HBASE_BATCH_INTERVAL_MS, 2000);
hBaseSink = new HBaseSink(configuration);
fileChunkStreamSink = new StreamSink<>(hBaseSink);
testHarness = new OneInputStreamOperatorTestHarness<>(fileChunkStreamSink);
testHarness.setup();
testHarness.open();
data = RandomUtil.randomString(1000).getBytes();
fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis());
testHarness.processElement(new StreamRecord<>(fileChunk));
Thread.sleep(configuration.getInteger(Configs.SINK_BATCH_TIME) * 1000L + 1000); fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis());
testHarness.processElement(new StreamRecord<>(fileChunk));
Thread.sleep(configuration.getInteger(Configs.SINK_HBASE_BATCH_INTERVAL_MS) + 1000);
Assert.assertEquals(2, hBaseSink.chunksInCounter.getCount());
Assert.assertEquals(2, hBaseSink.chunksOutCounter.getCount());
Assert.assertEquals(0, hBaseSink.errorChunksCounter.getCount());
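
Note: the single-chunk assertions above exercise the per-size-bucket chunk counters that both sinks expose. A hypothetical illustration of that bucketing follows; the real counters are registered on the sink's Flink metric group, and the thresholds below are inferred only from the counter names and from the 1000-byte test chunk landing in the less-than-1KB bucket.

import org.apache.flink.metrics.Counter;
import org.apache.flink.metrics.SimpleCounter;

class ChunkSizeBuckets {

    final Counter lessThan1KB = new SimpleCounter();
    final Counter between1KBAnd5KB = new SimpleCounter();
    final Counter between5KBAnd10KB = new SimpleCounter();
    final Counter between10KBAnd100KB = new SimpleCounter();
    final Counter between100KBAnd1MB = new SimpleCounter();
    final Counter greaterThan1MB = new SimpleCounter();

    // Route one chunk into exactly one bucket by its payload length.
    void count(int chunkLength) {
        if (chunkLength < 1024) lessThan1KB.inc();
        else if (chunkLength < 5 * 1024) between1KBAnd5KB.inc();
        else if (chunkLength < 10 * 1024) between5KBAnd10KB.inc();
        else if (chunkLength < 100 * 1024) between10KBAnd100KB.inc();
        else if (chunkLength < 1024 * 1024) between100KBAnd1MB.inc();
        else greaterThan1MB.inc();
    }
}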
@@ -633,21 +618,20 @@ public class FileChunkCombinerTests {
private StreamExecutionEnvironment createPipeline(int parallelism, SourceFunction<byte[]> source, long windowTime, long windowIdleTime) {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(parallelism);
WatermarkStrategy<FileChunk> watermarkStrategy = WatermarkStrategy
.<FileChunk>forBoundedOutOfOrderness(Duration.ofSeconds(0))
.withTimestampAssigner((FileChunk, timestamp) -> FileChunk.getTimestamp() / 1000);
List<Trigger<Object, TimeWindow>> triggers = new ArrayList<>();
triggers.add(EventTimeTrigger.create()); triggers.add(ProcessingTimeTrigger.create());
triggers.add(LastChunkOrNoDataInTimeTrigger.of(windowIdleTime * 1000)); if (configuration.get(Configs.COMBINER_WINDOW_ENABLE_LAST_CHUNK_TRIGGER)) {
triggers.add(LastChunkTrigger.create());
}
Trigger<Object, TimeWindow> trigger = MultipleTrigger.of(triggers);
env.addSource(source)
.map(new ParseMessagePackMapFunction())
.filter(new FileChunkFilterFunction("", "test"))
.assignTimestampsAndWatermarks(watermarkStrategy)
.keyBy(new FileChunkKeySelector(), BasicTypeInfo.STRING_TYPE_INFO)
.window(TumblingEventTimeWindows.of(Time.seconds(windowTime))) .window(TumblingProcessingTimeWindows.of(Time.seconds(windowTime)))
.trigger(trigger)
.process(new CombineChunkProcessWindowFunction(Integer.MAX_VALUE)) .process(new CombineChunkProcessWindowFunction())
.addSink(new CollectSink());
return env;
}
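
Note: createPipeline now keys the stream with FileChunkKeySelector, assigns processing-time tumbling windows, and combines ProcessingTimeTrigger with an optional LastChunkTrigger through MultipleTrigger, so a window can fire either when its timer expires or as soon as the last chunk of a file arrives. MultipleTrigger's implementation is not part of this diff; the sketch below only illustrates the assumed "fire when any delegate fires" composition, under a stand-in class name.

import java.util.ArrayList;
import java.util.List;

import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

class AnyOfTrigger extends Trigger<Object, TimeWindow> {

    private final List<Trigger<Object, TimeWindow>> delegates;

    private AnyOfTrigger(List<Trigger<Object, TimeWindow>> delegates) {
        this.delegates = new ArrayList<>(delegates);
    }

    static AnyOfTrigger of(List<Trigger<Object, TimeWindow>> delegates) {
        return new AnyOfTrigger(delegates);
    }

    @Override
    public TriggerResult onElement(Object element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
        TriggerResult result = TriggerResult.CONTINUE;
        for (Trigger<Object, TimeWindow> delegate : delegates) {
            result = merge(result, delegate.onElement(element, timestamp, window, ctx));
        }
        return result;
    }

    @Override
    public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        TriggerResult result = TriggerResult.CONTINUE;
        for (Trigger<Object, TimeWindow> delegate : delegates) {
            result = merge(result, delegate.onProcessingTime(time, window, ctx));
        }
        return result;
    }

    @Override
    public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        TriggerResult result = TriggerResult.CONTINUE;
        for (Trigger<Object, TimeWindow> delegate : delegates) {
            result = merge(result, delegate.onEventTime(time, window, ctx));
        }
        return result;
    }

    @Override
    public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
        for (Trigger<Object, TimeWindow> delegate : delegates) {
            delegate.clear(window, ctx);
        }
    }

    // Fire if any delegate fires; keep a purge request from any delegate.
    private static TriggerResult merge(TriggerResult left, TriggerResult right) {
        boolean fire = left.isFire() || right.isFire();
        boolean purge = left.isPurge() || right.isPurge();
        if (fire && purge) return TriggerResult.FIRE_AND_PURGE;
        if (fire) return TriggerResult.FIRE;
        if (purge) return TriggerResult.PURGE;
        return TriggerResult.CONTINUE;
    }
}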