package com.zdjizhi;

import cn.hutool.core.util.ArrayUtil;
import cn.hutool.core.util.RandomUtil;
import com.zdjizhi.config.Configs;
import com.zdjizhi.function.*;
import com.zdjizhi.function.map.ParseMessagePackMapFunction;
import com.zdjizhi.pojo.FileChunk;
import com.zdjizhi.sink.HBaseSink;
import com.zdjizhi.sink.HosSink;
import com.zdjizhi.trigger.LastChunkOrNoDataInTimeTrigger;
import com.zdjizhi.trigger.LastChunkTrigger;
import com.zdjizhi.trigger.MultipleTrigger;
import com.zdjizhi.utils.PublicUtil;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.operators.*;
import org.apache.flink.streaming.api.transformations.OneInputTransformation;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger;
import org.apache.flink.streaming.api.windowing.triggers.ProcessingTimeTrigger;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.runtime.operators.windowing.WindowOperator;
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
import org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness;
import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
import org.apache.flink.streaming.util.TestHarnessUtil;
import org.apache.flink.test.util.MiniClusterWithClientResource;
import org.apache.flink.util.OutputTag;
import org.junit.*;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentLinkedQueue;

/**
 * Tests for the file-chunk combiner pipeline: message-pack parsing, chunk filtering,
 * windowed chunk combining, and the HOS/HBase sinks.
 */
public class FileChunkCombinerTests {
    private byte[] pcapngFileBytes;
    private List<FileChunk> inputFileChunks;
    private List<FileChunk> inputFiles;
    private List<byte[]> messagePackList;
    private List<FileChunk> emlFileChunks;
    private List<FileChunk> pcapngFileChunks;
    private List<FileChunk> pcapngIncludeMetaFileChunks;
    private Map<String, Object> pcapngFileMeta;
    // Fixture expectations: the eml test file is 16607 bytes split into 10 chunks, and the
    // pcapng test file is pcapChunkData repeated pcapngChunkCount times (see testBefore).
    private final int emlLength = 16607;
    private final int emlChunkCount = 10;
    private final int pcapngChunkCount = 10;
    private final String pcapChunkData = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
    private static Configuration configuration;
    private CombineChunkProcessWindowFunction processWindowFunction;
    private OutputTag<FileChunk> delayedChunkOutputTag;
    private KeyedOneInputStreamOperatorTestHarness<String, FileChunk, FileChunk> testHarness;

    @Before
    public void testBefore() throws Exception {
        ParameterTool parameterTool = ParameterTool.fromPropertiesFile(
                FileChunkCombinerTests.class.getClassLoader().getResource("common.properties").getPath());
        configuration = parameterTool.getConfiguration();
        String filePath = "src" + File.separator + "test" + File.separator + "data" + File.separator + "test.eml";
        StringBuilder pcapData = new StringBuilder();
        for (int i = 0; i < 10; i++) {
            pcapData.append(pcapChunkData);
        }
        pcapngFileBytes = pcapData.toString().getBytes();
        pcapngFileMeta = new HashMap<>();
        pcapngFileMeta.put("ruleId", 151);
        pcapngFileMeta.put("taskId", 7477);
        pcapngFileMeta.put("sledIP", "127.0.0.1");
        emlFileChunks = new ArrayList<>();
        pcapngFileChunks = new ArrayList<>();
        pcapngIncludeMetaFileChunks = new ArrayList<>();
        // Deserialize the pre-captured fixtures: raw message packs, the parsed chunks,
        // and the expected combined files.
        ObjectInputStream messagePacksInputStream = new ObjectInputStream(Files.newInputStream(
                Paths.get("src" + File.separator + "test" + File.separator + "data" + File.separator + "messagePacks")));
        messagePackList = (List<byte[]>) messagePacksInputStream.readObject();
        messagePacksInputStream.close();
        ObjectInputStream fileChunksInputStream = new ObjectInputStream(Files.newInputStream(
                Paths.get("src" + File.separator + "test" + File.separator + "data" + File.separator + "fileChunks")));
        inputFileChunks = (List<FileChunk>) fileChunksInputStream.readObject();
        fileChunksInputStream.close();
        ObjectInputStream filesInputStream = new ObjectInputStream(Files.newInputStream(
                Paths.get("src" + File.separator + "test" + File.separator + "data" + File.separator + "files")));
        inputFiles = (List<FileChunk>) filesInputStream.readObject();
        filesInputStream.close();
        // Build the windowed combine pipeline once, then unwrap its WindowOperator so the
        // window-function tests can drive it through a keyed test harness.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        List<Trigger<Object, TimeWindow>> triggers = new ArrayList<>();
        triggers.add(EventTimeTrigger.create());
        triggers.add(LastChunkOrNoDataInTimeTrigger.of(1000));
        Trigger<Object, TimeWindow> trigger = MultipleTrigger.of(triggers);
        processWindowFunction = new CombineChunkProcessWindowFunction();
        delayedChunkOutputTag = new OutputTag<FileChunk>("delayed-chunk") {
        };
        DataStreamSource<FileChunk> source = env.fromCollection(inputFileChunks);
        DataStream<FileChunk> window = source
                .keyBy(new FileChunkKeySelector())
                .window(TumblingEventTimeWindows.of(Time.seconds(3)))
                .trigger(trigger)
                .sideOutputLateData(delayedChunkOutputTag)
                .process(processWindowFunction);
        OneInputTransformation<FileChunk, FileChunk> transform = (OneInputTransformation<FileChunk, FileChunk>) window.getTransformation();
        OneInputStreamOperator<FileChunk, FileChunk> operator = transform.getOperator();
        WindowOperator<String, FileChunk, FileChunk, FileChunk, TimeWindow> winOperator =
                (WindowOperator<String, FileChunk, FileChunk, FileChunk, TimeWindow>) operator;
        testHarness = new KeyedOneInputStreamOperatorTestHarness<>(winOperator, winOperator.getKeySelector(), BasicTypeInfo.STRING_TYPE_INFO);
    }
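
    // The tests below drive individual operators through Flink's test harnesses:
    // OneInputStreamOperatorTestHarness for the stateless map/filter/sink operators and the
    // keyed harness built in testBefore() for the window operator, so processing time and
    // watermarks can be advanced deterministically without running a cluster.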

    @Test
    public void testParseMessagePackMapFunction() throws Exception {
        ParseMessagePackMapFunction mapFunction = new ParseMessagePackMapFunction();
        OneInputStreamOperatorTestHarness<byte[], FileChunk> testHarness =
                new OneInputStreamOperatorTestHarness<>(new StreamMap<>(mapFunction));
        testHarness.setup();
        testHarness.open();
        for (byte[] messagePack : messagePackList) {
            testHarness.processElement(new StreamRecord<>(messagePack));
        }
        ConcurrentLinkedQueue<Object> expectedOutput = new ConcurrentLinkedQueue<>();
        for (FileChunk fileChunk : inputFileChunks) {
            expectedOutput.add(new StreamRecord<>(fileChunk));
        }
        ConcurrentLinkedQueue<Object> actualOutput = testHarness.getOutput();
        Assert.assertEquals(30, actualOutput.size());
        TestHarnessUtil.assertOutputEqualsSorted("Output was not correct.", expectedOutput, actualOutput, (o1, o2) -> {
            StreamRecord sr0 = (StreamRecord) o1;
            StreamRecord sr1 = (StreamRecord) o2;
            return ((FileChunk) sr0.getValue()).getUuid().compareTo(((FileChunk) sr1.getValue()).getUuid());
        });
        // All 30 chunks parse without errors; size and file-type counters match the fixture.
        Assert.assertEquals(30, mapFunction.chunksInCounter.getCount());
        Assert.assertEquals(30, mapFunction.chunksOutCounter.getCount());
        Assert.assertEquals(0, mapFunction.errorChunksCounter.getCount());
        Assert.assertEquals(22, mapFunction.lessThan1KBChunksCounter.getCount());
        Assert.assertEquals(8, mapFunction.between1KBAnd3KBChunksCounter.getCount());
        Assert.assertEquals(0, mapFunction.between3KBAnd5KBChunksCounter.getCount());
        Assert.assertEquals(0, mapFunction.between5KBAnd10KBChunksCounter.getCount());
        Assert.assertEquals(0, mapFunction.between10KBAnd50KBChunksCounter.getCount());
        Assert.assertEquals(0, mapFunction.between50KBAnd100KBChunksCounter.getCount());
        Assert.assertEquals(0, mapFunction.greaterThan100KBChunksCounter.getCount());
        Assert.assertEquals(10, mapFunction.emlChunksCounter.getCount());
        Assert.assertEquals(20, mapFunction.pcapngChunksCounter.getCount());
        Assert.assertEquals(0, mapFunction.txtChunksCounter.getCount());
        Assert.assertEquals(0, mapFunction.htmlChunksCounter.getCount());
        Assert.assertEquals(0, mapFunction.mediaChunksCounter.getCount());
        testHarness.close();
    }

    @Test
    public void testFileChunkFilterFunction() throws Exception {
        FileChunkFilterFunction fileChunkFilterFunction = new FileChunkFilterFunction("FileChunk.fileType == \"eml\"", "test");
        StreamFilter<FileChunk> fileChunkStreamFilter = new StreamFilter<>(fileChunkFilterFunction);
        OneInputStreamOperatorTestHarness<FileChunk, FileChunk> testHarness = new OneInputStreamOperatorTestHarness<>(fileChunkStreamFilter);
        testHarness.setup();
        testHarness.open();
        ConcurrentLinkedQueue<Object> expectedOutput = new ConcurrentLinkedQueue<>();
        for (FileChunk fileChunk : inputFileChunks) {
            testHarness.processElement(new StreamRecord<>(fileChunk));
            if ("eml".equals(fileChunk.getFileType())) {
                expectedOutput.add(new StreamRecord<>(fileChunk));
            }
        }
        // 10 of the 30 chunks are eml and pass the filter; the other 20 are dropped.
        ConcurrentLinkedQueue<Object> actualOutput = testHarness.getOutput();
        Assert.assertEquals(10, actualOutput.size());
        TestHarnessUtil.assertOutputEqualsSorted("Output was not correct.", expectedOutput, actualOutput, (o1, o2) -> {
            StreamRecord sr0 = (StreamRecord) o1;
            StreamRecord sr1 = (StreamRecord) o2;
            return ((FileChunk) sr0.getValue()).getUuid().compareTo(((FileChunk) sr1.getValue()).getUuid());
        });
        Assert.assertEquals(20, fileChunkFilterFunction.filterChunksCounter.getCount());
        testHarness.close();
    }
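
    // Note: FileChunkFilterFunction's first constructor argument appears to be a boolean
    // expression over FileChunk fields (the empty string used in createPipeline below
    // evidently accepts everything), and the second a name used for metrics registration;
    // both are inferred from usage in this file rather than from the function's documentation.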

    @Test
    public void testCombineChunkProcessWindowFunction() throws Exception {
        testHarness.open();
        testHarness.setProcessingTime(0L);
        testHarness.processWatermark(Long.MIN_VALUE);
        for (FileChunk inputFileChunk : inputFileChunks) {
            // Chunk timestamps are divided by 1000, suggesting they are stored in microseconds
            // while the harness expects milliseconds.
            testHarness.processElement(new StreamRecord<>(inputFileChunk, inputFileChunk.getTimestamp() / 1000));
        }
        // Advance processing time and the watermark to the end of time so every window fires.
        testHarness.setProcessingTime(Long.MAX_VALUE);
        testHarness.processWatermark(Long.MAX_VALUE);
        testHarness.close();
        List<Object> expectedOutput = new ArrayList<>(inputFiles);
        List<Object> actualOutput = new ArrayList<>(testHarness.extractOutputValues());
        Assert.assertEquals(3, actualOutput.size());
        TestHarnessUtil.assertOutputEqualsSorted("Output was not correct.", expectedOutput, actualOutput,
                Comparator.comparing(o -> ((FileChunk) o).getUuid()));
        Assert.assertEquals(0, processWindowFunction.errorChunksCounter.getCount());
        Assert.assertEquals(0, processWindowFunction.duplicateChunksCounter.getCount());
    }

    @Test
    public void testCombineChunkProcessWindowFunctionByOutputTag() throws Exception {
        testHarness.open();
        categorizeChunks(inputFileChunks);
        long timestamp = 0L;
        for (FileChunk fileChunk : emlFileChunks) {
            testHarness.processElement(fileChunk, timestamp += 10);
        }
        for (FileChunk fileChunk : pcapngFileChunks) {
            testHarness.processElement(fileChunk, timestamp += 10);
        }
        // Advance the watermark past the window end; everything processed afterwards is late.
        testHarness.processWatermark(3000L);
        for (FileChunk fileChunk : pcapngIncludeMetaFileChunks) {
            testHarness.processElement(fileChunk, timestamp += 10);
        }
        List<Object> expectedOutput = new ArrayList<>();
        expectedOutput.add(inputFiles.get(0));
        expectedOutput.add(inputFiles.get(1));
        List<Object> actualOutput = new ArrayList<>(testHarness.extractOutputValues());
        Assert.assertEquals(2, actualOutput.size());
        TestHarnessUtil.assertOutputEqualsSorted("Output was not correct.", expectedOutput, actualOutput,
                Comparator.comparing(o -> ((FileChunk) o).getUuid()));
        // The late chunks must land in the delayed-chunk side output instead of the main output.
        ConcurrentLinkedQueue<StreamRecord<FileChunk>> sideOutput = testHarness.getSideOutput(delayedChunkOutputTag);
        List<Object> expectedSideOutput = new ArrayList<>(pcapngIncludeMetaFileChunks);
        List<Object> actualSideOutput = new ArrayList<>();
        for (StreamRecord<FileChunk> streamRecord : sideOutput) {
            actualSideOutput.add(streamRecord.getValue());
        }
        Assert.assertEquals(10, sideOutput.size());
        TestHarnessUtil.assertOutputEqualsSorted("Output was not correct.", expectedSideOutput, actualSideOutput,
                Comparator.comparing(o -> ((FileChunk) o).getUuid()));
        Assert.assertEquals(0, processWindowFunction.errorChunksCounter.getCount());
        Assert.assertEquals(0, processWindowFunction.duplicateChunksCounter.getCount());
        testHarness.close();
    }

    @Test
    public void testCombineChunkProcessWindowFunctionByDuplicateChunk() throws Exception {
        testHarness.open();
        categorizeChunks(inputFileChunks);
        // Inject duplicates: one extra pcapng chunk per file, plus an eml chunk replayed up front.
        pcapngFileChunks.add(pcapngFileChunks.get(5));
        pcapngIncludeMetaFileChunks.add(pcapngIncludeMetaFileChunks.get(5));
        long timestamp = 0L;
        testHarness.processElement(emlFileChunks.get(5), timestamp + 100);
        for (FileChunk fileChunk : emlFileChunks) {
            testHarness.processElement(fileChunk, timestamp += 10);
        }
        for (FileChunk fileChunk : pcapngFileChunks) {
            testHarness.processElement(fileChunk, timestamp += 10);
        }
        for (FileChunk fileChunk : pcapngIncludeMetaFileChunks) {
            testHarness.processElement(fileChunk, timestamp += 10);
        }
        testHarness.setProcessingTime(5000L);
        List<FileChunk> actualOutput = testHarness.extractOutputValues();
        Assert.assertEquals(3, actualOutput.size());
        // The replayed eml chunk is deduplicated (seek mode), while the appended pcapng
        // duplicates are kept, so each combined pcapng file grows by one chunk's length.
        Assert.assertEquals(inputFiles.get(0), actualOutput.get(0));
        Assert.assertEquals(inputFiles.get(1).getChunk().length + pcapngFileChunks.get(5).getChunk().length, actualOutput.get(1).getChunk().length);
        Assert.assertEquals(inputFiles.get(2).getChunk().length + pcapngIncludeMetaFileChunks.get(5).getChunk().length, actualOutput.get(2).getChunk().length);
        Assert.assertEquals(0, processWindowFunction.errorChunksCounter.getCount());
        Assert.assertEquals(1, processWindowFunction.duplicateChunksCounter.getCount());
        testHarness.close();
    }

    @Test
    public void testCombineChunkProcessWindowFunctionByLostChunk() throws Exception {
        testHarness.open();
        categorizeChunks(inputFileChunks);
        // Drop one chunk from each file to simulate loss in transit.
        emlFileChunks.remove(emlFileChunks.get(5));
        pcapngFileChunks.remove(pcapngFileChunks.get(5));
        pcapngIncludeMetaFileChunks.remove(pcapngIncludeMetaFileChunks.get(5));
        long timestamp = 0L;
        for (FileChunk fileChunk : emlFileChunks) {
            testHarness.processElement(fileChunk, timestamp += 10);
        }
        for (FileChunk fileChunk : pcapngFileChunks) {
            testHarness.processElement(fileChunk, timestamp += 10);
        }
        for (FileChunk fileChunk : pcapngIncludeMetaFileChunks) {
            testHarness.processElement(fileChunk, timestamp += 10);
        }
        testHarness.setProcessingTime(5000L);
        // The eml (seek-mode) file splits into two outputs around the gap; each pcapng
        // (append-mode) file simply shrinks by the missing chunk's length.
        List<FileChunk> actualOutput = testHarness.extractOutputValues();
        Assert.assertEquals(4, actualOutput.size());
        Assert.assertEquals(inputFiles.get(0).getChunk().length - emlFileChunks.get(5).getChunk().length, actualOutput.get(0).getChunk().length + actualOutput.get(1).getChunk().length);
        Assert.assertEquals(inputFiles.get(1).getChunk().length - pcapngFileChunks.get(5).getChunk().length, actualOutput.get(2).getChunk().length);
        Assert.assertEquals(inputFiles.get(2).getChunk().length - pcapngIncludeMetaFileChunks.get(5).getChunk().length, actualOutput.get(3).getChunk().length);
        Assert.assertEquals(0, processWindowFunction.errorChunksCounter.getCount());
        Assert.assertEquals(0, processWindowFunction.duplicateChunksCounter.getCount());
        testHarness.close();
    }
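
    // Two upload modes recur in the sink tests below, mirroring the combiner's output: "seek"
    // chunks (eml) carry an explicit offset, length, and lastChunkFlag for positional reassembly,
    // while "append" chunks (pcapng) are concatenated in arrival order and may carry a meta map
    // plus what looks like a per-chunk manifest string ("1-200,2-200,...").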

    // Tests the HOS sink; requires a reachable HOS endpoint to be configured.
    @Test
    public void testHosSink() throws Exception {
        // Single-chunk upload: a batch size and interval of 0 disable batching.
        configuration.setString(Configs.SINK_TYPE, "hos");
        configuration.setLong(Configs.SINK_HOS_BATCH_SIZE, 0L);
        configuration.setInteger(Configs.SINK_HOS_BATCH_INTERVAL_MS, 0);
        HosSink hosSink = new HosSink(configuration);
        StreamSink<FileChunk> fileChunkStreamSink = new StreamSink<>(hosSink);
        OneInputStreamOperatorTestHarness<FileChunk, Object> testHarness = new OneInputStreamOperatorTestHarness<>(fileChunkStreamSink);
        testHarness.setup();
        testHarness.open();
        byte[] data = RandomUtil.randomString(1000).getBytes();
        // A seek-mode file.
        FileChunk fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis() * 1000);
        testHarness.processElement(new StreamRecord<>(fileChunk));
        Assert.assertEquals(1, hosSink.chunksInCounter.getCount());
        Assert.assertEquals(1, hosSink.chunksOutCounter.getCount());
        Assert.assertEquals(0, hosSink.errorChunksCounter.getCount());
        Assert.assertEquals(1, hosSink.filesCounter.getCount());
        Assert.assertEquals(1, hosSink.lessThan1KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between1KBAnd5KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between5KBAnd10KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between10KBAnd100KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between100KBAnd1MBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.greaterThan1MBChunksCounter.getCount());
        // An append-mode file carrying metadata and a per-chunk manifest.
        fileChunk = new FileChunk(PublicUtil.getUUID(), "pcapng", data.length, data, "append", 5, System.currentTimeMillis() * 1000, pcapngFileMeta, "1-200,2-200,3-200,4-200,5-200");
        testHarness.processElement(new StreamRecord<>(fileChunk));
        Assert.assertEquals(2, hosSink.chunksInCounter.getCount());
        Assert.assertEquals(2, hosSink.chunksOutCounter.getCount());
        Assert.assertEquals(0, hosSink.errorChunksCounter.getCount());
        Assert.assertEquals(1, hosSink.filesCounter.getCount());
        Assert.assertEquals(2, hosSink.lessThan1KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between1KBAnd5KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between5KBAnd10KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between10KBAnd100KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between100KBAnd1MBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.greaterThan1MBChunksCounter.getCount());
        testHarness.close();

        // Batched upload: chunks are buffered until the batch size or the interval elapses.
        data = RandomUtil.randomString(10000).getBytes();
        configuration.setString(Configs.SINK_TYPE, "hos");
        configuration.setLong(Configs.SINK_HOS_BATCH_SIZE, 1024 * 1024L);
        configuration.setInteger(Configs.SINK_HOS_BATCH_INTERVAL_MS, 2000);
        hosSink = new HosSink(configuration);
        fileChunkStreamSink = new StreamSink<>(hosSink);
        testHarness = new OneInputStreamOperatorTestHarness<>(fileChunkStreamSink);
        testHarness.setup();
        testHarness.open();
        fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis() * 1000);
        testHarness.processElement(new StreamRecord<>(fileChunk));
        fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis() * 1000);
        testHarness.processElement(new StreamRecord<>(fileChunk));
        // Wait past the batch interval (plus a margin) so the buffered chunks are flushed.
        Thread.sleep(configuration.getInteger(Configs.SINK_HOS_BATCH_INTERVAL_MS) + 1000);
        Assert.assertEquals(2, hosSink.chunksInCounter.getCount());
        Assert.assertEquals(2, hosSink.chunksOutCounter.getCount());
        Assert.assertEquals(0, hosSink.errorChunksCounter.getCount());
        Assert.assertEquals(2, hosSink.filesCounter.getCount());
        Assert.assertEquals(0, hosSink.lessThan1KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between1KBAnd5KBChunksCounter.getCount());
        Assert.assertEquals(2, hosSink.between5KBAnd10KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between10KBAnd100KBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.between100KBAnd1MBChunksCounter.getCount());
        Assert.assertEquals(0, hosSink.greaterThan1MBChunksCounter.getCount());
        testHarness.close();
    }

    // Tests the HBase sink; requires a reachable HBase endpoint to be configured.
    @Test
    public void testHBaseSink() throws Exception {
        // Single-chunk upload: a batch size and interval of 0 disable batching.
        configuration.setString(Configs.SINK_TYPE, "hbase");
        configuration.setLong(Configs.SINK_HBASE_BATCH_SIZE, 0L);
        configuration.setInteger(Configs.SINK_HBASE_BATCH_INTERVAL_MS, 0);
        HBaseSink hBaseSink = new HBaseSink(configuration);
        StreamSink<FileChunk> fileChunkStreamSink = new StreamSink<>(hBaseSink);
        OneInputStreamOperatorTestHarness<FileChunk, Object> testHarness = new OneInputStreamOperatorTestHarness<>(fileChunkStreamSink);
        testHarness.setup();
        testHarness.open();
        byte[] data = RandomUtil.randomString(1000).getBytes();
        FileChunk fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis());
        testHarness.processElement(new StreamRecord<>(fileChunk));
        Assert.assertEquals(1, hBaseSink.chunksInCounter.getCount());
        Assert.assertEquals(1, hBaseSink.chunksOutCounter.getCount());
        Assert.assertEquals(0, hBaseSink.errorChunksCounter.getCount());
        Assert.assertEquals(1, hBaseSink.filesCounter.getCount());
        Assert.assertEquals(1, hBaseSink.lessThan1KBChunksCounter.getCount());
        Assert.assertEquals(0, hBaseSink.between1KBAnd5KBChunksCounter.getCount());
        Assert.assertEquals(0, hBaseSink.between5KBAnd10KBChunksCounter.getCount());
        Assert.assertEquals(0, hBaseSink.between10KBAnd100KBChunksCounter.getCount());
        Assert.assertEquals(0, hBaseSink.between100KBAnd1MBChunksCounter.getCount());
        Assert.assertEquals(0, hBaseSink.greaterThan1MBChunksCounter.getCount());
        testHarness.close();

        // Batched upload: chunks are buffered until the batch size or the interval elapses.
        configuration.setString(Configs.SINK_TYPE, "hbase");
        configuration.setLong(Configs.SINK_HBASE_BATCH_SIZE, 1024 * 1024L);
        configuration.setInteger(Configs.SINK_HBASE_BATCH_INTERVAL_MS, 2000);
        hBaseSink = new HBaseSink(configuration);
        fileChunkStreamSink = new StreamSink<>(hBaseSink);
        testHarness = new OneInputStreamOperatorTestHarness<>(fileChunkStreamSink);
        testHarness.setup();
        testHarness.open();
        data = RandomUtil.randomString(1000).getBytes();
        fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis());
        testHarness.processElement(new StreamRecord<>(fileChunk));
        fileChunk = new FileChunk(PublicUtil.getUUID(), "eml", 0, data.length, data, "seek", 1, 5, System.currentTimeMillis());
        testHarness.processElement(new StreamRecord<>(fileChunk));
        // Wait past the batch interval (plus a margin) so the buffered chunks are flushed.
        Thread.sleep(configuration.getInteger(Configs.SINK_HBASE_BATCH_INTERVAL_MS) + 1000);
        Assert.assertEquals(2, hBaseSink.chunksInCounter.getCount());
        Assert.assertEquals(2, hBaseSink.chunksOutCounter.getCount());
        Assert.assertEquals(0, hBaseSink.errorChunksCounter.getCount());
        Assert.assertEquals(2, hBaseSink.filesCounter.getCount());
        Assert.assertEquals(2, hBaseSink.lessThan1KBChunksCounter.getCount());
        Assert.assertEquals(0, hBaseSink.between1KBAnd5KBChunksCounter.getCount());
        Assert.assertEquals(0, hBaseSink.between5KBAnd10KBChunksCounter.getCount());
        Assert.assertEquals(0, hBaseSink.between10KBAnd100KBChunksCounter.getCount());
        Assert.assertEquals(0, hBaseSink.between100KBAnd1MBChunksCounter.getCount());
        Assert.assertEquals(0, hBaseSink.greaterThan1MBChunksCounter.getCount());
        testHarness.close();
    }

    @Test
    public void testPipelineFullChunk() throws Exception {
        CollectSink.values.clear();
        long windowTime = 5;
        messagePackList.sort(Comparator.comparingInt(Arrays::hashCode)); // scramble arrival order
        StreamExecutionEnvironment env = createPipeline(2, new ByteDataSource(messagePackList, 500, windowTime), windowTime, 1);
        env.execute();
        List<FileChunk> fileChunks = CollectSink.values;
        Assert.assertFalse("Combine error: sink produced no output", fileChunks.isEmpty());
        categorizeChunks(fileChunks);
        byte[] data = new byte[0];
        long length = 0;
        long chunkCount = 0;
        int lastChunkFlag = 0;
        // Reassemble the eml (seek-mode) output by offset and verify the combined totals.
        emlFileChunks.sort(Comparator.comparingLong(FileChunk::getOffset));
        for (FileChunk fileChunk : emlFileChunks) {
            data = ArrayUtil.addAll(data, fileChunk.getChunk());
            length += fileChunk.getLength();
            chunkCount += fileChunk.getChunkCount();
            if (fileChunk.getLastChunkFlag() == 1) {
                lastChunkFlag = 1;
            }
        }
        Assert.assertEquals("seek-mode combine error: wrong lastChunkFlag", 1, lastChunkFlag);
        Assert.assertEquals("seek-mode combine error: wrong chunkCount", emlChunkCount - 1, chunkCount);
        Assert.assertEquals("seek-mode combine error: wrong file length", emlLength, length);
        data = new byte[0];
        length = 0;
        chunkCount = 0;
        for (FileChunk fileChunk : pcapngFileChunks) {
            data = ArrayUtil.addAll(data, fileChunk.getChunk());
            length += fileChunk.getLength();
            chunkCount += fileChunk.getChunkCount();
        }
        Assert.assertEquals("append-mode combine error: wrong chunkCount", pcapngChunkCount, chunkCount);
        Assert.assertEquals("append-mode combine error: wrong file length", pcapngFileBytes.length, length);
        Assert.assertEquals("append-mode combine error: wrong file content", new String(pcapngFileBytes), new String(data));
        data = new byte[0];
        length = 0;
        chunkCount = 0;
        for (FileChunk fileChunk : pcapngIncludeMetaFileChunks) {
            data = ArrayUtil.addAll(data, fileChunk.getChunk());
            length += fileChunk.getLength();
            chunkCount += fileChunk.getChunkCount();
            Assert.assertEquals("Combine error: wrong metadata", pcapngFileMeta, fileChunk.getMeta());
        }
    }

    @Test
    public void testPipelineLostChunk() throws Exception {
        CollectSink.values.clear();
        long windowTime = 5;
        // Drop one chunk from each file to simulate loss.
        messagePackList.remove(5);
        messagePackList.remove(15);
        messagePackList.remove(25);
        messagePackList.sort(Comparator.comparingInt(Arrays::hashCode)); // scramble arrival order
        StreamExecutionEnvironment env = createPipeline(2, new ByteDataSource(messagePackList, 500, windowTime), windowTime, 1);
        env.execute();
        List<FileChunk> fileChunks = CollectSink.values;
        Assert.assertFalse("Combine error: sink produced no output", fileChunks.isEmpty());
        categorizeChunks(fileChunks);
        byte[] data = new byte[0];
        long length = 0;
        long chunkCount = 0;
        int lastChunkFlag = 0;
        emlFileChunks.sort(Comparator.comparingLong(FileChunk::getOffset));
        for (FileChunk fileChunk : emlFileChunks) {
            data = ArrayUtil.addAll(data, fileChunk.getChunk());
            length += fileChunk.getLength();
            chunkCount += fileChunk.getChunkCount();
            if (fileChunk.getLastChunkFlag() == 1) {
                lastChunkFlag = 1;
            }
        }
        Assert.assertEquals("seek-mode combine error: wrong lastChunkFlag", 1, lastChunkFlag);
        Assert.assertEquals("seek-mode combine error: wrong chunkCount", emlChunkCount - 2, chunkCount);
        Assert.assertEquals("seek-mode combine error: wrong file length", emlLength - 2000, length);
        data = new byte[0];
        length = 0;
        chunkCount = 0;
        for (FileChunk fileChunk : pcapngFileChunks) {
            data = ArrayUtil.addAll(data, fileChunk.getChunk());
            length += fileChunk.getLength();
            chunkCount += fileChunk.getChunkCount();
        }
        Assert.assertEquals("append-mode combine error: wrong chunkCount", pcapngChunkCount - 1, chunkCount);
        Assert.assertEquals("append-mode combine error: wrong file length", pcapngFileBytes.length - pcapChunkData.length(), length);
        Assert.assertEquals("append-mode combine error: wrong file content", new String(ArrayUtil.sub(pcapngFileBytes, 0, pcapngFileBytes.length - pcapChunkData.length())), new String(data));
        data = new byte[0];
        length = 0;
        chunkCount = 0;
        for (FileChunk fileChunk : pcapngIncludeMetaFileChunks) {
            data = ArrayUtil.addAll(data, fileChunk.getChunk());
            length += fileChunk.getLength();
            chunkCount += fileChunk.getChunkCount();
            Assert.assertEquals("Combine error: wrong metadata", pcapngFileMeta, fileChunk.getMeta());
        }
    }

    @Test
    public void testPipelineDuplicateChunk() throws Exception {
        CollectSink.values.clear();
        long windowTime = 10;
        // Add one duplicate chunk per file.
        messagePackList.add(messagePackList.get(5));
        messagePackList.add(messagePackList.get(15));
        messagePackList.add(messagePackList.get(25));
        messagePackList.sort(Comparator.comparingInt(Arrays::hashCode)); // scramble arrival order
        StreamExecutionEnvironment env = createPipeline(2, new ByteDataSource(messagePackList, 500, windowTime), windowTime, 1);
        env.execute();
        List<FileChunk> fileChunks = CollectSink.values;
        Assert.assertFalse("Combine error: sink produced no output", fileChunks.isEmpty());
        categorizeChunks(fileChunks);
        byte[] data = new byte[0];
        long length = 0;
        long chunkCount = 0;
        int lastChunkFlag = 0;
        emlFileChunks.sort(Comparator.comparingLong(FileChunk::getOffset));
        for (FileChunk fileChunk : emlFileChunks) {
            data = ArrayUtil.addAll(data, fileChunk.getChunk());
            length += fileChunk.getLength();
            chunkCount += fileChunk.getChunkCount();
            if (fileChunk.getLastChunkFlag() == 1) {
                lastChunkFlag = 1;
            }
        }
        // Seek mode deduplicates: the duplicated eml chunk must not change the combined totals.
        Assert.assertEquals("seek-mode combine error: wrong lastChunkFlag", 1, lastChunkFlag);
        Assert.assertEquals("seek-mode combine error: wrong chunkCount", emlChunkCount - 1, chunkCount);
        Assert.assertEquals("seek-mode combine error: wrong file length", emlLength, length);
        data = new byte[0];
        length = 0;
        chunkCount = 0;
        for (FileChunk fileChunk : pcapngFileChunks) {
            data = ArrayUtil.addAll(data, fileChunk.getChunk());
            length += fileChunk.getLength();
            chunkCount += fileChunk.getChunkCount();
        }
        // Append mode keeps duplicates: one extra chunk's worth of data is expected.
        Assert.assertEquals("append-mode combine error: wrong chunkCount", pcapngChunkCount + 1, chunkCount);
        Assert.assertEquals("append-mode combine error: wrong file length", pcapngFileBytes.length + pcapChunkData.length(), length);
        Assert.assertEquals("append-mode combine error: wrong file content", new String(ArrayUtil.addAll(pcapngFileBytes, ArrayUtil.sub(pcapngFileBytes, 0, pcapChunkData.length()))), new String(data));
        data = new byte[0];
        length = 0;
        chunkCount = 0;
        for (FileChunk fileChunk : pcapngIncludeMetaFileChunks) {
            data = ArrayUtil.addAll(data, fileChunk.getChunk());
            length += fileChunk.getLength();
            chunkCount += fileChunk.getChunkCount();
            Assert.assertEquals("Combine error: wrong metadata", pcapngFileMeta, fileChunk.getMeta());
        }
    }
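
    // The pipeline tests above execute as real Flink jobs on this shared two-TaskManager
    // MiniCluster instead of going through operator test harnesses.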
    @ClassRule
    public static MiniClusterWithClientResource flinkCluster =
            new MiniClusterWithClientResource(
                    new MiniClusterResourceConfiguration.Builder()
                            .setNumberSlotsPerTaskManager(1)
                            .setNumberTaskManagers(2)
                            .build());

    // Collects pipeline output into a static list: Flink serializes and redistributes sink
    // instances, so an instance field would not be visible back in the test thread.
    private static class CollectSink implements SinkFunction<FileChunk> {
        private static final List<FileChunk> values = Collections.synchronizedList(new ArrayList<>());

        @Override
        public void invoke(FileChunk value, Context context) {
            values.add(value);
        }
    }
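
    // A throttled test source: it emits one record every `delay` milliseconds, then lingers for
    // `windowTime` seconds so the processing-time window can fire before this bounded job ends;
    // the final window's contents might otherwise be lost when the source exits.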
    private static class ByteDataSource implements SourceFunction<byte[]> {
        private volatile boolean isRunning = true;
        private final List<byte[]> dataList;
        private final long delay;
        private final long windowTime;

        ByteDataSource(List<byte[]> dataList, long delay, long windowTime) {
            this.dataList = dataList;
            this.delay = delay;
            this.windowTime = windowTime;
        }

        @Override
        public void run(SourceContext<byte[]> ctx) throws Exception {
            int index = 0;
            while (isRunning && index < dataList.size()) {
                byte[] record = dataList.get(index);
                ctx.collect(record);
                index++;
                Thread.sleep(delay);
            }
            // After all data is emitted, wait for the window to fire before finishing.
            Thread.sleep(windowTime * 1000);
        }

        @Override
        public void cancel() {
            isRunning = false;
        }
    }

    // Builds the end-to-end pipeline under test; the processing-time window can also fire
    // early via LastChunkTrigger when that option is enabled in the configuration.
    private StreamExecutionEnvironment createPipeline(int parallelism, SourceFunction<byte[]> source, long windowTime, long windowIdleTime) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        List<Trigger<Object, TimeWindow>> triggers = new ArrayList<>();
        triggers.add(ProcessingTimeTrigger.create());
        if (configuration.get(Configs.COMBINER_WINDOW_ENABLE_LAST_CHUNK_TRIGGER)) {
            triggers.add(LastChunkTrigger.create());
        }
        Trigger<Object, TimeWindow> trigger = MultipleTrigger.of(triggers);
        env.addSource(source)
                .map(new ParseMessagePackMapFunction())
                .filter(new FileChunkFilterFunction("", "test"))
                .keyBy(new FileChunkKeySelector(), BasicTypeInfo.STRING_TYPE_INFO)
                .window(TumblingProcessingTimeWindows.of(Time.seconds(windowTime)))
                .trigger(trigger)
                .process(new CombineChunkProcessWindowFunction())
                .addSink(new CollectSink());
        return env;
    }
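
    /**
     * A minimal sketch, not exercised by any test here, of what an event-time variant of
     * createPipeline could look like. It assumes FileChunk timestamps are microseconds, as
     * suggested by the division by 1000 in testCombineChunkProcessWindowFunction; the
     * WatermarkStrategy API is standard Flink, and everything else reuses this project's
     * functions.
     */
    private StreamExecutionEnvironment createEventTimePipeline(int parallelism, SourceFunction<byte[]> source, long windowTime) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        List<Trigger<Object, TimeWindow>> triggers = new ArrayList<>();
        triggers.add(EventTimeTrigger.create());
        triggers.add(LastChunkOrNoDataInTimeTrigger.of(1000));
        env.addSource(source)
                .map(new ParseMessagePackMapFunction())
                // Derive event time from each chunk, tolerating one second of out-of-orderness.
                .assignTimestampsAndWatermarks(
                        org.apache.flink.api.common.eventtime.WatermarkStrategy
                                .<FileChunk>forBoundedOutOfOrderness(java.time.Duration.ofSeconds(1))
                                .withTimestampAssigner((chunk, ts) -> chunk.getTimestamp() / 1000))
                .keyBy(new FileChunkKeySelector(), BasicTypeInfo.STRING_TYPE_INFO)
                .window(TumblingEventTimeWindows.of(Time.seconds(windowTime)))
                .trigger(MultipleTrigger.of(triggers))
                .process(new CombineChunkProcessWindowFunction())
                .addSink(new CollectSink());
        return env;
    }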

    // Buckets chunks into the three per-file lists by file type and presence of metadata.
    private void categorizeChunks(List<FileChunk> fileChunks) {
        for (FileChunk fileChunk : fileChunks) {
            if ("eml".equals(fileChunk.getFileType())) {
                emlFileChunks.add(fileChunk);
            } else if ("pcapng".equals(fileChunk.getFileType()) && fileChunk.getMeta() == null) {
                pcapngFileChunks.add(fileChunk);
            } else if ("pcapng".equals(fileChunk.getFileType()) && fileChunk.getMeta() != null) {
                pcapngIncludeMetaFileChunks.add(fileChunk);
            }
        }
    }

// @Test
// public void testMock() throws Exception {
// ProcessWindowFunctionMock mock = Mockito.mock(ProcessWindowFunctionMock.class);
// InternalIterableProcessWindowFunction<FileChunk, FileChunk, String, TimeWindow> windowFunction = new InternalIterableProcessWindowFunction<>(mock);
// TypeInformation<FileChunk> fileChunkType = PojoTypeInfo.of(FileChunk.class);
// ExecutionConfig execConf = new ExecutionConfig();
// execConf.setParallelism(5);
// StreamingFunctionUtils.setOutputType(windowFunction, fileChunkType, execConf);
// Mockito.verify(mock).setOutputType(fileChunkType, execConf);
// Configuration config = new Configuration();
// windowFunction.open(config);
// Mockito.verify(mock).open(config);
// RuntimeContext rCtx = Mockito.mock(RuntimeContext.class);
// windowFunction.setRuntimeContext(rCtx);
// (Mockito.verify(mock)).setRuntimeContext(rCtx);
// TimeWindow w = Mockito.mock(TimeWindow.class);
// Iterable<FileChunk> i = Mockito.mock(Iterable.class);
// Collector<FileChunk> c = Mockito.mock(Collector.class);
// InternalWindowFunction.InternalWindowContext ctx = Mockito.mock(InternalWindowFunction.InternalWindowContext.class);
// (Mockito.doAnswer(new Answer() {
// public Object answer(InvocationOnMock invocationOnMock) throws Throwable {
// ProcessWindowFunction<FileChunk, FileChunk, String, TimeWindow>.Context c = (ProcessWindowFunction.Context) invocationOnMock.getArguments()[1];
// c.currentProcessingTime();
// c.currentWatermark();
// c.windowState();
// c.globalState();
// return null;
// }
// }).when(mock)).process(Mockito.anyString(), Mockito.anyObject(), Mockito.eq(i), Mockito.eq(c));
// windowFunction.process("", w, ctx, i, c);
// Mockito.verify(ctx).currentProcessingTime();
// Mockito.verify(ctx).currentWatermark();
// Mockito.verify(ctx).windowState();
// Mockito.verify(ctx).globalState();
// windowFunction.close();
// Mockito.verify(mock).close();
// }
//
// private static class ProcessWindowFunctionMock extends ProcessWindowFunction<FileChunk, FileChunk, String, TimeWindow> implements OutputTypeConfigurable<FileChunk> {
// private static final long serialVersionUID = 1L;
//
// private ProcessWindowFunctionMock() {
// }
//
// @Override
// public void process(String s, Context context, Iterable<FileChunk> elements, Collector<FileChunk> out) throws Exception {
// }
//
// public void setOutputType(TypeInformation<FileChunk> outTypeInfo, ExecutionConfig executionConfig) {
// }
// }
// @Test
// public void testCombineChunkProcessWindowFunction() throws Exception {
// List<Trigger<Object, TimeWindow>> triggers = new ArrayList<>();
// triggers.add(EventTimeTrigger.create());
// triggers.add(LastChunkOrNoDataInTimeTrigger.of(1000));
// Trigger<Object, TimeWindow> trigger = MultipleTrigger.of(triggers);
// TypeSerializer<FileChunk> serializer = TypeInformation.of(FileChunk.class).createSerializer(new ExecutionConfig());
// ListStateDescriptor listStateDescriptor = new ListStateDescriptor<>("test-seek-window", serializer);
// CombineChunkProcessWindowFunction processWindowFunction = new CombineChunkProcessWindowFunction(Integer.MAX_VALUE);
// WindowOperator<String, FileChunk, FileChunk, FileChunk, TimeWindow> operator = new WindowOperator<String, FileChunk, FileChunk, FileChunk, TimeWindow>(
// TumblingEventTimeWindows.of(Time.seconds(3)),
// new TimeWindow.Serializer(),
// new FileChunkKeySelector(),
// BasicTypeInfo.STRING_TYPE_INFO.createSerializer(new ExecutionConfig()),
// listStateDescriptor,
// new InternalIterableProcessWindowFunction(processWindowFunction),
// trigger,
// 0L, null);
// KeyedOneInputStreamOperatorTestHarness<String, FileChunk, FileChunk> testHarness = new KeyedOneInputStreamOperatorTestHarness<>(operator, new FileChunkKeySelector(), BasicTypeInfo.STRING_TYPE_INFO);
// testHarness.setup();
// testHarness.open();
// ConcurrentLinkedQueue<Object> expectedOutput = new ConcurrentLinkedQueue<>();
// for (FileChunk file : inputFiles) {
// expectedOutput.add(new StreamRecord<>(file, 2999L));
// }
// long timestamp = 0L;
// for (FileChunk fileChunk : inputFileChunks) {
// testHarness.processElement(fileChunk, timestamp += 10);
// }
// testHarness.setProcessingTime(5000L);
// ConcurrentLinkedQueue<Object> actualOutput = testHarness.getOutput();
// Assert.assertEquals(3, actualOutput.size());
// TestHarnessUtil.assertOutputEqualsSorted("Output was not correct.", expectedOutput, actualOutput, (o1, o2) -> {
// StreamRecord sr0 = (StreamRecord) o1;
// StreamRecord sr1 = (StreamRecord) o2;
// return ((FileChunk) sr0.getValue()).getUuid().compareTo(((FileChunk) sr1.getValue()).getUuid());
// });
// Assert.assertEquals(0, processWindowFunction.combineErrorCounter.getCount());
// Assert.assertEquals(0, processWindowFunction.duplicateChunkCounter.getCount());
// testHarness.close();
// }
}