diff --git a/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_proxy_event_kafka_to_ndc_kafka b/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_proxy_event_kafka_to_ndc_kafka index 7ab5423..1760cf3 100644 --- a/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_proxy_event_kafka_to_ndc_kafka +++ b/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_proxy_event_kafka_to_ndc_kafka @@ -73,11 +73,6 @@ processing_pipelines: parameters: value_expression: recv_time - - function: DOMAIN - lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni] - output_fields: [server_domain] - parameters: - option: FIRST_SIGNIFICANT_SUBDOMAIN - function: BASE64_DECODE_TO_STRING output_fields: [mail_subject] diff --git a/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_session_record_kafka_to_ndc_kafka b/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_session_record_kafka_to_ndc_kafka index b852127..07b7e74 100644 --- a/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_session_record_kafka_to_ndc_kafka +++ b/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_session_record_kafka_to_ndc_kafka @@ -73,12 +73,6 @@ processing_pipelines: parameters: value_expression: recv_time - - function: DOMAIN - lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni] - output_fields: [server_domain] - parameters: - option: FIRST_SIGNIFICANT_SUBDOMAIN - - function: BASE64_DECODE_TO_STRING output_fields: [mail_subject] parameters: diff --git a/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_transaction_record_kafka_to_ndc_kafka b/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_transaction_record_kafka_to_ndc_kafka index 9e7c869..bbc0353 100644 --- a/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_transaction_record_kafka_to_ndc_kafka +++ b/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/datacenter_dt/etl_transaction_record_kafka_to_ndc_kafka @@ -77,12 +77,6 @@ processing_pipelines: parameters: value_expression: recv_time - - function: DOMAIN - lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni] - output_fields: [server_domain] - parameters: - option: FIRST_SIGNIFICANT_SUBDOMAIN - - function: BASE64_DECODE_TO_STRING output_fields: [mail_subject] parameters: diff --git a/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/national_datacenter/etl_voip_record_kafka_to_clickhouse b/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/national_datacenter/etl_voip_record_kafka_to_clickhouse index da87056..da47689 100644 --- a/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/national_datacenter/etl_voip_record_kafka_to_clickhouse +++ b/tsg_olap/installation/flink/groot_stream/multi-datacenter-examples/national_datacenter/etl_voip_record_kafka_to_clickhouse @@ -73,11 +73,6 @@ processing_pipelines: parameters: value_expression: recv_time - - function: DOMAIN - lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni] - output_fields: [server_domain] - parameters: - option: FIRST_SIGNIFICANT_SUBDOMAIN - function: BASE64_DECODE_TO_STRING output_fields: [mail_subject] diff --git a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/dos_event_kafka_to_clickhouse b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/dos_event_kafka_to_clickhouse index b87db67..49a3d1a 100644 --- a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/dos_event_kafka_to_clickhouse +++ b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/dos_event_kafka_to_clickhouse @@ -3,7 +3,7 @@ sources: type: kafka properties: topic: DOS-EVENT - kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094 + kafka.bootstrap.servers: "{{ kafka_source_servers }}" kafka.session.timeout.ms: 60000 kafka.max.poll.records: 3000 kafka.max.partition.fetch.bytes: 31457280 @@ -27,7 +27,7 @@ sinks: clickhouse_sink: type: clickhouse properties: - host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001 + host: "{{ clickhouse_servers }}" table: tsg_galaxy_v3.dos_event_local batch.size: 100000 batch.interval: 30s diff --git a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_datapath_telemetry_record_kafka_to_clickhouse b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_datapath_telemetry_record_kafka_to_clickhouse index 14655a6..8ffbda3 100644 --- a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_datapath_telemetry_record_kafka_to_clickhouse +++ b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_datapath_telemetry_record_kafka_to_clickhouse @@ -3,7 +3,7 @@ sources: type: kafka properties: topic: DATAPATH-TELEMETRY-RECORD - kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094 + kafka.bootstrap.servers: "{{ kafka_source_servers }}" kafka.session.timeout.ms: 60000 kafka.max.poll.records: 3000 kafka.max.partition.fetch.bytes: 31457280 @@ -46,7 +46,7 @@ sinks: clickhouse_sink: type: clickhouse properties: - host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001 + host: "{{ clickhouse_servers }}" table: tsg_galaxy_v3.datapath_telemetry_record_local batch.size: 5000 batch.interval: 30s diff --git a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_proxy_event_kafka_to_clickhouse b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_proxy_event_kafka_to_clickhouse index 39ab825..19e4560 100644 --- a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_proxy_event_kafka_to_clickhouse +++ b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_proxy_event_kafka_to_clickhouse @@ -7,7 +7,7 @@ sources: # watermark_lag: 60 # [number] Watermark Lag, default is 60 properties: topic: PROXY-EVENT - kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094 + kafka.bootstrap.servers: "{{ kafka_source_servers }}" kafka.session.timeout.ms: 60000 kafka.max.poll.records: 3000 kafka.max.partition.fetch.bytes: 31457280 @@ -71,12 +71,6 @@ processing_pipelines: parameters: value_expression: recv_time - - function: DOMAIN - lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni] - output_fields: [server_domain] - parameters: - option: FIRST_SIGNIFICANT_SUBDOMAIN - - function: BASE64_DECODE_TO_STRING output_fields: [mail_subject] parameters: @@ -123,7 +117,7 @@ sinks: clickhouse_sink: type: clickhouse properties: - host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001 + host: "{{ clickhouse_servers }}" table: tsg_galaxy_v3.proxy_event_local batch.size: 100000 batch.interval: 30s diff --git a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_session_record_kafka_to_clickhouse b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_session_record_kafka_to_clickhouse index 643fa48..ada2632 100644 --- a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_session_record_kafka_to_clickhouse +++ b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_session_record_kafka_to_clickhouse @@ -7,7 +7,7 @@ sources: # watermark_lag: 60 # [number] Watermark Lag, default is 60 properties: topic: SESSION-RECORD - kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094 + kafka.bootstrap.servers: "{{ kafka_source_servers }}" kafka.session.timeout.ms: 60000 kafka.max.poll.records: 3000 kafka.max.partition.fetch.bytes: 31457280 @@ -71,12 +71,6 @@ processing_pipelines: parameters: value_expression: recv_time - - function: DOMAIN - lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni] - output_fields: [server_domain] - parameters: - option: FIRST_SIGNIFICANT_SUBDOMAIN - - function: BASE64_DECODE_TO_STRING output_fields: [mail_subject] parameters: @@ -123,7 +117,7 @@ sinks: clickhouse_sink: type: clickhouse properties: - host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001 + host: "{{ clickhouse_servers }}" table: tsg_galaxy_v3.session_record_local batch.size: 100000 batch.interval: 30s diff --git a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_traffic_sketch_metric_kafka_to_clickhouse b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_traffic_sketch_metric_kafka_to_clickhouse new file mode 100644 index 0000000..016650c --- /dev/null +++ b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_traffic_sketch_metric_kafka_to_clickhouse @@ -0,0 +1,92 @@ +sources: + kafka_source: + type: kafka + properties: + topic: TRAFFIC-SKETCH-METRIC + kafka.bootstrap.servers: "{{ kafka_source_servers }}" + kafka.session.timeout.ms: 60000 + kafka.max.poll.records: 3000 + kafka.max.partition.fetch.bytes: 31457280 + kafka.security.protocol: SASL_PLAINTEXT + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + kafka.group.id: etl_traffic_sketch_metric + kafka.auto.offset.reset: latest + kafka.compression.type: none + format: json + +processing_pipelines: + etl_processor: # [object] Processing Pipeline + type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + remove_fields: + output_fields: + functions: # [array of object] Function List + + - function: FLATTEN + lookup_fields: [ fields,tags ] + output_fields: [ ] + parameters: + #prefix: "" + depth: 3 + # delimiter: "." + + - function: RENAME + lookup_fields: [ '' ] + output_fields: [ '' ] + filter: + parameters: + # parent_fields: [tags] + #rename_fields: + # tags: tags + rename_expression: key =string.replace_all(key,'tags.','');key =string.replace_all(key,'fields.','');return key; + + - function: EVAL + output_fields: [ internal_ip ] + parameters: + value_expression: 'direction=Outbound? client_ip : server_ip' + - function: EVAL + output_fields: [ external_ip ] + parameters: + value_expression: 'direction=Outbound? server_ip : client_ip' + + - function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [ timestamp_ms ] + output_fields: [ recv_time ] + parameters: + precision: seconds + + - function: SNOWFLAKE_ID + lookup_fields: [ '' ] + output_fields: [ log_id ] + filter: + parameters: + data_center_id_num: 1 + + +sinks: + clickhouse_sink: + type: clickhouse + properties: + host: "{{ clickhouse_servers }}" + table: tsg_galaxy_v3.traffic_sketch_metric_local + batch.size: 100000 + batch.interval: 30s + connection.user: e54c9568586180eede1506eecf3574e9 + connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e + + +application: + + env: # [object] Environment Variables + name: etl_traffic_sketch_metric # [string] Job Name + shade.identifier: aes + pipeline: + object-reuse: true # [boolean] Object Reuse, default is false + topology: + - name: kafka_source + downstream: [etl_processor] + - name: etl_processor + downstream: [clickhouse_sink] + - name: clickhouse_sink + + diff --git a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_transaction_record_kafka_to_clickhouse b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_transaction_record_kafka_to_clickhouse index 4d28714..d4dbd87 100644 --- a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_transaction_record_kafka_to_clickhouse +++ b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_transaction_record_kafka_to_clickhouse @@ -7,7 +7,7 @@ sources: # watermark_lag: 60 # [number] Watermark Lag, default is 60 properties: topic: TRANSACTION-RECORD - kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094 + kafka.bootstrap.servers: "{{ kafka_source_servers }}" kafka.session.timeout.ms: 60000 kafka.max.poll.records: 3000 kafka.max.partition.fetch.bytes: 31457280 @@ -71,12 +71,6 @@ processing_pipelines: parameters: value_expression: recv_time - - function: DOMAIN - lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni] - output_fields: [server_domain] - parameters: - option: FIRST_SIGNIFICANT_SUBDOMAIN - - function: BASE64_DECODE_TO_STRING output_fields: [mail_subject] parameters: @@ -123,7 +117,7 @@ sinks: clickhouse_sink: type: clickhouse properties: - host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001 + host: "{{ clickhouse_servers }}" table: tsg_galaxy_v3.transaction_record_local batch.size: 100000 batch.interval: 30s diff --git a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_voip_record_kafka_to_clickhouse b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_voip_record_kafka_to_clickhouse index 90d3179..d798e94 100644 --- a/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_voip_record_kafka_to_clickhouse +++ b/tsg_olap/installation/flink/groot_stream/single-cluster-examples/etl_voip_record_kafka_to_clickhouse @@ -7,7 +7,7 @@ sources: # watermark_lag: 60 # [number] Watermark Lag, default is 60 properties: topic: VOIP-CONVERSATION-RECORD - kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094 + kafka.bootstrap.servers: "{{ kafka_source_servers }}" kafka.session.timeout.ms: 60000 kafka.max.poll.records: 3000 kafka.max.partition.fetch.bytes: 31457280 @@ -71,12 +71,6 @@ processing_pipelines: parameters: value_expression: recv_time - - function: DOMAIN - lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni] - output_fields: [server_domain] - parameters: - option: FIRST_SIGNIFICANT_SUBDOMAIN - - function: BASE64_DECODE_TO_STRING output_fields: [mail_subject] parameters: @@ -123,7 +117,7 @@ sinks: clickhouse_sink: type: clickhouse properties: - host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001 + host: "{{ clickhouse_servers }}" table: tsg_galaxy_v3.voip_record_local batch.size: 100000 batch.interval: 30s diff --git a/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/datacenter_dt/etl_proxy_event_kafka_to_ndc_kafka b/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/datacenter_dt/etl_proxy_event_kafka_to_ndc_kafka new file mode 100644 index 0000000..1760cf3 --- /dev/null +++ b/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/datacenter_dt/etl_proxy_event_kafka_to_ndc_kafka @@ -0,0 +1,152 @@ +sources: + kafka_source: + type: kafka + properties: + topic: PROXY-EVENT + kafka.bootstrap.servers: "{{ kafka_source_servers }}" + kafka.session.timeout.ms: 60000 + kafka.max.poll.records: 3000 + kafka.max.partition.fetch.bytes: 31457280 + kafka.security.protocol: SASL_PLAINTEXT + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + kafka.group.id: etl_proxy_event_kafka_to_ndc_kafka + kafka.auto.offset.reset: latest + format: json + +processing_pipelines: + etl_processor: # [object] Processing Pipeline + type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + remove_fields: + output_fields: + properties: + key: value + functions: # [array of object] Function List + + - function: ASN_LOOKUP + lookup_fields: [server_ip] + output_fields: [server_asn] + parameters: + option: IP_TO_ASN + kb_name: tsg_ip_asn + + - function: ASN_LOOKUP + lookup_fields: [client_ip] + output_fields: [client_asn] + parameters: + option: IP_TO_ASN + kb_name: tsg_ip_asn + + - function: SNOWFLAKE_ID + lookup_fields: [''] + output_fields: [log_id] + parameters: + data_center_id_num: 1 + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [data_center] + filter: + parameters: + value_expression: $.tags[?(@.tag=='data_center')][0].value + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [device_group] + filter: + parameters: + value_expression: $.tags[?(@.tag=='device_group')][0].value + + - function: CURRENT_UNIX_TIMESTAMP + output_fields: [processing_time] + parameters: + precision: seconds + + - function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [__timestamp] + output_fields: [recv_time] + parameters: + precision: seconds + + - function: EVAL + output_fields: [ingestion_time] + parameters: + value_expression: recv_time + + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_subject] + parameters: + value_field: mail_subject + charset_field: mail_subject_charset + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_attachment_name] + parameters: + value_field: mail_attachment_name + charset_field: mail_attachment_name_charset + + - function: PATH_COMBINE + lookup_fields: [rtp_pcap_path] + output_fields: [rtp_pcap_path] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path] + + - function: PATH_COMBINE + lookup_fields: [http_request_body] + output_fields: [http_request_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body] + + - function: PATH_COMBINE + lookup_fields: [http_response_body] + output_fields: [http_response_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body] + + - function: PATH_COMBINE + lookup_fields: [mail_eml_file] + output_fields: [mail_eml_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file] + + - function: PATH_COMBINE + lookup_fields: [packet_capture_file] + output_fields: [packet_capture_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file] + +sinks: + kafka_sink: + type : kafka + properties: + topic: PROXY-EVENT-PROCESSED + kafka.bootstrap.servers: "{{ kafka_sink_servers }}" + kafka.retries: 0 + kafka.linger.ms: 10 + kafka.request.timeout.ms: 30000 + kafka.batch.size: 262144 + kafka.buffer.memory: 134217728 + kafka.max.request.size: 10485760 + kafka.compression.type: snappy + kafka.security.protocol: SASL_PLAINTEXT + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + format: json + + +application: + + env: # [object] Environment Variables + name: etl_proxy_event_kafka_to_ndc_kafka # [string] Job Name + shade.identifier: aes + pipeline: + object-reuse: true # [boolean] Object Reuse, default is false + topology: + - name: kafka_source + downstream: [etl_processor] + - name: etl_processor + downstream: [kafka_sink] + - name: kafka_sink + + diff --git a/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/datacenter_dt/etl_session_record_kafka_to_ndc_kafka b/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/datacenter_dt/etl_session_record_kafka_to_ndc_kafka new file mode 100644 index 0000000..07b7e74 --- /dev/null +++ b/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/datacenter_dt/etl_session_record_kafka_to_ndc_kafka @@ -0,0 +1,151 @@ +sources: + kafka_source: + type: kafka + properties: + topic: SESSION-RECORD + kafka.bootstrap.servers: "{{ kafka_source_servers }}" + kafka.session.timeout.ms: 60000 + kafka.max.poll.records: 3000 + kafka.max.partition.fetch.bytes: 31457280 + kafka.security.protocol: SASL_PLAINTEXT + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + kafka.group.id: etl_session_record_kafka_to_ndc_kafka + kafka.auto.offset.reset: latest + format: json + +processing_pipelines: + etl_processor: # [object] Processing Pipeline + type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + remove_fields: + output_fields: + properties: + key: value + functions: # [array of object] Function List + + - function: ASN_LOOKUP + lookup_fields: [server_ip] + output_fields: [server_asn] + parameters: + option: IP_TO_ASN + kb_name: tsg_ip_asn + + - function: ASN_LOOKUP + lookup_fields: [client_ip] + output_fields: [client_asn] + parameters: + option: IP_TO_ASN + kb_name: tsg_ip_asn + + - function: SNOWFLAKE_ID + lookup_fields: [''] + output_fields: [log_id] + parameters: + data_center_id_num: 1 + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [data_center] + filter: + parameters: + value_expression: $.tags[?(@.tag=='data_center')][0].value + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [device_group] + filter: + parameters: + value_expression: $.tags[?(@.tag=='device_group')][0].value + + - function: CURRENT_UNIX_TIMESTAMP + output_fields: [processing_time] + parameters: + precision: seconds + + - function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [__timestamp] + output_fields: [recv_time] + parameters: + precision: seconds + + - function: EVAL + output_fields: [ingestion_time] + parameters: + value_expression: recv_time + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_subject] + parameters: + value_field: mail_subject + charset_field: mail_subject_charset + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_attachment_name] + parameters: + value_field: mail_attachment_name + charset_field: mail_attachment_name_charset + + - function: PATH_COMBINE + lookup_fields: [rtp_pcap_path] + output_fields: [rtp_pcap_path] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path] + + - function: PATH_COMBINE + lookup_fields: [http_request_body] + output_fields: [http_request_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body] + + - function: PATH_COMBINE + lookup_fields: [http_response_body] + output_fields: [http_response_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body] + + - function: PATH_COMBINE + lookup_fields: [mail_eml_file] + output_fields: [mail_eml_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file] + + - function: PATH_COMBINE + lookup_fields: [packet_capture_file] + output_fields: [packet_capture_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file] + +sinks: + kafka_sink: + type : kafka + properties: + topic: SESSION-RECORD-PROCESSED + kafka.bootstrap.servers: "{{ kafka_sink_servers }}" + kafka.retries: 0 + kafka.linger.ms: 10 + kafka.request.timeout.ms: 30000 + kafka.batch.size: 262144 + kafka.buffer.memory: 134217728 + kafka.max.request.size: 10485760 + kafka.compression.type: snappy + kafka.security.protocol: SASL_PLAINTEXT + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + format: json + + +application: + + env: # [object] Environment Variables + name: etl_session_record_kafka_to_ndc_kafka # [string] Job Name + shade.identifier: aes + pipeline: + object-reuse: true # [boolean] Object Reuse, default is false + topology: + - name: kafka_source + downstream: [etl_processor] + - name: etl_processor + downstream: [kafka_sink] + - name: kafka_sink + + diff --git a/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/datacenter_dt/etl_transaction_record_kafka_to_ndc_kafka b/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/datacenter_dt/etl_transaction_record_kafka_to_ndc_kafka new file mode 100644 index 0000000..bbc0353 --- /dev/null +++ b/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/datacenter_dt/etl_transaction_record_kafka_to_ndc_kafka @@ -0,0 +1,155 @@ +sources: + kafka_source: + type: kafka + # fields: # [array of object] Field List, if not set, all fields(Map) will be output. + # watermark_timestamp: common_recv_time # [string] Watermark Field Name + # watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms + # watermark_lag: 60 # [number] Watermark Lag, default is 60 + properties: + topic: TRANSACTION-RECORD + kafka.bootstrap.servers: "{{ kafka_source_servers }}" + kafka.session.timeout.ms: 60000 + kafka.max.poll.records: 3000 + kafka.max.partition.fetch.bytes: 31457280 + kafka.security.protocol: SASL_PLAINTEXT + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + kafka.group.id: etl_transaction_record_kafka_to_ndc_kafka + kafka.auto.offset.reset: latest + format: json + +processing_pipelines: + etl_processor: # [object] Processing Pipeline + type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + remove_fields: + output_fields: + properties: + key: value + functions: # [array of object] Function List + + - function: ASN_LOOKUP + lookup_fields: [server_ip] + output_fields: [server_asn] + parameters: + option: IP_TO_ASN + kb_name: tsg_ip_asn + + - function: ASN_LOOKUP + lookup_fields: [client_ip] + output_fields: [client_asn] + parameters: + option: IP_TO_ASN + kb_name: tsg_ip_asn + + - function: SNOWFLAKE_ID + lookup_fields: [''] + output_fields: [log_id] + parameters: + data_center_id_num: 1 + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [data_center] + filter: + parameters: + value_expression: $.tags[?(@.tag=='data_center')][0].value + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [device_group] + filter: + parameters: + value_expression: $.tags[?(@.tag=='device_group')][0].value + + - function: CURRENT_UNIX_TIMESTAMP + output_fields: [processing_time] + parameters: + precision: seconds + + - function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [__timestamp] + output_fields: [recv_time] + parameters: + precision: seconds + + - function: EVAL + output_fields: [ingestion_time] + parameters: + value_expression: recv_time + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_subject] + parameters: + value_field: mail_subject + charset_field: mail_subject_charset + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_attachment_name] + parameters: + value_field: mail_attachment_name + charset_field: mail_attachment_name_charset + + - function: PATH_COMBINE + lookup_fields: [rtp_pcap_path] + output_fields: [rtp_pcap_path] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path] + + - function: PATH_COMBINE + lookup_fields: [http_request_body] + output_fields: [http_request_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body] + + - function: PATH_COMBINE + lookup_fields: [http_response_body] + output_fields: [http_response_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body] + + - function: PATH_COMBINE + lookup_fields: [mail_eml_file] + output_fields: [mail_eml_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file] + + - function: PATH_COMBINE + lookup_fields: [packet_capture_file] + output_fields: [packet_capture_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file] + +sinks: + kafka_sink: + type : kafka + properties: + topic: TRANSACTION-RECORD-PROCESSED + kafka.bootstrap.servers: "{{ kafka_sink_servers }}" + kafka.retries: 0 + kafka.linger.ms: 10 + kafka.request.timeout.ms: 30000 + kafka.batch.size: 262144 + kafka.buffer.memory: 134217728 + kafka.max.request.size: 10485760 + kafka.compression.type: snappy + kafka.security.protocol: SASL_PLAINTEXT + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + format: json + + +application: + + env: # [object] Environment Variables + name: etl_transaction_record_kafka_to_ndc_kafka # [string] Job Name + shade.identifier: aes + pipeline: + object-reuse: true # [boolean] Object Reuse, default is false + topology: + - name: kafka_source + downstream: [etl_processor] + - name: etl_processor + downstream: [kafka_sink] + - name: kafka_sink + + diff --git a/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/national_datacenter/etl_voip_record_kafka_to_clickhouse b/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/national_datacenter/etl_voip_record_kafka_to_clickhouse new file mode 100644 index 0000000..da47689 --- /dev/null +++ b/tsg_olap/upgrade/TSG-24.06/groot_stream/multi-datacenter-examples/national_datacenter/etl_voip_record_kafka_to_clickhouse @@ -0,0 +1,118 @@ +sources: + kafka_source: + type: kafka + properties: + topic: VOIP-CONVERSATION-RECORD + kafka.bootstrap.servers: "{{ kafka_source_servers }}" + kafka.session.timeout.ms: 60000 + kafka.max.poll.records: 3000 + kafka.max.partition.fetch.bytes: 31457280 + kafka.security.protocol: SASL_PLAINTEXT + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + kafka.group.id: etl_voip_record_kafka_to_clickhouse + kafka.auto.offset.reset: latest + format: json + +processing_pipelines: + etl_processor: # [object] Processing Pipeline + type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + remove_fields: + output_fields: + properties: + key: value + functions: # [array of object] Function List + + - function: ASN_LOOKUP + lookup_fields: [server_ip] + output_fields: [server_asn] + parameters: + option: IP_TO_ASN + kb_name: tsg_ip_asn + + - function: ASN_LOOKUP + lookup_fields: [client_ip] + output_fields: [client_asn] + parameters: + option: IP_TO_ASN + kb_name: tsg_ip_asn + + - function: SNOWFLAKE_ID + lookup_fields: [''] + output_fields: [log_id] + parameters: + data_center_id_num: 1 + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [data_center] + filter: + parameters: + value_expression: $.tags[?(@.tag=='data_center')][0].value + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [device_group] + filter: + parameters: + value_expression: $.tags[?(@.tag=='device_group')][0].value + + - function: CURRENT_UNIX_TIMESTAMP + output_fields: [processing_time] + parameters: + precision: seconds + + - function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [__timestamp] + output_fields: [recv_time] + parameters: + precision: seconds + + - function: EVAL + output_fields: [ingestion_time] + parameters: + value_expression: recv_time + + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_subject] + parameters: + value_field: mail_subject + charset_field: mail_subject_charset + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_attachment_name] + parameters: + value_field: mail_attachment_name + charset_field: mail_attachment_name_charset + + +sinks: + clickhouse_sink: + type: clickhouse + properties: + host: "{{ clickhouse_servers }}" + table: tsg_galaxy_v3.voip_record_local + batch.size: 100000 + batch.interval: 30s + connection.user: e54c9568586180eede1506eecf3574e9 + connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e + connection.connect_timeout: 30 + connection.query_timeout: 300 + + +application: + + env: # [object] Environment Variables + name: etl_voip_record_kafka_to_clickhouse # [string] Job Name + shade.identifier: aes + pipeline: + object-reuse: true # [boolean] Object Reuse, default is false + topology: + - name: kafka_source + downstream: [etl_processor] + - name: etl_processor + downstream: [clickhouse_sink] + - name: clickhouse_sink + + diff --git a/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_proxy_event_kafka_to_clickhouse b/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_proxy_event_kafka_to_clickhouse new file mode 100644 index 0000000..19e4560 --- /dev/null +++ b/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_proxy_event_kafka_to_clickhouse @@ -0,0 +1,142 @@ +sources: + kafka_source: + type: kafka + # fields: # [array of object] Field List, if not set, all fields(Map) will be output. + # watermark_timestamp: common_recv_time # [string] Watermark Field Name + # watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms + # watermark_lag: 60 # [number] Watermark Lag, default is 60 + properties: + topic: PROXY-EVENT + kafka.bootstrap.servers: "{{ kafka_source_servers }}" + kafka.session.timeout.ms: 60000 + kafka.max.poll.records: 3000 + kafka.max.partition.fetch.bytes: 31457280 + kafka.security.protocol: SASL_PLAINTEXT + kafka.ssl.keystore.location: + kafka.ssl.keystore.password: + kafka.ssl.truststore.location: + kafka.ssl.truststore.password: + kafka.ssl.key.password: + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + kafka.buffer.memory: + kafka.group.id: etl_proxy_event_kafka_to_clickhouse-20231221 + kafka.auto.offset.reset: latest + kafka.max.request.size: + kafka.compression.type: none + format: json + +processing_pipelines: + etl_processor: # [object] Processing Pipeline + type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + remove_fields: + output_fields: + properties: + key: value + functions: # [array of object] Function List + + - function: SNOWFLAKE_ID + lookup_fields: [''] + output_fields: [log_id] + parameters: + data_center_id_num: 1 + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [data_center] + filter: + parameters: + value_expression: $.tags[?(@.tag=='data_center')][0].value + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [device_group] + filter: + parameters: + value_expression: $.tags[?(@.tag=='device_group')][0].value + + - function: CURRENT_UNIX_TIMESTAMP + output_fields: [processing_time] + parameters: + precision: seconds + + - function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [__timestamp] + output_fields: [recv_time] + parameters: + precision: seconds + + - function: EVAL + output_fields: [ingestion_time] + parameters: + value_expression: recv_time + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_subject] + parameters: + value_field: mail_subject + charset_field: mail_subject_charset + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_attachment_name] + parameters: + value_field: mail_attachment_name + charset_field: mail_attachment_name_charset + + - function: PATH_COMBINE + lookup_fields: [rtp_pcap_path] + output_fields: [rtp_pcap_path] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path] + + - function: PATH_COMBINE + lookup_fields: [http_request_body] + output_fields: [http_request_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body] + + - function: PATH_COMBINE + lookup_fields: [http_response_body] + output_fields: [http_response_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body] + + - function: PATH_COMBINE + lookup_fields: [mail_eml_file] + output_fields: [mail_eml_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file] + + - function: PATH_COMBINE + lookup_fields: [packet_capture_file] + output_fields: [packet_capture_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file] + +sinks: + clickhouse_sink: + type: clickhouse + properties: + host: "{{ clickhouse_servers }}" + table: tsg_galaxy_v3.proxy_event_local + batch.size: 100000 + batch.interval: 30s + connection.user: e54c9568586180eede1506eecf3574e9 + connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e + + +application: + + env: # [object] Environment Variables + name: etl_proxy_event_kafka_to_clickhouse # [string] Job Name + shade.identifier: aes + pipeline: + object-reuse: true # [boolean] Object Reuse, default is false + topology: + - name: kafka_source + downstream: [etl_processor] + - name: etl_processor + downstream: [clickhouse_sink] + - name: clickhouse_sink + + diff --git a/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_session_record_kafka_to_clickhouse b/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_session_record_kafka_to_clickhouse new file mode 100644 index 0000000..ada2632 --- /dev/null +++ b/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_session_record_kafka_to_clickhouse @@ -0,0 +1,142 @@ +sources: + kafka_source: + type: kafka + # fields: # [array of object] Field List, if not set, all fields(Map) will be output. + # watermark_timestamp: common_recv_time # [string] Watermark Field Name + # watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms + # watermark_lag: 60 # [number] Watermark Lag, default is 60 + properties: + topic: SESSION-RECORD + kafka.bootstrap.servers: "{{ kafka_source_servers }}" + kafka.session.timeout.ms: 60000 + kafka.max.poll.records: 3000 + kafka.max.partition.fetch.bytes: 31457280 + kafka.security.protocol: SASL_PLAINTEXT + kafka.ssl.keystore.location: + kafka.ssl.keystore.password: + kafka.ssl.truststore.location: + kafka.ssl.truststore.password: + kafka.ssl.key.password: + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + kafka.buffer.memory: + kafka.group.id: etl_session_record_kafka_to_clickhouse-20230125 + kafka.auto.offset.reset: latest + kafka.max.request.size: + kafka.compression.type: none + format: json + +processing_pipelines: + etl_processor: # [object] Processing Pipeline + type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + remove_fields: + output_fields: + properties: + key: value + functions: # [array of object] Function List + + - function: SNOWFLAKE_ID + lookup_fields: [''] + output_fields: [log_id] + parameters: + data_center_id_num: 1 + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [data_center] + filter: + parameters: + value_expression: $.tags[?(@.tag=='data_center')][0].value + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [device_group] + filter: + parameters: + value_expression: $.tags[?(@.tag=='device_group')][0].value + + - function: CURRENT_UNIX_TIMESTAMP + output_fields: [processing_time] + parameters: + precision: seconds + + - function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [__timestamp] + output_fields: [recv_time] + parameters: + precision: seconds + + - function: EVAL + output_fields: [ingestion_time] + parameters: + value_expression: recv_time + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_subject] + parameters: + value_field: mail_subject + charset_field: mail_subject_charset + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_attachment_name] + parameters: + value_field: mail_attachment_name + charset_field: mail_attachment_name_charset + + - function: PATH_COMBINE + lookup_fields: [rtp_pcap_path] + output_fields: [rtp_pcap_path] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path] + + - function: PATH_COMBINE + lookup_fields: [http_request_body] + output_fields: [http_request_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body] + + - function: PATH_COMBINE + lookup_fields: [http_response_body] + output_fields: [http_response_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body] + + - function: PATH_COMBINE + lookup_fields: [mail_eml_file] + output_fields: [mail_eml_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file] + + - function: PATH_COMBINE + lookup_fields: [packet_capture_file] + output_fields: [packet_capture_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file] + +sinks: + clickhouse_sink: + type: clickhouse + properties: + host: "{{ clickhouse_servers }}" + table: tsg_galaxy_v3.session_record_local + batch.size: 100000 + batch.interval: 30s + connection.user: e54c9568586180eede1506eecf3574e9 + connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e + + +application: + + env: # [object] Environment Variables + name: etl_session_record_kafka_to_clickhouse # [string] Job Name + shade.identifier: aes + pipeline: + object-reuse: true # [boolean] Object Reuse, default is false + topology: + - name: kafka_source + downstream: [etl_processor] + - name: etl_processor + downstream: [clickhouse_sink] + - name: clickhouse_sink + + diff --git a/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_traffic_sketch_metric_kafka_to_clickhouse b/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_traffic_sketch_metric_kafka_to_clickhouse new file mode 100644 index 0000000..016650c --- /dev/null +++ b/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_traffic_sketch_metric_kafka_to_clickhouse @@ -0,0 +1,92 @@ +sources: + kafka_source: + type: kafka + properties: + topic: TRAFFIC-SKETCH-METRIC + kafka.bootstrap.servers: "{{ kafka_source_servers }}" + kafka.session.timeout.ms: 60000 + kafka.max.poll.records: 3000 + kafka.max.partition.fetch.bytes: 31457280 + kafka.security.protocol: SASL_PLAINTEXT + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + kafka.group.id: etl_traffic_sketch_metric + kafka.auto.offset.reset: latest + kafka.compression.type: none + format: json + +processing_pipelines: + etl_processor: # [object] Processing Pipeline + type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + remove_fields: + output_fields: + functions: # [array of object] Function List + + - function: FLATTEN + lookup_fields: [ fields,tags ] + output_fields: [ ] + parameters: + #prefix: "" + depth: 3 + # delimiter: "." + + - function: RENAME + lookup_fields: [ '' ] + output_fields: [ '' ] + filter: + parameters: + # parent_fields: [tags] + #rename_fields: + # tags: tags + rename_expression: key =string.replace_all(key,'tags.','');key =string.replace_all(key,'fields.','');return key; + + - function: EVAL + output_fields: [ internal_ip ] + parameters: + value_expression: 'direction=Outbound? client_ip : server_ip' + - function: EVAL + output_fields: [ external_ip ] + parameters: + value_expression: 'direction=Outbound? server_ip : client_ip' + + - function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [ timestamp_ms ] + output_fields: [ recv_time ] + parameters: + precision: seconds + + - function: SNOWFLAKE_ID + lookup_fields: [ '' ] + output_fields: [ log_id ] + filter: + parameters: + data_center_id_num: 1 + + +sinks: + clickhouse_sink: + type: clickhouse + properties: + host: "{{ clickhouse_servers }}" + table: tsg_galaxy_v3.traffic_sketch_metric_local + batch.size: 100000 + batch.interval: 30s + connection.user: e54c9568586180eede1506eecf3574e9 + connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e + + +application: + + env: # [object] Environment Variables + name: etl_traffic_sketch_metric # [string] Job Name + shade.identifier: aes + pipeline: + object-reuse: true # [boolean] Object Reuse, default is false + topology: + - name: kafka_source + downstream: [etl_processor] + - name: etl_processor + downstream: [clickhouse_sink] + - name: clickhouse_sink + + diff --git a/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_transaction_record_kafka_to_clickhouse b/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_transaction_record_kafka_to_clickhouse new file mode 100644 index 0000000..d4dbd87 --- /dev/null +++ b/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_transaction_record_kafka_to_clickhouse @@ -0,0 +1,140 @@ +sources: + kafka_source: + type: kafka + # fields: # [array of object] Field List, if not set, all fields(Map) will be output. + # watermark_timestamp: common_recv_time # [string] Watermark Field Name + # watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms + # watermark_lag: 60 # [number] Watermark Lag, default is 60 + properties: + topic: TRANSACTION-RECORD + kafka.bootstrap.servers: "{{ kafka_source_servers }}" + kafka.session.timeout.ms: 60000 + kafka.max.poll.records: 3000 + kafka.max.partition.fetch.bytes: 31457280 + kafka.security.protocol: SASL_PLAINTEXT + kafka.ssl.keystore.location: + kafka.ssl.keystore.password: + kafka.ssl.truststore.location: + kafka.ssl.truststore.password: + kafka.ssl.key.password: + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + kafka.buffer.memory: + kafka.group.id: etl_transaction_record_kafka_to_clickhouse-20240308 + kafka.auto.offset.reset: latest + kafka.max.request.size: + kafka.compression.type: none + format: json + +processing_pipelines: + etl_processor: # [object] Processing Pipeline + type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + remove_fields: + output_fields: + properties: + key: value + functions: # [array of object] Function List + + - function: SNOWFLAKE_ID + lookup_fields: [''] + output_fields: [log_id] + parameters: + data_center_id_num: 1 + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [data_center] + filter: + parameters: + value_expression: $.tags[?(@.tag=='data_center')][0].value + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [device_group] + filter: + parameters: + value_expression: $.tags[?(@.tag=='device_group')][0].value + + - function: CURRENT_UNIX_TIMESTAMP + output_fields: [processing_time] + parameters: + precision: seconds + + - function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [__timestamp] + output_fields: [recv_time] + parameters: + precision: seconds + + - function: EVAL + output_fields: [ingestion_time] + parameters: + value_expression: recv_time + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_subject] + parameters: + value_field: mail_subject + charset_field: mail_subject_charset + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_attachment_name] + parameters: + value_field: mail_attachment_name + charset_field: mail_attachment_name_charset + + - function: PATH_COMBINE + lookup_fields: [rtp_pcap_path] + output_fields: [rtp_pcap_path] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path] + + - function: PATH_COMBINE + lookup_fields: [http_request_body] + output_fields: [http_request_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body] + + - function: PATH_COMBINE + lookup_fields: [http_response_body] + output_fields: [http_response_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body] + + - function: PATH_COMBINE + lookup_fields: [mail_eml_file] + output_fields: [mail_eml_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file] + + - function: PATH_COMBINE + lookup_fields: [packet_capture_file] + output_fields: [packet_capture_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file] + +sinks: + clickhouse_sink: + type: clickhouse + properties: + host: "{{ clickhouse_servers }}" + table: tsg_galaxy_v3.transaction_record_local + batch.size: 100000 + batch.interval: 30s + connection.user: e54c9568586180eede1506eecf3574e9 + connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e + + +application: + + env: # [object] Environment Variables + name: etl_transaction_record_kafka_to_clickhouse # [string] Job Name + shade.identifier: aes + pipeline: + object-reuse: true # [boolean] Object Reuse, default is false + topology: + - name: kafka_source + downstream: [etl_processor] + - name: etl_processor + downstream: [clickhouse_sink] + - name: clickhouse_sink diff --git a/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_voip_record_kafka_to_clickhouse b/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_voip_record_kafka_to_clickhouse new file mode 100644 index 0000000..d798e94 --- /dev/null +++ b/tsg_olap/upgrade/TSG-24.06/groot_stream/single-cluster-examples/etl_voip_record_kafka_to_clickhouse @@ -0,0 +1,142 @@ +sources: + kafka_source: + type: kafka + # fields: # [array of object] Field List, if not set, all fields(Map) will be output. + # watermark_timestamp: common_recv_time # [string] Watermark Field Name + # watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms + # watermark_lag: 60 # [number] Watermark Lag, default is 60 + properties: + topic: VOIP-CONVERSATION-RECORD + kafka.bootstrap.servers: "{{ kafka_source_servers }}" + kafka.session.timeout.ms: 60000 + kafka.max.poll.records: 3000 + kafka.max.partition.fetch.bytes: 31457280 + kafka.security.protocol: SASL_PLAINTEXT + kafka.ssl.keystore.location: + kafka.ssl.keystore.password: + kafka.ssl.truststore.location: + kafka.ssl.truststore.password: + kafka.ssl.key.password: + kafka.sasl.mechanism: PLAIN + kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817 + kafka.buffer.memory: + kafka.group.id: etl_voip_record_kafka_to_clickhouse-20231221 + kafka.auto.offset.reset: latest + kafka.max.request.size: + kafka.compression.type: none + format: json + +processing_pipelines: + etl_processor: # [object] Processing Pipeline + type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + remove_fields: + output_fields: + properties: + key: value + functions: # [array of object] Function List + + - function: SNOWFLAKE_ID + lookup_fields: [''] + output_fields: [log_id] + parameters: + data_center_id_num: 1 + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [data_center] + filter: + parameters: + value_expression: $.tags[?(@.tag=='data_center')][0].value + + - function: JSON_EXTRACT + lookup_fields: [device_tag] + output_fields: [device_group] + filter: + parameters: + value_expression: $.tags[?(@.tag=='device_group')][0].value + + - function: CURRENT_UNIX_TIMESTAMP + output_fields: [processing_time] + parameters: + precision: seconds + + - function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [__timestamp] + output_fields: [recv_time] + parameters: + precision: seconds + + - function: EVAL + output_fields: [ingestion_time] + parameters: + value_expression: recv_time + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_subject] + parameters: + value_field: mail_subject + charset_field: mail_subject_charset + + - function: BASE64_DECODE_TO_STRING + output_fields: [mail_attachment_name] + parameters: + value_field: mail_attachment_name + charset_field: mail_attachment_name_charset + + - function: PATH_COMBINE + lookup_fields: [rtp_pcap_path] + output_fields: [rtp_pcap_path] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path] + + - function: PATH_COMBINE + lookup_fields: [http_request_body] + output_fields: [http_request_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body] + + - function: PATH_COMBINE + lookup_fields: [http_response_body] + output_fields: [http_response_body] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body] + + - function: PATH_COMBINE + lookup_fields: [mail_eml_file] + output_fields: [mail_eml_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file] + + - function: PATH_COMBINE + lookup_fields: [packet_capture_file] + output_fields: [packet_capture_file] + parameters: + path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file] + +sinks: + clickhouse_sink: + type: clickhouse + properties: + host: "{{ clickhouse_servers }}" + table: tsg_galaxy_v3.voip_record_local + batch.size: 100000 + batch.interval: 30s + connection.user: e54c9568586180eede1506eecf3574e9 + connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e + + +application: + + env: # [object] Environment Variables + name: etl_voip_record_kafka_to_clickhouse # [string] Job Name + shade.identifier: aes + pipeline: + object-reuse: true # [boolean] Object Reuse, default is false + topology: + - name: kafka_source + downstream: [etl_processor] + - name: etl_processor + downstream: [clickhouse_sink] + - name: clickhouse_sink + +