增加具体任务配置 (Add concrete task configurations)

This commit is contained in:
wangkuan
2024-04-10 16:37:04 +08:00
parent 6930eaa502
commit a9d7332acf
17 changed files with 1969 additions and 0 deletions

View File

@@ -0,0 +1,49 @@
sources:
kafka_source:
type: kafka
properties:
topic: DOS-EVENT
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: dos_event_kafka_to_clickhouse-20231221
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.dos_event_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env:
name: dos_event_kafka_to_clickhouse
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,148 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
# watermark_timestamp: common_recv_time # [string] Watermark Field Name
# watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms
# watermark_lag: 60 # [number] Watermark Lag, default is 60
properties:
topic: PROXY-EVENT
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: etl_proxy_event_kafka_to_clickhouse-20231221
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.proxy_event_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_proxy_event_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,148 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
# watermark_timestamp: common_recv_time # [string] Watermark Field Name
# watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms
# watermark_lag: 60 # [number] Watermark Lag, default is 60
properties:
topic: SESSION-RECORD
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: etl_session_record_kafka_to_clickhouse-20230125
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.session_record_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_session_record_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,146 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
# watermark_timestamp: common_recv_time # [string] Watermark Field Name
# watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms
# watermark_lag: 60 # [number] Watermark Lag, default is 60
properties:
topic: TRANSACTION-RECORD
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: etl_transaction_record_kafka_to_clickhouse-20240308
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.transaction_record_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_transaction_record_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,148 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
# watermark_timestamp: common_recv_time # [string] Watermark Field Name
# watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms
# watermark_lag: 60 # [number] Watermark Lag, default is 60
properties:
topic: VOIP-CONVERSATION-RECORD
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: etl_voip_record_kafka_to_clickhouse-20231221
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.voip_record_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_voip_record_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,49 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
properties:
topic: SESSION-RECORD
kafka.bootstrap.servers: "{{ kafka_source_servers }}"
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SSL
kafka.ssl.endpoint.identification.algorithm: ""
kafka.ssl.keystore.location: /data/tsg/olap/flink/topology/data/keystore.jks
kafka.ssl.keystore.password: 86cf0e2ffba3f541a6c6761313e5cc7e
kafka.ssl.truststore.location: /data/tsg/olap/flink/topology/data/truststore.jks
kafka.ssl.truststore.password: 86cf0e2ffba3f541a6c6761313e5cc7e
kafka.ssl.key.password: 86cf0e2ffba3f541a6c6761313e5cc7e
#kafka.security.protocol: SASL_PLAINTEXT
#kafka.sasl.mechanism: PLAIN
#kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.group.id: etl_session_record_kafka_to_kafka-20231221
kafka.auto.offset.reset: latest
format: json
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.dos_event_local  # NOTE(review): source above consumes topic SESSION-RECORD (group.id etl_session_record_kafka_to_kafka) — confirm dos_event_local is really the intended sink table; looks like a copy-paste from the dos_event job
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env:
name: dos_event_kafka_to_clickhouse  # NOTE(review): job name says dos_event but this config's source consumes SESSION-RECORD — verify which side is wrong
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,157 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
properties:
topic: SESSION-RECORD
kafka.bootstrap.servers: "{{ kafka_source_servers }}"
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SSL
kafka.ssl.endpoint.identification.algorithm: ""
kafka.ssl.keystore.location: /data/tsg/olap/flink/topology/data/keystore.jks
kafka.ssl.keystore.password: 86cf0e2ffba3f541a6c6761313e5cc7e
kafka.ssl.truststore.location: /data/tsg/olap/flink/topology/data/truststore.jks
kafka.ssl.truststore.password: 86cf0e2ffba3f541a6c6761313e5cc7e
kafka.ssl.key.password: 86cf0e2ffba3f541a6c6761313e5cc7e
#kafka.security.protocol: SASL_PLAINTEXT
#kafka.sasl.mechanism: PLAIN
#kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.group.id: etl_session_record_kafka_to_kafka-20231221
kafka.auto.offset.reset: latest
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: ASN_LOOKUP
lookup_fields: [server_ip]
output_fields: [server_asn]
parameters:
option: IP_TO_ASN
kb_name: tsg_ip_asn
- function: ASN_LOOKUP
lookup_fields: [client_ip]
output_fields: [client_asn]
parameters:
option: IP_TO_ASN
kb_name: tsg_ip_asn
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.proxy_event_local  # NOTE(review): source above consumes topic SESSION-RECORD (group.id etl_session_record_kafka_to_kafka) — confirm proxy_event_local is really the intended sink table
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_proxy_event_kafka_to_clickhouse  # NOTE(review): job name says proxy_event but this config's source consumes SESSION-RECORD — likely copy-paste; verify topic/table/name agree before deploying
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,161 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
# watermark_timestamp: common_recv_time # [string] Watermark Field Name
# watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms
# watermark_lag: 60 # [number] Watermark Lag, default is 60
properties:
topic: SESSION-RECORD
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: etl_session_record_kafka_to_clickhouse-20230125
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: ASN_LOOKUP
lookup_fields: [server_ip]
output_fields: [server_asn]
parameters:
option: IP_TO_ASN
kb_name: tsg_ip_asn
- function: ASN_LOOKUP
lookup_fields: [client_ip]
output_fields: [client_asn]
parameters:
option: IP_TO_ASN
kb_name: tsg_ip_asn
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.session_record_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_session_record_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,155 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
properties:
topic: SESSION-RECORD
kafka.bootstrap.servers: "{{ kafka_source_servers }}"
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SSL
kafka.ssl.endpoint.identification.algorithm: ""
kafka.ssl.keystore.location: /data/tsg/olap/flink/topology/data/keystore.jks
kafka.ssl.keystore.password: 86cf0e2ffba3f541a6c6761313e5cc7e
kafka.ssl.truststore.location: /data/tsg/olap/flink/topology/data/truststore.jks
kafka.ssl.truststore.password: 86cf0e2ffba3f541a6c6761313e5cc7e
kafka.ssl.key.password: 86cf0e2ffba3f541a6c6761313e5cc7e
#kafka.security.protocol: SASL_PLAINTEXT
#kafka.sasl.mechanism: PLAIN
#kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.group.id: etl_session_record_kafka_to_kafka-20231221
kafka.auto.offset.reset: latest
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: ASN_LOOKUP
lookup_fields: [server_ip]
output_fields: [server_asn]
parameters:
option: IP_TO_ASN
kb_name: tsg_ip_asn
- function: ASN_LOOKUP
lookup_fields: [client_ip]
output_fields: [client_asn]
parameters:
option: IP_TO_ASN
kb_name: tsg_ip_asn
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.transaction_record_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_transaction_record_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,156 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
properties:
topic: SESSION-RECORD
kafka.bootstrap.servers: "{{ kafka_source_servers }}"
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SSL
kafka.ssl.endpoint.identification.algorithm: ""
kafka.ssl.keystore.location: /data/tsg/olap/flink/topology/data/keystore.jks
kafka.ssl.keystore.password: 86cf0e2ffba3f541a6c6761313e5cc7e
kafka.ssl.truststore.location: /data/tsg/olap/flink/topology/data/truststore.jks
kafka.ssl.truststore.password: 86cf0e2ffba3f541a6c6761313e5cc7e
kafka.ssl.key.password: 86cf0e2ffba3f541a6c6761313e5cc7e
#kafka.security.protocol: SASL_PLAINTEXT
#kafka.sasl.mechanism: PLAIN
#kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.group.id: etl_session_record_kafka_to_kafka-20231221
kafka.auto.offset.reset: latest
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
functions: # [array of object] Function List
- function: ASN_LOOKUP
lookup_fields: [server_ip]
output_fields: [server_asn]
parameters:
option: IP_TO_ASN
kb_name: tsg_ip_asn
- function: ASN_LOOKUP
lookup_fields: [client_ip]
output_fields: [client_asn]
parameters:
option: IP_TO_ASN
kb_name: tsg_ip_asn
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.voip_record_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_voip_record_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,7 @@
v1.2.2 (2024-04-08)
https://git.mesalab.cn/galaxy/platform/groot-stream/-/releases/v1.2.2
Core
配置文件敏感信息加密(AES Config Shade)增加选项:kafka.ssl.keystore.password、kafka.ssl.truststore.password、kafka.ssl.key.password
Connector
ClickHouse Sink Connector兼容AggregateFunction类型

View File

@@ -0,0 +1,6 @@
grootstream:
properties:
hos.path: http://192.168.44.12:9098/hos
hos.bucket.name.traffic_file: traffic_file_bucket
hos.bucket.name.troubleshooting_file: troubleshooting_file_bucket
scheduler.knowledge_base.update.interval.minutes: 5

View File

@@ -0,0 +1,49 @@
sources:
kafka_source:
type: kafka
properties:
topic: DOS-EVENT
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: dos_event_kafka_to_clickhouse-20231221
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.dos_event_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env:
name: dos_event_kafka_to_clickhouse
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,148 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
# watermark_timestamp: common_recv_time # [string] Watermark Field Name
# watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms
# watermark_lag: 60 # [number] Watermark Lag, default is 60
properties:
topic: PROXY-EVENT
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: etl_proxy_event_kafka_to_clickhouse-20231221
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.proxy_event_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_proxy_event_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,148 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
# watermark_timestamp: common_recv_time # [string] Watermark Field Name
# watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms
# watermark_lag: 60 # [number] Watermark Lag, default is 60
properties:
topic: SESSION-RECORD
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: etl_session_record_kafka_to_clickhouse-20230125
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.session_record_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_session_record_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,146 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
# watermark_timestamp: common_recv_time # [string] Watermark Field Name
# watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms
# watermark_lag: 60 # [number] Watermark Lag, default is 60
properties:
topic: TRANSACTION-RECORD
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: etl_transaction_record_kafka_to_clickhouse-20240308
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.transaction_record_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_transaction_record_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink

View File

@@ -0,0 +1,148 @@
sources:
kafka_source:
type: kafka
# fields: # [array of object] Field List, if not set, all fields(Map<String, Object>) will be output.
# watermark_timestamp: common_recv_time # [string] Watermark Field Name
# watermark_timestamp_unit: ms # [string] Watermark Unit, default is ms
# watermark_lag: 60 # [number] Watermark Lag, default is 60
properties:
topic: VOIP-CONVERSATION-RECORD
kafka.bootstrap.servers: 192.168.44.11:9094,192.168.44.13:9094,192.168.44.14:9094,192.168.44.15:9094,192.168.44.16:9094
kafka.session.timeout.ms: 60000
kafka.max.poll.records: 3000
kafka.max.partition.fetch.bytes: 31457280
kafka.security.protocol: SASL_PLAINTEXT
kafka.ssl.keystore.location:
kafka.ssl.keystore.password:
kafka.ssl.truststore.location:
kafka.ssl.truststore.password:
kafka.ssl.key.password:
kafka.sasl.mechanism: PLAIN
kafka.sasl.jaas.config: 454f65ea6eef1256e3067104f82730e737b68959560966b811e7ff364116b03124917eb2b0f3596f14733aa29ebad9352644ce1a5c85991c6f01ba8a5e8f177a7ff0b2d3889a424249967b3870b50993d9644f239f0de82cdb13bdb502959e16afadffa49ef1e1d2b9c9b5113e619817
kafka.buffer.memory:
kafka.group.id: etl_voip_record_kafka_to_clickhouse-20231221
kafka.auto.offset.reset: latest
kafka.max.request.size:
kafka.compression.type: none
format: json
processing_pipelines:
etl_processor: # [object] Processing Pipeline
type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
remove_fields:
output_fields:
properties:
key: value
functions: # [array of object] Function List
- function: SNOWFLAKE_ID
lookup_fields: ['']
output_fields: [log_id]
parameters:
data_center_id_num: 1
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [data_center]
filter:
parameters:
value_expression: $.tags[?(@.tag=='data_center')][0].value
- function: JSON_EXTRACT
lookup_fields: [device_tag]
output_fields: [device_group]
filter:
parameters:
value_expression: $.tags[?(@.tag=='device_group')][0].value
- function: CURRENT_UNIX_TIMESTAMP
output_fields: [processing_time]
parameters:
precision: seconds
- function: UNIX_TIMESTAMP_CONVERTER
lookup_fields: [__timestamp]
output_fields: [recv_time]
parameters:
precision: seconds
- function: EVAL
output_fields: [ingestion_time]
parameters:
value_expression: recv_time
- function: DOMAIN
lookup_fields: [http_host, ssl_sni, dtls_sni, quic_sni]
output_fields: [server_domain]
parameters:
option: FIRST_SIGNIFICANT_SUBDOMAIN
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_subject]
parameters:
value_field: mail_subject
charset_field: mail_subject_charset
- function: BASE64_DECODE_TO_STRING
output_fields: [mail_attachment_name]
parameters:
value_field: mail_attachment_name
charset_field: mail_attachment_name_charset
- function: PATH_COMBINE
lookup_fields: [rtp_pcap_path]
output_fields: [rtp_pcap_path]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, rtp_pcap_path]
- function: PATH_COMBINE
lookup_fields: [http_request_body]
output_fields: [http_request_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_request_body]
- function: PATH_COMBINE
lookup_fields: [http_response_body]
output_fields: [http_response_body]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, http_response_body]
- function: PATH_COMBINE
lookup_fields: [mail_eml_file]
output_fields: [mail_eml_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, mail_eml_file]
- function: PATH_COMBINE
lookup_fields: [packet_capture_file]
output_fields: [packet_capture_file]
parameters:
path: [props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file]
sinks:
clickhouse_sink:
type: clickhouse
properties:
host: 192.168.44.13:9001,192.168.44.14:9001,192.168.44.15:9001,192.168.44.16:9001
table: tsg_galaxy_v3.voip_record_local
batch.size: 100000
batch.interval: 30s
connection.user: e54c9568586180eede1506eecf3574e9
connection.password: 86cf0e2ffba3f541a6c6761313e5cc7e
application:
env: # [object] Environment Variables
name: etl_voip_record_kafka_to_clickhouse # [string] Job Name
shade.identifier: aes
pipeline:
object-reuse: true # [boolean] Object Reuse, default is false
topology:
- name: kafka_source
downstream: [etl_processor]
- name: etl_processor
downstream: [clickhouse_sink]
- name: clickhouse_sink