diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a01d8e22f..9db5cf46a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ Please refer to the [NEWS](NEWS.md) for a list of changes which have an affect o - New parameter `stop_retry_limit` to gracefully handle stopping bots which take longer to shutdown (PR#2598 by Lukas Heindl, fixes #2595). - `intelmq.lib.datatypes`: Remove unneeded Dict39 alias (PR#2639 by Nakul Rajpal, fixes #2635) - `intelmq.lib.mixins.http`: Only set HTTP header 'Authorization' if username or password are set and are not both empty string as they are by default in the Manager (fixes #2590, PR#2634 by Sebastian Wagner). +- `intelmq.lib.message.MessageFactory.from_dict`: Do not modify the dict parameter by adding the `__type` field and raise an error when the type cannot be determined (PR#2545 by Sebastian Wagner). ### Development @@ -49,6 +50,9 @@ Please refer to the [NEWS](NEWS.md) for a list of changes which have an affect o #### Parsers - `intelmq.bots.parsers.cymru.parser_cap_program`: Add mapping for TOR and ipv6-icmp protocol (PR#2621 by Mikk Margus Möll). - Remove `intelmq.bots.collectors.blueliv` as it is obsolete with the removed collector (PR#2632 by Sebastian Wagner). +- `intelmq.bots.parsers.json.parser`: + - Support data containing lists of JSON Events (PR#2545 by Tim de Boer). + - Add default `classification.type` with value `undetermined` if input data has no classification itself (PR#2545 by Sebastian Wagner). #### Experts - `intelmq.bots.experts.asn_lookup.expert`: diff --git a/docs/user/bots.md b/docs/user/bots.md index 7a974e8428..0ea71d44bc 100644 --- a/docs/user/bots.md +++ b/docs/user/bots.md @@ -1925,12 +1925,69 @@ also ). Defaults to `htm --- -### JSON (TODO)
+### JSON
-TODO +Parses JSON events that are already in IntelMQ format. +If the input data does not contain the field `classification.type`, it is set to `undetermined`. + +Supports multiple different modes: + +#### Input data is one event +Example: +```json +{ INTELMQ data... } +``` +or: +```json +{ + INTELMQ data... +} +``` + +Configuration: +* `splitlines`: False +* `multiple_events`: False + +#### Input data is in JSON stream format +Example: +```json +{ INTELMQ data... } +{ INTELMQ data... } +{ INTELMQ data... } +``` + +Configuration: +* `splitlines`: True +* `multiple_events`: False + +#### Input data is a list of events +Example: +```json +[ + { INTELMQ data... }, + { INTELMQ data... }, + ... +] +``` + +Configuration: +* `splitlines`: False +* `multiple_events`: True + +#### Configuration **Module:** `intelmq.bots.parsers.json.parser` +**Parameters:** + +**`splitlines`** + +(optional, boolean) When the input file contains one JSON dictionary per line, set this to `true`. Defaults to `false`. + +**`multiple_events`** + +(optional, boolean) When the input file contains a JSON list of dictionaries, set this to `true`. Defaults to `false`. Cannot be combined with `splitlines`; the bot refuses to start if both are `true`. + --- ### Key=Value Parser
diff --git a/intelmq/bots/parsers/json/parser.py b/intelmq/bots/parsers/json/parser.py index f66a5a7410..37713e9f99 100644 --- a/intelmq/bots/parsers/json/parser.py +++ b/intelmq/bots/parsers/json/parser.py @@ -1,38 +1,48 @@ -# SPDX-FileCopyrightText: 2016 by Bundesamt für Sicherheit in der Informationstechnik +# SPDX-FileCopyrightText: 2016 by Bundesamt für Sicherheit in der Informationstechnik, 2016-2021 nic.at GmbH, 2024 Tim de Boer, 2025 Institute for Common Good Technology # # SPDX-License-Identifier: AGPL-3.0-or-later """ JSON Parser Bot Retrieves a base64 encoded JSON-String from raw and converts it into an event. - -Copyright (C) 2016 by Bundesamt für Sicherheit in der Informationstechnik -Software engineering by Intevation GmbH """ from intelmq.lib.bot import ParserBot from intelmq.lib.message import MessageFactory from intelmq.lib.utils import base64_decode +from json import loads as json_loads, dumps as json_dumps class JSONParserBot(ParserBot): """Parse IntelMQ-JSON data""" - splitlines = False + splitlines: bool = False + multiple_events: bool = False + + def init(self): + if self.multiple_events and self.splitlines: + raise ValueError("Modes 'splitlines' and 'multiple_events' are not possible at the same time. 
Please use either one.") def process(self): report = self.receive_message() - if self.splitlines: - lines = base64_decode(report['raw']).splitlines() + if self.multiple_events: + lines = json_loads(base64_decode(report["raw"])) + elif self.splitlines: + lines = base64_decode(report["raw"]).splitlines() else: - lines = [base64_decode(report['raw'])] + lines = [base64_decode(report["raw"])] for line in lines: - new_event = MessageFactory.unserialize(line, - harmonization=self.harmonization, - default_type='Event') event = self.new_event(report) - event.update(new_event) - if 'raw' not in event: - event['raw'] = line + if self.multiple_events: + event.update(MessageFactory.from_dict(line, + harmonization=self.harmonization, + default_type="Event")) + event["raw"] = json_dumps(line, sort_keys=True) + else: + event.update(MessageFactory.unserialize(line, + harmonization=self.harmonization, + default_type="Event")) + event.add('raw', line, overwrite=False) + event.add("classification.type", "undetermined", overwrite=False) # set to undetermined if input has no classification self.send_message(event) self.acknowledge_message() diff --git a/intelmq/lib/message.py b/intelmq/lib/message.py index 627ebbb671..eeff508722 100644 --- a/intelmq/lib/message.py +++ b/intelmq/lib/message.py @@ -49,10 +49,10 @@ def from_dict(message: dict, harmonization=None, MessageFactory.unserialize MessageFactory.serialize """ - if default_type and "__type" not in message: - message["__type"] = default_type + if not default_type and '__type' not in message: + raise ValueError("Message type could not be determined. 
Input message misses '__type' and parameter 'default_type' not given.") try: - class_reference = getattr(intelmq.lib.message, message["__type"]) + class_reference = getattr(intelmq.lib.message, message.get("__type", default_type)) except AttributeError: raise exceptions.InvalidArgument('__type', got=message["__type"], @@ -60,6 +60,8 @@ def from_dict(message: dict, harmonization=None, docs=HARMONIZATION_CONF_FILE) # don't modify the parameter message_copy = message.copy() + if default_type and "__type" not in message_copy: + message_copy["__type"] = default_type del message_copy["__type"] return class_reference(message_copy, auto=True, harmonization=harmonization) diff --git a/intelmq/tests/bots/parsers/json/ncscnl.json b/intelmq/tests/bots/parsers/json/ncscnl.json new file mode 100644 index 0000000000..b822522f93 --- /dev/null +++ b/intelmq/tests/bots/parsers/json/ncscnl.json @@ -0,0 +1,68 @@ +[ + { + "extra.dataset_collections": "0", + "extra.dataset_files": "1", + "extra.dataset_infected": "false", + "extra.dataset_ransom": "null", + "extra.dataset_rows": "0", + "extra.dataset_size": "301", + "protocol.application": "https", + "protocol.transport": "tcp", + "source.asn": 12345689, + "source.fqdn": "fqdn-example-1.tld", + "source.geolocation.cc": "NL", + "source.geolocation.city": "Enschede", + "source.geolocation.latitude": 52.0000000000000, + "source.geolocation.longitude": 6.0000000000000, + "source.geolocation.region": "Overijssel", + "source.ip": "127.1.2.1", + "source.network": "127.1.0.0/16", + "source.port": 80, + "time.source": "2024-12-16T02:08:06+00:00" + }, + { + "extra.dataset_collections": "0", + "extra.dataset_files": "1", + "extra.dataset_infected": "false", + "extra.dataset_ransom": "null", + "extra.dataset_rows": "0", + "extra.dataset_size": "615", + "extra.os_name": "Ubuntu", + "extra.software": "Apache", + "extra.tag": "rescan", + "extra.version": "2.4.58", + "protocol.application": "https", + "protocol.transport": "tcp", + "source.asn": 
12345689, + "source.fqdn": "fqdn-example-2.tld", + "source.geolocation.cc": "NL", + "source.geolocation.city": "Eindhoven", + "source.geolocation.latitude": 51.0000000000000, + "source.geolocation.longitude": 5.0000000000000, + "source.geolocation.region": "North Brabant", + "source.ip": "127.1.2.2", + "source.network": "127.1.0.0/16", + "source.port": 443, + "time.source": "2024-12-16T02:08:12+00:00" + }, + { + "extra.dataset_collections": "0", + "extra.dataset_files": "1", + "extra.dataset_infected": "false", + "extra.dataset_ransom": "null", + "extra.dataset_rows": "0", + "extra.dataset_size": "421", + "protocol.application": "http", + "protocol.transport": "tcp", + "source.asn": 12345689, + "source.geolocation.cc": "NL", + "source.geolocation.city": "Enschede", + "source.geolocation.latitude": 52.0000000000000, + "source.geolocation.longitude": 6.0000000000000, + "source.geolocation.region": "Overijssel", + "source.ip": "127.1.2.3", + "source.network": "127.1.0.0/16", + "source.port": 9000, + "time.source": "2024-12-15T21:09:49+00:00" + } +] diff --git a/intelmq/tests/bots/parsers/json/ncscnl.json.license b/intelmq/tests/bots/parsers/json/ncscnl.json.license new file mode 100644 index 0000000000..14f82d872c --- /dev/null +++ b/intelmq/tests/bots/parsers/json/ncscnl.json.license @@ -0,0 +1,2 @@ +SPDX-FileCopyrightText: 2024 Tim de Boer +SPDX-License-Identifier: AGPL-3.0-or-later diff --git a/intelmq/tests/bots/parsers/json/test_parser.py b/intelmq/tests/bots/parsers/json/test_parser.py index c18d18dd00..2e381e47c0 100644 --- a/intelmq/tests/bots/parsers/json/test_parser.py +++ b/intelmq/tests/bots/parsers/json/test_parser.py @@ -6,6 +6,7 @@ import base64 import os import unittest +from json import loads as json_loads, dumps as json_dumps import intelmq.lib.test as test from intelmq.bots.parsers.json.parser import JSONParserBot @@ -51,6 +52,21 @@ NO_DEFAULT_EVENT = MULTILINE_EVENTS[1].copy() NO_DEFAULT_EVENT['raw'] = base64.b64encode(b'{"source.ip": "127.0.0.2", 
"classification.type": "c2-server"}\n').decode() +with open(os.path.join(os.path.dirname(__file__), 'ncscnl.json'), 'rb') as fh: + NCSCNL_FILE = fh.read() +NCSCNL_RAW = base64.b64encode(NCSCNL_FILE).decode() +NCSC_EVENTS = json_loads(NCSCNL_FILE) +for i, event in enumerate(NCSC_EVENTS): + NCSC_EVENTS[i]['raw'] = base64.b64encode(json_dumps(event, sort_keys=True).encode()).decode() + NCSC_EVENTS[i]['classification.type'] = 'undetermined' + NCSC_EVENTS[i]['feed.name'] = 'NCSC.NL' + NCSC_EVENTS[i]['__type'] = 'Event' + +NCSCNL_REPORT = {"feed.name": "NCSC.NL", + "raw": NCSCNL_RAW, + "__type": "Report", + } + class TestJSONParserBot(test.BotTestCase, unittest.TestCase): """ @@ -70,8 +86,7 @@ def test_oneline_report(self): def test_multiline_report(self): """ Test if correct Event has been produced. """ self.input_message = MULTILINE_REPORT - self.sysconfig = {"splitlines": True} - self.run_bot() + self.run_bot(parameters={"splitlines": True}) self.assertMessageEqual(0, MULTILINE_EVENTS[0]) self.assertMessageEqual(1, MULTILINE_EVENTS[1]) @@ -81,6 +96,14 @@ def test_default_event(self): self.run_bot() self.assertMessageEqual(0, NO_DEFAULT_EVENT) + def test_ncscnl(self): + """ A file containing a list of events (not per line) """ + self.input_message = NCSCNL_REPORT + self.run_bot(parameters={'multiple_events': True}) + self.assertMessageEqual(0, NCSC_EVENTS[0]) + self.assertMessageEqual(1, NCSC_EVENTS[1]) + self.assertMessageEqual(2, NCSC_EVENTS[2]) + if __name__ == '__main__': # pragma: no cover unittest.main() diff --git a/intelmq/tests/lib/test_bot_library_mode.py b/intelmq/tests/lib/test_bot_library_mode.py index f1a50db7d5..93a977a2f8 100644 --- a/intelmq/tests/lib/test_bot_library_mode.py +++ b/intelmq/tests/lib/test_bot_library_mode.py @@ -32,6 +32,7 @@ "destination.ip": "192.0.43.8", # iana.org. 
"time.observation": "2015-01-01T00:00:00+00:00", } +EXAMPLE_IP_OUTPUT = MessageFactory.from_dict(EXAMPLE_IP_INPUT, default_type='Event') # adds __type = Event class BrokenInitExpertBot(ExpertBot): @@ -130,7 +131,7 @@ def test_bot_multi_message(): def test_bot_raises_and_second_message(): """ - The first message raises an error and the second message + The first message raises an error and the second message is processed correctly This test is based on an issue where the exception-raising message was not cleared from the internal message store of the Bot/Pipeline instance and thus re-used on the second run """ raises_on_first_run = RaisesOnFirstRunExpertBot('raises', settings=BotLibSettings) @@ -138,7 +139,7 @@ def test_bot_raises_and_second_message(): raises_on_first_run.process_message(EXAMPLE_DATA_URL) queues = raises_on_first_run.process_message(EXAMPLE_IP_INPUT) assert len(queues['output']) == 1 - assertMessageEqual(queues['output'][0], EXAMPLE_IP_INPUT) + assertMessageEqual(queues['output'][0], EXAMPLE_IP_OUTPUT) if __name__ == '__main__': # pragma: no cover