diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3a01d8e22f..9db5cf46a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ Please refer to the [NEWS](NEWS.md) for a list of changes which have an affect o
- New parameter `stop_retry_limit` to gracefully handle stopping bots which take longer to shutdown (PR#2598 by Lukas Heindl, fixes #2595).
- `intelmq.lib.datatypes`: Remove unneeded Dict39 alias (PR#2639 by Nakul Rajpal, fixes #2635)
- `intelmq.lib.mixins.http`: Only set HTTP header 'Authorization' if username or password are set and are not both empty string as they are by default in the Manager (fixes #2590, PR#2634 by Sebastian Wagner).
+- `intelmq.lib.message.Message.from_dict`: Do not modify the dict parameter by adding the `__type` field and raise an error when type is not determinable (PR#2545 by Sebastian Wagner).
### Development
@@ -49,6 +50,9 @@ Please refer to the [NEWS](NEWS.md) for a list of changes which have an affect o
#### Parsers
- `intelmq.bots.parsers.cymru.parser_cap_program`: Add mapping for TOR and ipv6-icmp protocol (PR#2621 by Mikk Margus Möll).
- Remove `intelmq.bots.collectors.blueliv` as it is obsolete with the removed collector (PR#2632 by Sebastian Wagner).
+- `intelmq.bots.parsers.json.parser`:
+ - Support data containing lists of JSON Events (PR#2545 by Tim de Boer).
+ - Add default `classification.type` with value `undetermined` if input data has no classification itself (PR#2545 by Sebastian Wagner).
#### Experts
- `intelmq.bots.experts.asn_lookup.expert`:
diff --git a/docs/user/bots.md b/docs/user/bots.md
index 7a974e8428..0ea71d44bc 100644
--- a/docs/user/bots.md
+++ b/docs/user/bots.md
@@ -1925,12 +1925,69 @@ also ). Defaults to `htm
---
-### JSON (TODO)
+### JSON
-TODO
+Parses JSON events that are already in IntelMQ format.
+If the input data does not contain the field `classification.type`, it is set to `undetermined`.
+
+Supports multiple different modes:
+
+#### Input data is one event
+Example:
+```json
+{ INTELMQ data... }
+```
+or:
+```json
+{
+ INTELMQ data...
+}
+```
+
+Configuration:
+* `splitlines`: False
+* `multiple_events`: False
+
+#### Input data is in JSON stream format
+Example:
+```json
+{ INTELMQ data... }
+{ INTELMQ data... }
+{ INTELMQ data... }
+```
+
+Configuration:
+* `splitlines`: True
+* `multiple_events`: False
+
+#### Input data is a list of events
+Example:
+```json
+[
+ { INTELMQ data... },
+ { INTELMQ data... },
+ ...
+]
+```
+
+Configuration:
+* `splitlines`: False
+* `multiple_events`: True
+
+#### Configuration
**Module:** `intelmq.bots.parsers.json.parser`
+**Parameters:**
+
+**`splitlines`**
+
+(optional, boolean) When the input file contains one JSON dictionary per line, set this to `true`. Defaults to `false`.
+
+**`multiple_events`**
+
+(optional, boolean) When the input file contains a JSON list of dictionaries, set this to `true`. Defaults to `false`.
+
---
### Key=Value Parser
diff --git a/intelmq/bots/parsers/json/parser.py b/intelmq/bots/parsers/json/parser.py
index f66a5a7410..37713e9f99 100644
--- a/intelmq/bots/parsers/json/parser.py
+++ b/intelmq/bots/parsers/json/parser.py
@@ -1,38 +1,48 @@
-# SPDX-FileCopyrightText: 2016 by Bundesamt für Sicherheit in der Informationstechnik
+# SPDX-FileCopyrightText: 2016 by Bundesamt für Sicherheit in der Informationstechnik, 2016-2021 nic.at GmbH, 2024 Tim de Boer, 2025 Institute for Common Good Technology
#
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
JSON Parser Bot
Retrieves a base64 encoded JSON-String from raw and converts it into an
event.
-
-Copyright (C) 2016 by Bundesamt für Sicherheit in der Informationstechnik
-Software engineering by Intevation GmbH
"""
from intelmq.lib.bot import ParserBot
from intelmq.lib.message import MessageFactory
from intelmq.lib.utils import base64_decode
+from json import loads as json_loads, dumps as json_dumps
class JSONParserBot(ParserBot):
"""Parse IntelMQ-JSON data"""
- splitlines = False
+ splitlines: bool = False
+ multiple_events: bool = False
+
+ def init(self):
+ if self.multiple_events and self.splitlines:
+ raise ValueError("Modes 'splitlines' and 'multiple_events' are not possible at the same time. Please use either one.")
def process(self):
report = self.receive_message()
- if self.splitlines:
- lines = base64_decode(report['raw']).splitlines()
+ if self.multiple_events:
+ lines = json_loads(base64_decode(report["raw"]))
+ elif self.splitlines:
+ lines = base64_decode(report["raw"]).splitlines()
else:
- lines = [base64_decode(report['raw'])]
+ lines = [base64_decode(report["raw"])]
for line in lines:
- new_event = MessageFactory.unserialize(line,
- harmonization=self.harmonization,
- default_type='Event')
event = self.new_event(report)
- event.update(new_event)
- if 'raw' not in event:
- event['raw'] = line
+ if self.multiple_events:
+ event.update(MessageFactory.from_dict(line,
+ harmonization=self.harmonization,
+ default_type="Event"))
+ event["raw"] = json_dumps(line, sort_keys=True)
+ else:
+ event.update(MessageFactory.unserialize(line,
+ harmonization=self.harmonization,
+ default_type="Event"))
+ event.add('raw', line, overwrite=False)
+ event.add("classification.type", "undetermined", overwrite=False) # set to undetermined if input has no classification
self.send_message(event)
self.acknowledge_message()
diff --git a/intelmq/lib/message.py b/intelmq/lib/message.py
index 627ebbb671..eeff508722 100644
--- a/intelmq/lib/message.py
+++ b/intelmq/lib/message.py
@@ -49,10 +49,10 @@ def from_dict(message: dict, harmonization=None,
MessageFactory.unserialize
MessageFactory.serialize
"""
- if default_type and "__type" not in message:
- message["__type"] = default_type
+ if not default_type and '__type' not in message:
+ raise ValueError("Message type could not be determined. Input message misses '__type' and parameter 'default_type' not given.")
try:
- class_reference = getattr(intelmq.lib.message, message["__type"])
+ class_reference = getattr(intelmq.lib.message, message.get("__type", default_type))
except AttributeError:
raise exceptions.InvalidArgument('__type',
got=message["__type"],
@@ -60,6 +60,8 @@ def from_dict(message: dict, harmonization=None,
docs=HARMONIZATION_CONF_FILE)
# don't modify the parameter
message_copy = message.copy()
+ if default_type and "__type" not in message_copy:
+ message_copy["__type"] = default_type
del message_copy["__type"]
return class_reference(message_copy, auto=True, harmonization=harmonization)
diff --git a/intelmq/tests/bots/parsers/json/ncscnl.json b/intelmq/tests/bots/parsers/json/ncscnl.json
new file mode 100644
index 0000000000..b822522f93
--- /dev/null
+++ b/intelmq/tests/bots/parsers/json/ncscnl.json
@@ -0,0 +1,68 @@
+[
+ {
+ "extra.dataset_collections": "0",
+ "extra.dataset_files": "1",
+ "extra.dataset_infected": "false",
+ "extra.dataset_ransom": "null",
+ "extra.dataset_rows": "0",
+ "extra.dataset_size": "301",
+ "protocol.application": "https",
+ "protocol.transport": "tcp",
+ "source.asn": 12345689,
+ "source.fqdn": "fqdn-example-1.tld",
+ "source.geolocation.cc": "NL",
+ "source.geolocation.city": "Enschede",
+ "source.geolocation.latitude": 52.0000000000000,
+ "source.geolocation.longitude": 6.0000000000000,
+ "source.geolocation.region": "Overijssel",
+ "source.ip": "127.1.2.1",
+ "source.network": "127.1.0.0/16",
+ "source.port": 80,
+ "time.source": "2024-12-16T02:08:06+00:00"
+ },
+ {
+ "extra.dataset_collections": "0",
+ "extra.dataset_files": "1",
+ "extra.dataset_infected": "false",
+ "extra.dataset_ransom": "null",
+ "extra.dataset_rows": "0",
+ "extra.dataset_size": "615",
+ "extra.os_name": "Ubuntu",
+ "extra.software": "Apache",
+ "extra.tag": "rescan",
+ "extra.version": "2.4.58",
+ "protocol.application": "https",
+ "protocol.transport": "tcp",
+ "source.asn": 12345689,
+ "source.fqdn": "fqdn-example-2.tld",
+ "source.geolocation.cc": "NL",
+ "source.geolocation.city": "Eindhoven",
+ "source.geolocation.latitude": 51.0000000000000,
+ "source.geolocation.longitude": 5.0000000000000,
+ "source.geolocation.region": "North Brabant",
+ "source.ip": "127.1.2.2",
+ "source.network": "127.1.0.0/16",
+ "source.port": 443,
+ "time.source": "2024-12-16T02:08:12+00:00"
+ },
+ {
+ "extra.dataset_collections": "0",
+ "extra.dataset_files": "1",
+ "extra.dataset_infected": "false",
+ "extra.dataset_ransom": "null",
+ "extra.dataset_rows": "0",
+ "extra.dataset_size": "421",
+ "protocol.application": "http",
+ "protocol.transport": "tcp",
+ "source.asn": 12345689,
+ "source.geolocation.cc": "NL",
+ "source.geolocation.city": "Enschede",
+ "source.geolocation.latitude": 52.0000000000000,
+ "source.geolocation.longitude": 6.0000000000000,
+ "source.geolocation.region": "Overijssel",
+ "source.ip": "127.1.2.3",
+ "source.network": "127.1.0.0/16",
+ "source.port": 9000,
+ "time.source": "2024-12-15T21:09:49+00:00"
+ }
+]
diff --git a/intelmq/tests/bots/parsers/json/ncscnl.json.license b/intelmq/tests/bots/parsers/json/ncscnl.json.license
new file mode 100644
index 0000000000..14f82d872c
--- /dev/null
+++ b/intelmq/tests/bots/parsers/json/ncscnl.json.license
@@ -0,0 +1,2 @@
+SPDX-FileCopyrightText: 2024 Tim de Boer
+SPDX-License-Identifier: AGPL-3.0-or-later
diff --git a/intelmq/tests/bots/parsers/json/test_parser.py b/intelmq/tests/bots/parsers/json/test_parser.py
index c18d18dd00..2e381e47c0 100644
--- a/intelmq/tests/bots/parsers/json/test_parser.py
+++ b/intelmq/tests/bots/parsers/json/test_parser.py
@@ -6,6 +6,7 @@
import base64
import os
import unittest
+from json import loads as json_loads, dumps as json_dumps
import intelmq.lib.test as test
from intelmq.bots.parsers.json.parser import JSONParserBot
@@ -51,6 +52,21 @@
NO_DEFAULT_EVENT = MULTILINE_EVENTS[1].copy()
NO_DEFAULT_EVENT['raw'] = base64.b64encode(b'{"source.ip": "127.0.0.2", "classification.type": "c2-server"}\n').decode()
+with open(os.path.join(os.path.dirname(__file__), 'ncscnl.json'), 'rb') as fh:
+ NCSCNL_FILE = fh.read()
+NCSCNL_RAW = base64.b64encode(NCSCNL_FILE).decode()
+NCSC_EVENTS = json_loads(NCSCNL_FILE)
+for i, event in enumerate(NCSC_EVENTS):
+ NCSC_EVENTS[i]['raw'] = base64.b64encode(json_dumps(event, sort_keys=True).encode()).decode()
+ NCSC_EVENTS[i]['classification.type'] = 'undetermined'
+ NCSC_EVENTS[i]['feed.name'] = 'NCSC.NL'
+ NCSC_EVENTS[i]['__type'] = 'Event'
+
+NCSCNL_REPORT = {"feed.name": "NCSC.NL",
+ "raw": NCSCNL_RAW,
+ "__type": "Report",
+ }
+
class TestJSONParserBot(test.BotTestCase, unittest.TestCase):
"""
@@ -70,8 +86,7 @@ def test_oneline_report(self):
def test_multiline_report(self):
""" Test if correct Event has been produced. """
self.input_message = MULTILINE_REPORT
- self.sysconfig = {"splitlines": True}
- self.run_bot()
+ self.run_bot(parameters={"splitlines": True})
self.assertMessageEqual(0, MULTILINE_EVENTS[0])
self.assertMessageEqual(1, MULTILINE_EVENTS[1])
@@ -81,6 +96,14 @@ def test_default_event(self):
self.run_bot()
self.assertMessageEqual(0, NO_DEFAULT_EVENT)
+ def test_ncscnl(self):
+ """ A file containing a list of events (not per line) """
+ self.input_message = NCSCNL_REPORT
+ self.run_bot(parameters={'multiple_events': True})
+ self.assertMessageEqual(0, NCSC_EVENTS[0])
+ self.assertMessageEqual(1, NCSC_EVENTS[1])
+ self.assertMessageEqual(2, NCSC_EVENTS[2])
+
if __name__ == '__main__': # pragma: no cover
unittest.main()
diff --git a/intelmq/tests/lib/test_bot_library_mode.py b/intelmq/tests/lib/test_bot_library_mode.py
index f1a50db7d5..93a977a2f8 100644
--- a/intelmq/tests/lib/test_bot_library_mode.py
+++ b/intelmq/tests/lib/test_bot_library_mode.py
@@ -32,6 +32,7 @@
"destination.ip": "192.0.43.8", # iana.org.
"time.observation": "2015-01-01T00:00:00+00:00",
}
+EXAMPLE_IP_OUTPUT = MessageFactory.from_dict(EXAMPLE_IP_INPUT, default_type='Event') # adds __type = Event
class BrokenInitExpertBot(ExpertBot):
@@ -130,7 +131,7 @@ def test_bot_multi_message():
def test_bot_raises_and_second_message():
"""
- The first message raises an error and the second message
+ The first message raises an error and the second message is processed correctly
This test is based on an issue where the exception-raising message was not cleared from the internal message store of the Bot/Pipeline instance and thus re-used on the second run
"""
raises_on_first_run = RaisesOnFirstRunExpertBot('raises', settings=BotLibSettings)
@@ -138,7 +139,7 @@ def test_bot_raises_and_second_message():
raises_on_first_run.process_message(EXAMPLE_DATA_URL)
queues = raises_on_first_run.process_message(EXAMPLE_IP_INPUT)
assert len(queues['output']) == 1
- assertMessageEqual(queues['output'][0], EXAMPLE_IP_INPUT)
+ assertMessageEqual(queues['output'][0], EXAMPLE_IP_OUTPUT)
if __name__ == '__main__': # pragma: no cover