Skip to content

Commit 466a4db

Browse files
authored
Merge pull request #546 from intelowlproject/develop
1.6.5
2 parents 663459c + 2825dfe commit 466a4db

23 files changed

+396
-231
lines changed

.env_template

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@ COMPOSE_FILE=docker/default.yml:docker/local.override.yml
1313
#COMPOSE_FILE=docker/default.yml:docker/local.override.yml:docker/elasticsearch.yml
1414

1515
# If you want to run a specific version, populate this
16-
# REACT_APP_INTELOWL_VERSION="1.6.4"
16+
# REACT_APP_INTELOWL_VERSION="1.6.5"

api/views/feeds.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def feeds(request, feed_type, attack_type, prioritize, format_):
2525
attack_type (str): Type of attack (e.g., all, specific attack types).
2626
prioritize (str): Prioritization mechanism to use (e.g., recent, persistent).
2727
format_ (str): Desired format of the response (e.g., json, csv, txt).
28-
exclude_mass_scanners (bool): query parameter flag to exclude IOCs that are known mass scanners.
28+
include_mass_scanners (bool): query parameter flag to include IOCs that are known mass scanners.
2929
3030
Returns:
3131
Response: The HTTP response with formatted IOC data.
@@ -34,8 +34,8 @@ def feeds(request, feed_type, attack_type, prioritize, format_):
3434

3535
feed_params = FeedRequestParams({"feed_type": feed_type, "attack_type": attack_type, "format_": format_})
3636
feed_params.set_prioritization(prioritize)
37-
if request.query_params and "exclude_mass_scanners" in request.query_params:
38-
feed_params.exclude_mass_scanners()
37+
if request.query_params and "include_mass_scanners" in request.query_params:
38+
feed_params.include_mass_scanners()
3939

4040
valid_feed_types = get_valid_feed_types()
4141
iocs_queryset = get_queryset(request, feed_params, valid_feed_types)
@@ -59,8 +59,8 @@ def feeds_pagination(request):
5959
feed_params = FeedRequestParams(request.query_params)
6060
feed_params.format = "json"
6161
feed_params.set_prioritization(request.query_params.get("prioritize"))
62-
if request.query_params and "exclude_mass_scanners" in request.query_params:
63-
feed_params.exclude_mass_scanners()
62+
if request.query_params and "include_mass_scanners" in request.query_params:
63+
feed_params.include_mass_scanners()
6464

6565
valid_feed_types = get_valid_feed_types()
6666
iocs_queryset = get_queryset(request, feed_params, valid_feed_types)
@@ -83,8 +83,8 @@ def feeds_advanced(request):
8383
attack_type (str): Type of attack to filter. (supported: `scanner`, `payload_request`, `all`; default: `all`)
8484
max_age (int): Maximum number of days since last occurrence. E.g. an IOC that was last seen 4 days ago is excluded by default. (default: 3)
8585
min_days_seen (int): Minimum number of days on which an IOC must have been seen. (default: 1)
86-
include_reputation (str): `;`-separated list of reputation values to include, e.g. `known attacker` or `known attacker;` to include IOCs without reputation. (default: include all)
87-
exclude_reputation (str): `;`-separated list of reputation values to exclude, e.g. `mass scanner` or `mass scanner;bot, crawler`. (default: exclude none)
86+
include_reputation (str): `;`-separated list of reputation values to include, e.g. `known attacker` or `known attacker;` to include IOCs without reputation. (default: include all) This takes precedence over exclusion.
87+
exclude_reputation (str): `;`-separated list of reputation values to exclude, e.g. `mass scanner` or `mass scanner;bot, crawler`. (default: exclude mass scanners)
8888
feed_size (int): Number of IOC items to return. (default: 5000)
8989
ordering (str): Field to order results by, with optional `-` prefix for descending. (default: `-last_seen`)
9090
verbose (bool): `true` to include IOC properties that contain a lot of data, e.g. the list of days it was seen. (default: `false`)

api/views/utils.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from django.contrib.postgres.aggregates import ArrayAgg
1212
from django.db.models import F, Q
1313
from django.http import HttpResponse, HttpResponseBadRequest, StreamingHttpResponse
14-
from greedybear.consts import FEEDS_LICENSE, PAYLOAD_REQUEST, SCANNER
14+
from greedybear.consts import FEEDS_LICENSE
1515
from greedybear.models import IOC, GeneralHoneypot, Statistics
1616
from greedybear.settings import EXTRACTION_INTERVAL
1717
from rest_framework import status
@@ -75,10 +75,11 @@ def __init__(self, query_params: dict):
7575
self.paginate = query_params.get("paginate", "false").lower()
7676
self.format = query_params.get("format_", "json").lower()
7777
self.feed_type_sorting = None
78-
79-
def exclude_mass_scanners(self):
8078
self.exclude_reputation.append("mass scanner")
8179

80+
def include_mass_scanners(self):
81+
self.exclude_reputation.remove("mass scanner")
82+
8283
def set_prioritization(self, prioritize: str):
8384
match prioritize:
8485
case "recent":
@@ -154,11 +155,14 @@ def get_queryset(request, feed_params, valid_feed_types):
154155
query_dict["number_of_days_seen__gte"] = int(feed_params.min_days_seen)
155156
if feed_params.include_reputation:
156157
query_dict["ip_reputation__in"] = feed_params.include_reputation
158+
for reputation_type in feed_params.include_reputation:
159+
if reputation_type in feed_params.exclude_reputation:
160+
feed_params.exclude_reputation.remove(reputation_type)
157161

158162
iocs = (
159163
IOC.objects.filter(**query_dict)
160164
.filter(Q(cowrie=True) | Q(log4j=True) | Q(general_honeypot__active=True))
161-
.exclude(ip_reputation__in=feed_params.exclude_reputation)
165+
.exclude(Q() if "nothing" in feed_params.exclude_reputation else Q(ip_reputation__in=feed_params.exclude_reputation))
162166
.annotate(value=F("name"))
163167
.annotate(honeypots=ArrayAgg("general_honeypot__name"))
164168
.order_by(feed_params.ordering)[: int(feed_params.feed_size)]

docker/.version

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
REACT_APP_GREEDYBEAR_VERSION="1.6.4"
1+
REACT_APP_GREEDYBEAR_VERSION="1.6.5"

docker/Dockerfile_nginx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM library/nginx:1.27.5-alpine
1+
FROM library/nginx:1.29.0-alpine
22
RUN mkdir -p /var/cache/nginx /var/cache/nginx/feeds
33
RUN apk update && apk upgrade && apk add bash
44
ENV NGINX_LOG_DIR=/var/log/nginx

greedybear/admin.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,30 @@
55
from django.contrib import admin, messages
66
from django.db.models import Q
77
from django.utils.translation import ngettext
8-
from greedybear.models import IOC, CommandSequence, CowrieSession, GeneralHoneypot
8+
from greedybear.models import IOC, CommandSequence, CowrieSession, GeneralHoneypot, MassScanners, Sensors, Statistics
99

1010
logger = logging.getLogger(__name__)
1111

12-
# there is no need to view the sensors in the admin page.
13-
# @admin.register(Sensors)
14-
# class SensorsModelAdmin(admin.ModelAdmin):
15-
# list_display = [field.name for field in Sensors._meta.get_fields()]
12+
13+
@admin.register(Sensors)
14+
class SensorsModelAdmin(admin.ModelAdmin):
15+
list_display = [field.name for field in Sensors._meta.get_fields()]
16+
17+
18+
@admin.register(Statistics)
19+
class StatisticsModelAdmin(admin.ModelAdmin):
20+
list_display = ["source", "view", "request_date"]
21+
list_filter = ["source"]
22+
search_fields = ["source"]
23+
search_help_text = ["search for the IP address source"]
24+
25+
26+
@admin.register(MassScanners)
27+
class MassScannersModelAdmin(admin.ModelAdmin):
28+
list_display = ["ip_address", "added", "reason"]
29+
list_filter = ["reason"]
30+
search_fields = ["ip_address"]
31+
search_help_text = ["search for the IP address source"]
1632

1733

1834
class SessionInline(admin.TabularInline):
@@ -28,13 +44,17 @@ class SessionInline(admin.TabularInline):
2844
class CowrieSessionModelAdmin(admin.ModelAdmin):
2945
list_display = ["session_id", "start_time", "duration", "login_attempt", "credentials", "command_execution", "interaction_count", "source"]
3046
search_fields = ["source__name"]
47+
search_help_text = ["search for the IP address source"]
3148
raw_id_fields = ["source", "commands"]
49+
list_filter = ["login_attempt", "command_execution"]
3250

3351

3452
@admin.register(CommandSequence)
3553
class CommandSequenceModelAdmin(admin.ModelAdmin):
36-
list_display = ["first_seen", "last_seen", "cluster", "commands"]
54+
list_display = ["first_seen", "last_seen", "cluster", "commands", "commands_hash"]
3755
inlines = [SessionInline]
56+
search_fields = ["source__name", "commands_hash"]
57+
list_filter = ["cluster", "commands_hash"]
3858

3959

4060
@admin.register(IOC)
@@ -59,7 +79,9 @@ class IOCModelAdmin(admin.ModelAdmin):
5979
"destination_ports",
6080
"login_attempts",
6181
]
82+
list_filter = ["type", "log4j", "cowrie", "scanner", "payload_request", "ip_reputation", "asn"]
6283
search_fields = ["name", "related_ioc__name"]
84+
search_help_text = ["search for the IP address source"]
6385
raw_id_fields = ["related_ioc"]
6486
filter_horizontal = ["general_honeypot"]
6587
inlines = [SessionInline]

greedybear/celery.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,9 @@ def setup_loggers(*args, **kwargs):
103103
# This way models learn from complete rather than partial day patterns, which is crucial for their performance.
104104
"train_and_update": {
105105
"task": "greedybear.tasks.chain_train_and_update",
106-
"schedule": crontab(hour=0, minute=hp_extraction_interval // 2),
106+
# Sometimes this could start before the midnight extraction has finished.
107+
# Let's increase the offset a little.
108+
"schedule": crontab(hour=0, minute=int(hp_extraction_interval / 3 * 2)),
107109
"options": {"queue": "default"},
108110
},
109111
# COMMANDS
@@ -119,4 +121,9 @@ def setup_loggers(*args, **kwargs):
119121
"schedule": crontab(hour=2, minute=3),
120122
"options": {"queue": "default"},
121123
},
124+
"get_mass_scanners": {
125+
"task": "greedybear.tasks.get_mass_scanners",
126+
"schedule": crontab(hour=4, minute=3, day_of_week=0),
127+
"options": {"queue": "default"},
128+
},
122129
}

greedybear/cronjobs/attacks.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from greedybear.cronjobs.base import ElasticJob
1010
from greedybear.cronjobs.scoring.scoring_jobs import UpdateScores
1111
from greedybear.cronjobs.sensors import ExtractSensors
12-
from greedybear.models import IOC, GeneralHoneypot, Sensors
12+
from greedybear.models import IOC, GeneralHoneypot, MassScanners, Sensors
1313
from greedybear.settings import EXTRACTION_INTERVAL, LEGACY_EXTRACTION
1414

1515

@@ -82,11 +82,12 @@ def _get_attacker_data(self, honeypot, fields: list) -> list:
8282
if not ip.strip():
8383
continue
8484
dest_ports = [hit["dest_port"] for hit in hits if "dest_port" in hit]
85+
8586
ioc = IOC(
8687
name=ip,
8788
type=self._get_ioc_type(ip),
8889
interaction_count=len(hits),
89-
ip_reputation=hits[0].get("ip_rep", ""),
90+
ip_reputation=self._get_ip_reputation(ip, hits[0]),
9091
asn=hits[0].get("geoip", {}).get("asn"),
9192
destination_ports=sorted(set(dest_ports)),
9293
login_attempts=len(hits) if honeypot.name == "Heralding" else 0,
@@ -98,6 +99,18 @@ def _get_attacker_data(self, honeypot, fields: list) -> list:
9899
iocs.append(ioc)
99100
return iocs
100101

102+
def _get_ip_reputation(self, ip, hit):
103+
ip_reputation = hit.get("ip_rep", "")
104+
if not ip_reputation:
105+
try:
106+
MassScanners.objects.get(ip_address=ip)
107+
except MassScanners.DoesNotExist:
108+
pass
109+
else:
110+
self.log.info(f"IP {ip} is a mass scanner")
111+
ip_reputation = "mass scanner"
112+
return ip_reputation
113+
101114
def _update_scores(self):
102115
if not self.ioc_records:
103116
return

greedybear/cronjobs/commands/cluster.py

Lines changed: 2 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
import numpy as np
21
from greedybear.cronjobs.base import Cronjob
2+
from greedybear.cronjobs.commands.lsh import LSHConnectedComponents
33
from greedybear.models import CommandSequence
4-
from sklearn.cluster import DBSCAN
54

65

76
def tokenize(sequence: list[str]) -> list[str]:
@@ -25,80 +24,6 @@ def tokenize(sequence: list[str]) -> list[str]:
2524
return result
2625

2726

28-
def jaccard_similarity(seq1: list[str], seq2: list[str]) -> float:
29-
"""
30-
Calculate the Jaccard similarity coefficient between two sequences.
31-
32-
The Jaccard similarity coefficient is defined as the size of the intersection
33-
divided by the size of the union of two sets. It ranges from 0 (completely dissimilar)
34-
to 1 (identical).
35-
36-
Args:
37-
seq1: First sequence of strings to compare
38-
seq2: Second sequence of strings to compare
39-
40-
Returns:
41-
float: Jaccard similarity coefficient between the two sequences.
42-
Returns 0 if both sequences are empty.
43-
"""
44-
set1 = set(seq1)
45-
set2 = set(seq2)
46-
intersection = len(set1.intersection(set2))
47-
union = len(set1.union(set2))
48-
return intersection / union if union != 0 else 0
49-
50-
51-
def compute_similarity_matrix(sequences: list[list[str]]) -> np.ndarray:
52-
"""
53-
Compute a pairwise Jaccard similarity matrix for a list of sequences.
54-
55-
Creates a symmetric matrix where each element [i,j] contains the Jaccard
56-
similarity between sequences[i] and sequences[j]. The diagonal elements
57-
are set to 1.0 (self-similarity).
58-
59-
Time and space complexity: O(n²) where n is the number of sequences
60-
61-
Args:
62-
sequences: List of token sequences to compare.
63-
64-
Returns:
65-
np.ndarray: A symmetric n×n matrix of floats where n=len(sequences).
66-
"""
67-
n = len(sequences)
68-
matrix = np.zeros((n, n))
69-
for i in range(n):
70-
for j in range(i + 1, n):
71-
similarity = jaccard_similarity(sequences[i], sequences[j])
72-
matrix[i, j] = similarity
73-
matrix[j, i] = similarity
74-
matrix[i, i] = 1.0
75-
return matrix
76-
77-
78-
def dbscan_clustering(sequences: list[list[str]], eps: float = 0.5) -> np.ndarray:
79-
"""
80-
Cluster sequences using DBSCAN based on Jaccard similarity.
81-
82-
Performs density-based clustering on sequences using their pairwise Jaccard
83-
similarities. The similarity is converted to distance by subtracting from 1.
84-
Sequences with distance less than eps are considered neighbors.
85-
86-
Args:
87-
sequences: List of token sequences to cluster.
88-
eps: Maximum distance between two samples for them to be
89-
considered as in the same neighborhood. Since we use Jaccard distance,
90-
eps=0.5 means sequences must share at least 50% of their tokens to be
91-
considered similar. Defaults to 0.5.
92-
93-
Returns:
94-
np.ndarray: Array of cluster labels. Shape (n_samples,).
95-
"""
96-
similarity_matrix = compute_similarity_matrix(sequences)
97-
distance_matrix = 1 - similarity_matrix
98-
dbscan = DBSCAN(eps=eps, min_samples=1, metric="precomputed")
99-
return dbscan.fit_predict(distance_matrix)
100-
101-
10227
class ClusterCommandSequences(Cronjob):
10328
"""
10429
A cronjob that clusters command sequences based on their similarity.
@@ -124,7 +49,7 @@ def run(self) -> None:
12449
return
12550
self.log.info(f"clustering {len(sequences)} command sequences")
12651
tokenized_seqs = [tokenize(s.commands) for s in sequences]
127-
cluster_labels = dbscan_clustering(tokenized_seqs)
52+
cluster_labels = LSHConnectedComponents().get_components(tokenized_seqs)
12853
seqs_to_update = []
12954
for seq, label in zip(sequences, cluster_labels):
13055
if seq.cluster != label:

0 commit comments

Comments
 (0)