From 4335ef97560d7ccb39bfbd0c03e7b34519dbc76d Mon Sep 17 00:00:00 2001
From: Lukas Pukenis
Date: Fri, 20 Dec 2024 12:41:39 +0200
Subject: [PATCH] Misalign batch testcase and add score

* Add 4 peers instead of 2 to the batching testcase
* Misalign them by starting each with a delay. This is synthetic, since
  in the real world peers don't usually all start at the same time and
  establish direct connections, but it is still worth having in the
  testcase.
* Misalign further by blocking alpha <-> beta for a while so they
  establish a direct connection later than with the other peers, giving
  a misalignment closer to the real world.
* Add scoring to the packet delay histogram. It can now be quantified
  how much better or worse a solution or scenario is. For now the score
  is only printed.

Signed-off-by: Lukas Pukenis
---
 nat-lab/tests/test_batching.py | 165 ++++++++++++++++++---------------
 nat-lab/tests/timeouts.py      |   2 +-
 nat-lab/tests/utils/traffic.py |   9 ++
 3 files changed, 102 insertions(+), 74 deletions(-)

diff --git a/nat-lab/tests/test_batching.py b/nat-lab/tests/test_batching.py
index 6ed0a5e74..1ac6788b5 100644
--- a/nat-lab/tests/test_batching.py
+++ b/nat-lab/tests/test_batching.py
@@ -2,10 +2,10 @@
 import itertools
 import pytest
 from contextlib import AsyncExitStack
-from helpers import SetupParameters, setup_environment, setup_mesh_nodes
-from itertools import zip_longest
+from helpers import setup_api, setup_connections, SetupParameters, setup_mesh_nodes
 from scapy.layers.inet import TCP, UDP, ICMP  # type: ignore
 from scapy.layers.l2 import ARP  # type: ignore
+from telio import Client
 from timeouts import TEST_BATCHING_TIMEOUT
 from typing import List
 from utils.asyncio_util import run_async_context
@@ -17,7 +17,6 @@
     FeatureBatching,
     EndpointProvider,
     RelayState,
-    LinkState,
     NodeState,
     PathType,
     TelioAdapterType,
@@ -29,17 +28,19 @@
     render_chart,
     generate_packet_distribution_histogram,
     generate_packet_delay_histogram,
+    get_ordered_histogram_score,
 )
 
-BATCHING_MISALIGN_S = 7
-BATCHING_CAPTURE_TIME = 120  # Tied to TEST_BATCHING_TIMEOUT
+BATCHING_CAPTURE_TIME = 130
+DOCKER_CONE_GW_2_IP = "10.0.254.2"
 
 
 def _generate_setup_parameters(
     conn_tag: ConnectionTag, adapter: TelioAdapterType, batching: bool
 ) -> SetupParameters:
-    features = features_with_endpoint_providers([EndpointProvider.STUN])
-
+    features = features_with_endpoint_providers(
+        [EndpointProvider.STUN, EndpointProvider.LOCAL]
+    )
     features.link_detection = FeatureLinkDetection(
         rtt_seconds=1, no_of_pings=1, use_for_downgrade=True
     )
@@ -75,27 +76,27 @@ def _generate_setup_parameters(
         ConnectionTag.DOCKER_CONE_CLIENT_2,
         TelioAdapterType.LINUX_NATIVE_TUN,
     ),
+    (
+        ConnectionTag.DOCKER_OPEN_INTERNET_CLIENT_1,
+        TelioAdapterType.LINUX_NATIVE_TUN,
+    ),
+    (
+        ConnectionTag.DOCKER_OPEN_INTERNET_CLIENT_2,
+        TelioAdapterType.LINUX_NATIVE_TUN,
+    ),
 ]
 
-# This test captures histograms of network activity to evaluate the effect of local batching in libtelio.
-# Since only local batching is implemented, no client-generated traffic should occur during the test.
-# External traffic (incoming data) could distort the histograms, and receive-data-triggered batching is
-# not yet supported in libtelio. The test setup is simple: all clients are interconnected and remain idle
-# for an extended period. This idle period allows for a visual observation.
-# Local batching will only have an effect of batching multiple local keepalives into one bundle but will
-# not do anything about syncing the keepalives between the peers.
 
 @pytest.mark.asyncio
 @pytest.mark.timeout(TEST_BATCHING_TIMEOUT)
 @pytest.mark.parametrize(
-    "setup_params,misalign_sleep_s,capture_duration",
+    "setup_params,capture_duration",
     [
         pytest.param(
             [
-                _generate_setup_parameters(conn_tag, adapter, True)
+                _generate_setup_parameters(conn_tag, adapter, False)
                 for conn_tag, adapter in ALL_NODES
             ],
-            BATCHING_MISALIGN_S,
             BATCHING_CAPTURE_TIME,
             marks=[
                 pytest.mark.batching,
@@ -103,10 +104,9 @@ def _generate_setup_parameters(
         ),
         pytest.param(
             [
-                _generate_setup_parameters(conn_tag, adapter, False)
+                _generate_setup_parameters(conn_tag, adapter, True)
                 for conn_tag, adapter in ALL_NODES
             ],
-            BATCHING_MISALIGN_S,
             BATCHING_CAPTURE_TIME,
             marks=[
                 pytest.mark.batching,
@@ -116,19 +116,38 @@
 )
 async def test_batching(
     setup_params: List[SetupParameters],
-    misalign_sleep_s: int,
     capture_duration: int,
 ) -> None:
+    """Batching test: all peers idle after forming direct connections.
+    Packet capture is used to observe how the traffic flows; the captures are then processed and displayed.
+    """
+
     async with AsyncExitStack() as exit_stack:
-        env = await exit_stack.enter_async_context(
-            setup_environment(exit_stack, setup_params)
+        api, nodes = setup_api(
+            [(instance.is_local, instance.ip_stack) for instance in setup_params]
+        )
+        connection_managers = await setup_connections(
+            exit_stack,
+            [
+                (
+                    instance.connection_tag,
+                    instance.connection_tracker_config,
+                )
+                for instance in setup_params
+            ],
         )
-        await asyncio.gather(*[
-            client.wait_for_state_on_any_derp([RelayState.CONNECTED])
-            for client, instance in zip_longest(env.clients, setup_params)
-            if instance.derp_servers != []
-        ])
+        clients = []
+        for node, conn_man, params in zip(nodes, connection_managers, setup_params):
+            client = Client(
+                conn_man.connection, node, params.adapter_type_override, params.features
+            )
+            clients.append(client)
+
+        alpha_client, beta_client, *_ = clients
+        alpha_node, beta_node, *_ = nodes
+
+        # Start capture tasks
 
         # We capture the traffic from all nodes and gateways.
         # On gateways we are sure the traffic has left the machine, however no easy way to
@@ -136,33 +155,49 @@ async def test_batching(
         # client traffic can be inspected.
         gateways = [DOCKER_GW_MAP[param.connection_tag] for param in setup_params]
         gateway_container_names = [container_id(conn_tag) for conn_tag in gateways]
-        conns = [client.get_connection() for client in env.clients]
+        conns = [client.get_connection() for client in clients]
         node_container_names = [
            conn.container_name()
             for conn in conns
             if isinstance(conn, DockerConnection)
         ]
-        container_names = gateway_container_names + node_container_names
+        container_names = sorted(
+            list(set(gateway_container_names + node_container_names))
+        )
+        print("Will capture batching on containers: ", container_names)
 
-        cnodes = zip(env.clients, env.nodes)
+        pcap_capture_tasks = []
+        for name in container_names:
+            pcap_task = asyncio.create_task(
+                capture_traffic(
+                    name,
+                    capture_duration,
+                )
+            )
+            pcap_capture_tasks.append(pcap_task)
 
-        # Misalign the peers by first stopping all of them and then restarting after various delays.
-        # This will have an effect of forcing neighboring libtelio node to add the peer to internal lists
-        # for keepalives at various points in time thus allowing us to observe better
-        # if the local batching is in action.
-        for client in env.clients:
-            await client.stop_device()
+        async def delayed_task(delay, node, client):
+            await asyncio.sleep(delay)
+            return await exit_stack.enter_async_context(
+                client.run(api.get_meshnet_config(node.id))
+            )
+
+        tasks = []
+        for i, (client, node) in enumerate(zip(clients, nodes)):
+            tasks.append(asyncio.create_task(delayed_task(i * 3, node, client)))
 
-        # misalign the peers by sleeping some before starting each node again
-        async def start_node_manually(client, node, sleep_s):
-            await asyncio.sleep(sleep_s)
-            await client.simple_start()
-            await client.set_meshnet_config(env.api.get_meshnet_config(node.id))
+        # Deliberately block the direct path alpha <-> beta. Alpha and beta will still form direct connections with the other peers,
+        # but the alpha <-> beta connection only forms after a delay, causing a misalignment that better reflects real-world keepalive flow.
+        async with AsyncExitStack() as exit_stack2:
+            await exit_stack2.enter_async_context(
+                alpha_client.get_router().disable_path(DOCKER_CONE_GW_2_IP),
+            )
+            await asyncio.sleep(20)
 
         await asyncio.gather(*[
-            start_node_manually(client, node, i * misalign_sleep_s)
-            for i, (client, node) in enumerate(cnodes)
+            client.wait_for_state_on_any_derp([RelayState.CONNECTED])
+            for client in [alpha_client, beta_client]
         ])
 
         await asyncio.gather(*[
@@ -173,55 +208,37 @@ async def start_node_manually(client, node, sleep_s):
                     )
                 )
             )
-            for client, node in itertools.product(env.clients, env.nodes)
+            for client, node in itertools.product(clients, nodes)
            if not client.is_node(node)
         ])
 
+        print("All peers directly interconnected")
+
         pyro5_ports = [
-            int(port) for port in {client.get_proxy_port() for client in env.clients}
+            int(port) for port in {client.get_proxy_port() for client in clients}
         ]
 
         print("Pyro ports", pyro5_ports)
-        # In general it's not great to filter traffic but for testing and observing
-        # it's crucial since it distorts the results. For example Pyro traffic is a constant stream of
-        # TCP packets
         allow_pcap_filters = [
             (
-                "No Pyro5, SSDP, ARP",
-                lambda p: (
-                    (
-                        (p.haslayer(UDP) or p.haslayer(TCP))
-                        and p.sport not in pyro5_ports
-                        and p.dport not in pyro5_ports
-                    )
-                    and (
-                        not p.haslayer(ICMP)
-                        or p.haslayer(ICMP)
-                        and p[ICMP].type in [0, 8]
-                    )
-                    and (
-                        p.haslayer(UDP)
-                        and p[UDP].sport != 1900
-                        and p[UDP].dport != 1900
+                "No Pyro5 and no ARP",
+                lambda p: (
+                    (
+                        (not p.haslayer(TCP))
+                        or (
+                            p.haslayer(TCP)
+                            and p.sport not in pyro5_ports
+                            and p.dport not in pyro5_ports
+                        )
                     )
                     and (not p.haslayer(ARP))
                 ),
             ),
         ]
 
-        pcap_capture_tasks = []
-        for name in container_names:
-            pcap_task = asyncio.create_task(
-                capture_traffic(
-                    name,
-                    capture_duration,
-                )
-            )
-            pcap_capture_tasks.append(pcap_task)
+        is_batching_enabled = clients[0].get_features().batching is not None
 
         pcap_paths = await asyncio.gather(*pcap_capture_tasks)
-
-        is_batching_enabled = env.clients[0].get_features().batching is not None
         for container, pcap_path in zip(container_names, pcap_paths):
             distribution_hs = generate_packet_distribution_histogram(
                 pcap_path, capture_duration, allow_pcap_filters
@@ -243,6 +260,8 @@ async def start_node_manually(client, node, sleep_s):
             print("Delay chart below")
             print(delay_chart)
 
+            print("Score: ", get_ordered_histogram_score(delay_hs))
+
 
 def proxying_peer_parameters(clients: List[ConnectionTag]):
     def features():
diff --git a/nat-lab/tests/timeouts.py b/nat-lab/tests/timeouts.py
index 1dfb344c0..3b693906b 100644
--- a/nat-lab/tests/timeouts.py
+++ b/nat-lab/tests/timeouts.py
@@ -12,4 +12,4 @@
 TEST_NODE_STATE_FLICKERING_RELAY_TIMEOUT = 180
 TEST_NODE_STATE_FLICKERING_DIRECT_TIMEOUT = 180
 TEST_MESH_STATE_AFTER_DISCONNECTING_NODE_TIMEOUT = 300
-TEST_BATCHING_TIMEOUT = 1000
+TEST_BATCHING_TIMEOUT = 600
diff --git a/nat-lab/tests/utils/traffic.py b/nat-lab/tests/utils/traffic.py
index 1c73e1441..7fd608cd9 100644
--- a/nat-lab/tests/utils/traffic.py
+++ b/nat-lab/tests/utils/traffic.py
@@ -118,6 +118,15 @@ def generate_packet_delay_histogram(
     return generate_histogram(timestamps, buckets)
 
 
+def get_ordered_histogram_score(data: typing.List[int]) -> int:
+    # Assumes the histogram order matters: each bucket further to the right adds more to the score
+    # Useful to quantify a score for things like the periods between packets
+    score = 0
+    for i, value in enumerate(data, start=1):
+        score += i * value
+    return score
+
+
 def generate_packet_distribution_histogram(
     pcap_path: str,
     buckets: int,
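
The delay histogram that the score is computed from is produced by generate_packet_delay_histogram(), which ends by bucketing packet timestamps via generate_histogram(timestamps, buckets). The snippet below is a simplified, self-contained sketch of that idea: it buckets the gaps between consecutive packet timestamps into one-second buckets. The helper name, bucket width, and sample timestamps are assumptions chosen for illustration, not the nat-lab implementation.

from typing import List


def delay_histogram(timestamps: List[float], buckets: int) -> List[int]:
    # Bucket the gaps between consecutive packet timestamps (in seconds),
    # one bucket per whole second, clamping larger gaps into the last bucket.
    hist = [0] * buckets
    for earlier, later in zip(timestamps, timestamps[1:]):
        gap = later - earlier
        hist[min(int(gap), buckets - 1)] += 1
    return hist


# Keepalives roughly every second vs. keepalives bundled roughly every 5 seconds.
print(delay_histogram([0.0, 1.2, 2.3, 3.5, 4.6], buckets=6))  # [0, 4, 0, 0, 0, 0]
print(delay_histogram([0.0, 5.0, 10.0, 15.0], buckets=6))     # [0, 0, 0, 0, 0, 3]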
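The score printed by the test weights each histogram bucket by its 1-based position, so counts falling further to the right contribute more to the total. A minimal standalone sketch of the same weighting, with made-up bucket counts (the variable names and numbers are illustrative only):

from typing import List


def ordered_histogram_score(data: List[int]) -> int:
    # Same weighting as get_ordered_histogram_score(): bucket i (1-based)
    # contributes i * count, so mass further to the right raises the score.
    return sum(i * value for i, value in enumerate(data, start=1))


concentrated_left = [40, 5, 3, 1, 1]        # most gaps land in the first bucket
spread_to_the_right = [10, 10, 10, 10, 10]  # gaps spread evenly across buckets

print(ordered_histogram_score(concentrated_left))    # 1*40 + 2*5 + 3*3 + 4*1 + 5*1 = 68
print(ordered_histogram_score(spread_to_the_right))  # 10 * (1 + 2 + 3 + 4 + 5) = 150

Because the weighting is purely positional, two capture runs are only comparable when they use the same capture duration and bucket layout.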