From 54573a21081ca677f17c03599b0c2774deb32c42 Mon Sep 17 00:00:00 2001
From: Katie Worton
Date: Wed, 3 Jul 2024 12:28:28 +0100
Subject: [PATCH] [RFC] log-parser-testing: Create testing tool for log parser

Add a test script for trying out log parser patterns. The library
components of this code can later be pulled out of this script and into
a shared library to allow for reuse.

Signed-off-by: Katie Worton
---
 log-parser-testing.py | 299 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 299 insertions(+)
 create mode 100755 log-parser-testing.py

diff --git a/log-parser-testing.py b/log-parser-testing.py
new file mode 100755
index 0000000..44474e1
--- /dev/null
+++ b/log-parser-testing.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+
+import argparse
+import hashlib
+import logging
+import re
+from collections import defaultdict
+from pathlib import Path
+
+from squad_client.core.api import SquadApi
+from squad_client.core.models import TestRun
+from tuxrun.utils import slugify
+
+from squadutilslib import get_file
+
+SquadApi.configure(cache=3600, url="https://qa-reports.linaro.org/")
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Indices into each (name, body, extracted-name) regex tuple below
+REGEX_NAME = 0
+REGEX_BODY = 1
+REGEX_EXTRACTED_NAME = 2
+
+MULTILINERS = [
+    (
+        "check-kernel-exception",
+        r"-+\[? cut here \]?-+.*?-+\[? end trace \w* \]?-+",
+        r"\d][^\+\n]*",
+    ),
+    (
+        "check-kernel-kasan",
+        r"=+\n\[[\s\.\d]+\]\s+BUG: KASAN:.*?=+",
+        r"BUG: KASAN:[^\+\n]*",
+    ),
+    (
+        "check-kernel-kfence",
+        r"=+\n\[[\s\.\d]+\]\s+BUG: KFENCE:.*?=+",
+        r"BUG: KFENCE:[^\+\n]*",
+    ),
+]
+
+ONELINERS = [
+    (
+        "check-kernel-oops",
+        r"^[^\n]+Oops(?: -|:).*?$",
+        r"Oops[^\+\n]*",
+    ),
+    (
+        "check-kernel-fault",
+        r"^[^\n]+Unhandled fault.*?$",
+        r"Unhandled [^\+\n]*",
+    ),
+    (
+        "check-kernel-warning",
+        r"^[^\n]+WARNING:.*?$",
+        r"WARNING: [^\+\n]*",
+    ),
+    (
+        "check-kernel-bug",
+        r"^[^\n]+(?: kernel BUG at|BUG:).*?$",
+        r"BUG: [^\+\n]*",
+    ),
+    (
+        "check-kernel-invalid-opcode",
+        r"^[^\n]+invalid opcode:.*?$",
+        r"invalid opcode: [^\+\n]*",
+    ),
+    (
+        "check-kernel-panic",
+        r"Kernel panic - not syncing.*?$",
+        r"Kernel [^\+\n]*",
+    ),
+]
+
+# Tip: broader regexes should come first
+REGEXES = MULTILINERS + ONELINERS
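+# Illustrative (made-up) examples of lines these patterns target:
+#   "[   12.345678] WARNING: CPU: 0 PID: 1 at ..."    -> check-kernel-warning
+#   "[   12.345678] Kernel panic - not syncing: ..."  -> check-kernel-panic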
+
+
+def arg_parser():
+    parser = argparse.ArgumentParser(description="Test out SQUAD log parser patterns")
+
+    parser.add_argument(
+        "--file",
+        help="Path to a single log file to test",
+    )
+
+    parser.add_argument(
+        "--directory",
+        help="Path to a directory containing one or more log files named"
+        " `full_log.log`. Log parser outputs will be placed in a directory"
+        " next to each full log.",
+    )
+
+    parser.add_argument(
+        "--testrun",
+        help="SQUAD TestRun ID for a single TestRun log to test. This will"
+        " download the file to `./<testrun_id>.log`, then run the log"
+        " parser",
+    )
+
+    return parser
+
+
+class LogParserLib:
+    """
+    This should later be moved to a common library that both SQUAD and the
+    log parser testing script can use.
+    """
+
+    @classmethod
+    def __compile_regexes(cls, regexes):
+        combined = [r"(%s)" % r[REGEX_BODY] for r in regexes]
+        return re.compile(r"|".join(combined), re.S | re.M)
+
+    @classmethod
+    def __join_matches(cls, matches, regexes):
+        """
+        Matches for a multi-group regex are returned by Python as a list of
+        tuples, with each group's match in one of the positions in the
+        tuple. Example:
+            regex = r'(a)|(b)|(c)'
+            matches = [
+                ('match a', '', ''),
+                ('', 'match b', ''),
+                ('match a', '', ''),
+                ('', '', 'match c')
+            ]
+        """
+        snippets = {regex_id: [] for regex_id in range(len(regexes))}
+        for match in matches:
+            for regex_id in range(len(regexes)):
+                if len(match[regex_id]) > 0:
+                    snippets[regex_id].append(match[regex_id])
+        return snippets
+
+    @classmethod
+    def __create_shasum(cls, snippet):
+        sha = hashlib.sha256()
+        # Strip numbers, addresses and timestamps so identical errors hash
+        # to the same value across runs
+        without_numbers = re.sub(r"(0x[a-f0-9]+|[<\[][0-9a-f]+?[>\]]|\d+)", "", snippet)
+        without_time = re.sub(r"^\[[^\]]+\]", "", without_numbers)
+        sha.update(without_time.encode())
+        return sha.hexdigest()
+
+    @classmethod
+    def __create_name(cls, snippet, regex):
+        matches = regex.findall(snippet)
+        if not matches:
+            return None
+        snippet = matches[0]
+        without_numbers = re.sub(r"(0x[a-f0-9]+|[<\[][0-9a-f]+?[>\]]|\d+)", "", snippet)
+        without_time = re.sub(r"^\[[^\]]+\]", "", without_numbers)
+        # Truncate the name so it is max 190 characters - this ensures the
+        # name + "-" + sha will not be longer than 255 characters
+        return slugify(without_time)[:190]
+
+    @classmethod
+    def __group_snippets_by_name(cls, lines, test_name, test_name_regex):
+        """
+        There will be at least one test per regex. If there were any matches
+        for a given regex, then a new test will be generated using
+        test_name + shasum. This helps when comparing kernel logs across
+        different builds.
+        """
+        snippets_by_regex_name = dict()
+        snippets_by_generated_test_name_and_sha = dict()
+
+        snippets_by_regex_name[test_name] = "\n".join(lines)
+
+        # Some lines of the matched regex might be the same, and we don't want
+        # to create multiple tests like test1-sha1, test1-sha1, etc, so we'll
+        # create a set of shasums and only create new tests for unique shas
+        shas = defaultdict(set)
+        for line in lines:
+            name_and_sha = ""
+            if test_name_regex:
+                name = cls.__create_name(line, test_name_regex)
+                if name:
+                    name_and_sha = name + "-"
+            name_and_sha += cls.__create_shasum(line)
+            shas[name_and_sha].add(line)
+
+        for sha, matched_lines in shas.items():
+            name = f"{test_name}-{sha}"
+            snippets_by_generated_test_name_and_sha[name] = "\n---\n".join(
+                matched_lines
+            )
+
+        return snippets_by_regex_name, snippets_by_generated_test_name_and_sha
+
+    @classmethod
+    def parse_log(cls, log, regexes):
+        regex = cls.__compile_regexes(regexes)
+        matches = regex.findall(log)
+        snippets = cls.__join_matches(matches, regexes)
+
+        snippets_by_regex_name = dict()
+        snippets_by_generated_test_name_and_sha = dict()
+
+        for regex_id in range(len(regexes)):
+            test_name = regexes[regex_id][REGEX_NAME]
+            test_name_regex = None
+            if regexes[regex_id][REGEX_EXTRACTED_NAME] is not None:
+                test_name_regex = re.compile(
+                    regexes[regex_id][REGEX_EXTRACTED_NAME], re.S | re.M
+                )
+            (
+                snippets_by_regex_name[regex_id],
+                snippets_by_generated_test_name_and_sha[regex_id],
+            ) = cls.__group_snippets_by_name(
+                snippets[regex_id], test_name, test_name_regex
+            )
+
+        return snippets_by_regex_name, snippets_by_generated_test_name_and_sha
+
+
+class LogParserUtilities:
+    @classmethod
+    def cutoff_boot_log(cls, log):
+        # Attempt to split the log at " login:"
+        logs = log.split(" login:", 1)
+
+        # A single string means no split was done; treat the whole log as
+        # the test log
+        if len(logs) == 1:
+            return "", log
+
+        boot_log = logs[0]
+        test_log = logs[1]
+        return boot_log, test_log
+
+    @classmethod
+    def kernel_msgs_only(cls, log):
+        kernel_msgs = re.findall(r"(\[[ \d]+\.[ \d]+\] .*?)$", log, re.S | re.M)
+        return "\n".join(kernel_msgs)
+
+
+class LogParserTester:
+
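+    """
+    Drives the end-to-end flow: split the log into boot and test portions,
+    reduce each to kernel messages only, then run the regex parser over
+    the result.
+    """
+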
+    @classmethod
+    def post_process_log(cls, log):
+        boot_log, test_log = LogParserUtilities.cutoff_boot_log(log)
+        logs = {
+            "boot": boot_log,
+            "test": test_log,
+        }
+
+        for log_type, log_text in logs.items():
+            log_text = LogParserUtilities.kernel_msgs_only(log_text)
+            suite_slug = f"log-parser-{log_type}"
+            logger.info(f"suite slug: {suite_slug}")
+            snippets_by_regex_name, snippets_by_generated_test_name_and_sha = (
+                LogParserLib.parse_log(log_text, REGEXES)
+            )
+
+        # Only the snippets from the final iteration (the "test" log) are
+        # returned
+        return snippets_by_generated_test_name_and_sha, snippets_by_regex_name
+
+
+if __name__ == "__main__":
+    args = arg_parser().parse_args()
+    if args.testrun:
+        file = get_file(TestRun(args.testrun).log_file, f"{args.testrun}.log")
+        log = Path(file).read_text()
+        LogParserTester.post_process_log(log)
+    if args.file:
+        if Path(args.file).is_file():
+            log = Path(args.file).read_text()
+            LogParserTester.post_process_log(log)
+        else:
+            logger.error(f"{args.file} is not a file.")
+    if args.directory:
+        log_dir_by_regex_name = "logs_regex_name"
+        log_dir_by_generated_test_name_and_sha = "logs_generated_test_name_and_sha"
+        if Path(args.directory).is_dir():
+            logs = list(Path(args.directory).rglob("full_log.log"))
+            for log_file_path in logs:
+                log = Path(log_file_path).read_text()
+                snippets_by_generated_test_name_and_sha, snippets_by_regex_name = (
+                    LogParserTester.post_process_log(log)
+                )
+                for snippets in snippets_by_generated_test_name_and_sha.values():
+                    for test_name, log_lines in snippets.items():
+                        extracted_logs_path = (
+                            log_file_path.parent
+                            / Path(log_dir_by_generated_test_name_and_sha)
+                            / Path(test_name)
+                        )
+                        extracted_logs_path.parent.mkdir(parents=True, exist_ok=True)
+                        extracted_logs_path.write_text(log_lines)
+                        logger.info(extracted_logs_path)
+                for snippets in snippets_by_regex_name.values():
+                    for test_name, log_lines in snippets.items():
+                        extracted_logs_path = (
+                            log_file_path.parent
+                            / Path(log_dir_by_regex_name)
+                            / Path(test_name)
+                        )
+                        extracted_logs_path.parent.mkdir(parents=True, exist_ok=True)
+                        extracted_logs_path.write_text(log_lines)
+                        logger.info(extracted_logs_path)
+        else:
+            logger.error(f"{args.directory} is not a directory.")
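
Example invocations (the paths and TestRun ID here are placeholders for
illustration only):

    ./log-parser-testing.py --file ./full_log.log
    ./log-parser-testing.py --directory ./downloaded-logs
    ./log-parser-testing.py --testrun 1234567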