Skip to content

Commit 1cc3d07

Browse files
committed
main: added script, helpers and README
This commit adds the first version of the report2mermaid script.
1 parent 05fec45 commit 1cc3d07

File tree

6 files changed

+249
-0
lines changed

6 files changed

+249
-0
lines changed

.gitignore

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Input and output files
2+
input_file.txt
3+
result.txt
4+
5+
# Byte-compiled
6+
helpers/__pycache__/
7+
__pycache__/
8+
9+
# Environment
10+
venv/

README.md

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# DataFusion Explain output transformer to Mermaid format
2+
3+
This script transforms DataFusion Explain Analyze output to Mermaid readable text format for simple graph generation.
4+
5+
## Usage
6+
7+
```
8+
python3 main.py [-h] [-f INPUT_FILE] [-o OUTPUT_FILE] [-d]
9+
```
10+
11+
Convert DataFusion Explain Analyze to Mermaid
12+
13+
options:
14+
-h, --help show this help message and exit
15+
-f INPUT_FILE, --input-file INPUT_FILE path to file with Explain Analyze data, default - `input_file.txt`
16+
-o OUTPUT_FILE, --output-file OUTPUT_FILE path to output file, default - `result.txt`
17+
-d, --debug print log to stdout
18+
19+
## Sample input
20+
21+
```
22+
CoalescePartitionsExec, metrics=[]
23+
ProjectionExec: expr=[SUM(table.x)@1 as SUM(x)], metrics=[]
24+
HashAggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[SUM(x)], metrics=[outputRows=2]
25+
CoalesceBatchesExec: target_batch_size=4096, metrics=[]
26+
RepartitionExec: partitioning=Hash([Column { name: "b", index: 0 }], 16), metrics=[sendTime=839560, fetchTime=122528525, repartitionTime=5327877]
27+
HashAggregateExec: mode=Partial, gby=[b@1 as b], aggr=[SUM(x)], metrics=[outputRows=2]
28+
RepartitionExec: partitioning=RoundRobinBatch(16), metrics=[fetchTime=5660489, repartitionTime=0, sendTime=8012]
29+
CsvExec: file_groups={1 group: [[/tmp/table.csv]]}, has_header=false, metrics=[]
30+
```
31+
32+
## Sample output
33+
34+
```text
35+
graph BT
36+
37+
0[<b>CoalescePartitionsExec</b>
38+
]
39+
1[<b>ProjectionExec</b>
40+
] --> 0
41+
2[<b>HashAggregateExec</b>
42+
outputRows=2] --> 1
43+
3[<b>CoalesceBatchesExec</b>
44+
] --> 2
45+
4[<b>RepartitionExec</b>
46+
sendTime=839560\n fetchTime=122528525\n repartitionTime=5327877] --> 3
47+
5[<b>HashAggregateExec</b>
48+
outputRows=2] --> 4
49+
6[<b>RepartitionExec</b>
50+
fetchTime=5660489\n repartitionTime=0\n sendTime=8012] --> 5
51+
7[<b>CsvExec</b>
52+
] --> 6
53+
```
54+
55+
## Sample diagram
56+
```mermaid
57+
graph BT
58+
59+
0[<b>CoalescePartitionsExec</b>
60+
]
61+
1[<b>ProjectionExec</b>
62+
] --> 0
63+
2[<b>HashAggregateExec</b>
64+
outputRows=2] --> 1
65+
3[<b>CoalesceBatchesExec</b>
66+
] --> 2
67+
4[<b>RepartitionExec</b>
68+
sendTime=839560\n fetchTime=122528525\n repartitionTime=5327877] --> 3
69+
5[<b>HashAggregateExec</b>
70+
outputRows=2] --> 4
71+
6[<b>RepartitionExec</b>
72+
fetchTime=5660489\n repartitionTime=0\n sendTime=8012] --> 5
73+
7[<b>CsvExec</b>
74+
] --> 6
75+
```

helpers/__init__.py

Whitespace-only changes.

helpers/helpers.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# fill the dict with {index: indents_count}; the indent count indicates each node's level in the graph
2+
from main import debug_mode
3+
4+
def generate_index_to_level_dict(source):
    """Map each non-blank line's index to its leading-indent width.

    source: list of raw lines from the Explain Analyze output.

    Returns a dict {line_index: indent_width}; blank (whitespace-only)
    lines are skipped, so the keys are not necessarily contiguous.
    The indent width encodes the node's depth in the plan tree.
    """
    return {
        i: len(line) - len(line.lstrip())
        for i, line in enumerate(source)
        if line.strip()
    }
12+
13+
14+
def find_parent(index, indents, INDEX_TO_LEVEL):
    """Locate the parent entry for a plan node.

    The parent is the closest earlier line whose indent is exactly one
    level (2 spaces, DataFusion's indent step) shallower.

    index: line index of the current node.
    indents: indent width of the current node.
    INDEX_TO_LEVEL: {line_index: indent_width} mapping (see
        generate_index_to_level_dict).

    Returns the parent's (index, indent_width) pair, or None for a root
    node (no shallower line precedes it).
    """
    candidates = [item for item in INDEX_TO_LEVEL.items()
                  if item[0] < index and item[1] == indents - 2]
    if not candidates:
        return None
    # Nearest preceding candidate = largest line index. (The previous code
    # sorted by level, which only picked the right one because dicts keep
    # insertion order — all candidates share the same level.)
    return max(candidates, key=lambda item: item[0])
22+
23+
24+
def debug(line):
    """Print *line* to stdout when the script runs with --debug; no-op otherwise.

    Relies on `debug_mode`, imported at module level from main.
    NOTE(review): the helpers<->main import is circular; it works because
    main only reads helpers after its own module level finishes — confirm
    before restructuring.
    """
    # The original `if not debug_mode: pass / else: print` inverted the
    # condition just to do nothing in one branch.
    if debug_mode:
        print(line)

helpers/node.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import json
2+
3+
4+
class Node:
    """One execution-plan operator parsed from a line of Explain Analyze.

    Tracks its place in the plan tree (parent / children) and renders
    itself as a Mermaid graph statement.
    """

    def __init__(self, index, name, content, parent=None):
        """Create a node.

        index: zero-based source-line index, used as the Mermaid node id.
        name: operator name, e.g. 'ProjectionExec'.
        content: rest of the line after the name (holds `metrics=[...]`).
        parent: optional parent Node; normally set later via add_parent().
        """
        self.index = index
        self.name = name
        self.content = content
        self.parent = parent
        # Kept as `childrens` (sic): childrens_repr()/pprint() and any
        # external callers rely on this attribute name.
        self.childrens = list()
        # Text inside `metrics=[...]`; commas become a literal backslash-n
        # so Mermaid renders each metric on its own line.
        self.metrics = self.content.split('metrics=')[-1].\
            lstrip('[').rstrip(']').replace(',', '\\n')

    def __repr__(self):
        return str(self.name)

    def content_to_json(self):
        """Best-effort conversion of the raw operator content to JSON.

        Returns the parsed object on success, or a diagnostic string when
        the rewritten text still is not valid JSON. (The previous version
        printed the result and returned None, and left debug prints in.)
        """
        # Replace non-JSON symbols with JSON ones, wrapping bare words in
        # quotes; order of the replacements matters.
        symbols_to_replace = {
            ', ': '", "',
            '=': '":"',
            '[': '"{"',
            ']': '"}"',
            '""': '',
            '}"}': '}}'
        }
        self.content_processor = '{"' + self.content.strip() + '}'
        for src, dst in symbols_to_replace.items():
            self.content_processor = self.content_processor.replace(src, dst)

        # Rewrite every `{ ...no colon... }` span as a JSON list.
        # Start at 1 so the document's own opening brace is never touched.
        def replace_non_dict(start=1):
            dict_start = self.content_processor.find('{', start)
            if dict_start == -1:
                # Base case: no brace left. Without this guard the -1 fell
                # through and the negative slices corrupted the string.
                return
            dict_end = self.content_processor.find('}', dict_start)
            colon = self.content_processor.find(':', dict_start, dict_end)
            if colon == -1:
                self.content_processor = (
                    self.content_processor[:dict_start]
                    + '['
                    + self.content_processor[dict_start + 1:dict_end]
                    + ']'
                    + self.content_processor[dict_end + 1:]
                )
            if dict_start != self.content_processor.rfind('{', start):
                replace_non_dict(start=dict_end)

        replace_non_dict()
        try:
            return json.loads(self.content_processor)
        except ValueError as e:
            # Best-effort contract: report the failure instead of raising.
            return f"""
>>>>>>>>>>>>>>>>ERROR
{self.content_processor}
>>>>>>>>>>>>>>>>{e}
"""

    def add_parent(self, parent):
        """Link this node under *parent* and register it as parent's child."""
        self.parent = parent
        parent.childrens.append(self)

    def mermaid_repr(self):
        """Return the Mermaid statement for this node.

        Format: `<index>[<b><name></b> NEWLINE <metrics>]`, plus
        ` --> <parent index>` when the node has a parent.
        """
        res = f"{self.index}[<b>{self.name}</b>\n{self.metrics}]"
        if self.parent:
            res += f" --> {self.parent.index}"
        return res

    def childrens_repr(self):
        """Return the list of child node indices, or None for a leaf."""
        if not self.childrens:
            return None
        return [child.index for child in self.childrens]

    def pprint(self):
        """Return a multi-line human-readable description (for debug logs)."""
        return f"""
NAME: {self.name}
CONTENT: {self.content}
PARENT: {self.parent.index if self.parent else None}
CHILDRENS: {self.childrens_repr()}
------------------------------------------------\n
"""

main.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/python3
"""Convert DataFusion Explain Analyze output into Mermaid graph text."""
import argparse

from helpers.helpers import *
from helpers.node import Node


parser = argparse.ArgumentParser(
    description='Convert DataFusion Explain Analyze to Mermaid')
parser.add_argument('-f', '--input-file',
                    default='input_file.txt',
                    help='path to file with Explain Analyze data, default '
                         '- `input_file.txt`')
parser.add_argument('-o', '--output-file',
                    default='result.txt',
                    help='path to output file, default - `result.txt`')
parser.add_argument('-d', '--debug',
                    action='store_true',
                    help='print log to stdout')
args = parser.parse_args()

# Read by helpers.debug(); must stay a module-level name.
debug_mode = args.debug


def _split_operator_line(line):
    """Split a stripped plan line into (operator name, remaining content).

    The name ends at the first ':' or ',' — e.g. 'ProjectionExec: expr=...'
    or 'CoalescePartitionsExec, metrics=[]'. The latter has no colon, which
    crashed the previous unconditional split-on-':'.
    """
    cut = min((pos for pos in (line.find(':'), line.find(','))
               if pos != -1), default=len(line))
    return line[:cut], line[cut:]


if __name__ == "__main__":
    with open(args.input_file, 'r') as f:
        source = f.readlines()

    # line index -> indent width; blank lines are omitted.
    INDEX_TO_LEVEL = generate_index_to_level_dict(source)

    nodes = dict()
    # Iterate the actual keys: they are NOT contiguous when the input has
    # blank lines, so the old `range(len(INDEX_TO_LEVEL))` loop mis-indexed.
    for i, level in INDEX_TO_LEVEL.items():
        name, content_raw = _split_operator_line(source[i].strip())
        node = Node(i, name, content_raw)
        nodes[i] = node
        parent = find_parent(i, level, INDEX_TO_LEVEL)
        if parent is not None:  # None -> root node, nothing to link
            node.add_parent(nodes[parent[0]])

    RESULT = ["graph BT\n"]
    for node in nodes.values():
        RESULT.append(node.mermaid_repr())
        debug(node.pprint())

    with open(args.output_file, 'w') as f:
        f.write('\n'.join(RESULT))

    print(f'Result saved to {args.output_file}')

0 commit comments

Comments
 (0)