Skip to content

Commit 1cc3d07

Browse files
committed
main: added script, helpers and README
This commit adds the first version of the report2mermaid script.
1 parent 05fec45 commit 1cc3d07

File tree

6 files changed

+249
-0
lines changed

6 files changed

+249
-0
lines changed

.gitignore

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Input and output files
2+
input_file.txt
3+
result.txt
4+
5+
# Byte-compiled
6+
helpers/__pycache__/
7+
__pycache__/
8+
9+
# Environment
10+
venv/

README.md

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# DataFusion Explain output transformer to Mermaid format
2+
3+
This script transforms DataFusion Explain Analyze output to Mermaid readable text format for simple graph generation.
4+
5+
## Usage
6+
7+
```
8+
python3 main.py [-h] [-f INPUT_FILE] [-o OUTPUT_FILE] [-d]
9+
```
10+
11+
Convert DataFusion Explain Analyze to Mermaid
12+
13+
options:
14+
-h, --help show this help message and exit
15+
-f INPUT_FILE, --input-file INPUT_FILE path to file with Explain Analyze data, default - `input_file.txt`
16+
-o OUTPUT_FILE, --output-file OUTPUT_FILE path to output file, default - `result.txt`
17+
-d, --debug print log to stdout
18+
19+
## Sample input
20+
21+
```
22+
CoalescePartitionsExec, metrics=[]
23+
ProjectionExec: expr=[SUM(table.x)@1 as SUM(x)], metrics=[]
24+
HashAggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[SUM(x)], metrics=[outputRows=2]
25+
CoalesceBatchesExec: target_batch_size=4096, metrics=[]
26+
RepartitionExec: partitioning=Hash([Column { name: "b", index: 0 }], 16), metrics=[sendTime=839560, fetchTime=122528525, repartitionTime=5327877]
27+
HashAggregateExec: mode=Partial, gby=[b@1 as b], aggr=[SUM(x)], metrics=[outputRows=2]
28+
RepartitionExec: partitioning=RoundRobinBatch(16), metrics=[fetchTime=5660489, repartitionTime=0, sendTime=8012]
29+
CsvExec: file_groups={1 group: [[/tmp/table.csv]]}, has_header=false, metrics=[]
30+
```
31+
32+
## Sample output
33+
34+
```text
35+
graph BT
36+
37+
0[<b>CoalescePartitionsExec</b>
38+
]
39+
1[<b>ProjectionExec</b>
40+
] --> 0
41+
2[<b>HashAggregateExec</b>
42+
outputRows=2] --> 1
43+
3[<b>CoalesceBatchesExec</b>
44+
] --> 2
45+
4[<b>RepartitionExec</b>
46+
sendTime=839560\n fetchTime=122528525\n repartitionTime=5327877] --> 3
47+
5[<b>HashAggregateExec</b>
48+
outputRows=2] --> 4
49+
6[<b>RepartitionExec</b>
50+
fetchTime=5660489\n repartitionTime=0\n sendTime=8012] --> 5
51+
7[<b>CsvExec</b>
52+
] --> 6
53+
```
54+
55+
## Sample diagram
56+
```mermaid
57+
graph BT
58+
59+
0[<b>CoalescePartitionsExec</b>
60+
]
61+
1[<b>ProjectionExec</b>
62+
] --> 0
63+
2[<b>HashAggregateExec</b>
64+
outputRows=2] --> 1
65+
3[<b>CoalesceBatchesExec</b>
66+
] --> 2
67+
4[<b>RepartitionExec</b>
68+
sendTime=839560\n fetchTime=122528525\n repartitionTime=5327877] --> 3
69+
5[<b>HashAggregateExec</b>
70+
outputRows=2] --> 4
71+
6[<b>RepartitionExec</b>
72+
fetchTime=5660489\n repartitionTime=0\n sendTime=8012] --> 5
73+
7[<b>CsvExec</b>
74+
] --> 6
75+
```

helpers/__init__.py

Whitespace-only changes.

helpers/helpers.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# fill the dict with {index: indents_count}; the indent count indicates each node's level in the graph
2+
from main import debug_mode
3+
4+
def generate_index_to_level_dict(source):
    """Map each non-blank line's index to its leading-indent width.

    source: list of raw lines from the Explain Analyze output.

    Returns a dict {line_index: indent_width}; blank (whitespace-only)
    lines are skipped, so the keys are not necessarily contiguous.
    The indent width encodes the node's depth in the plan tree.
    """
    return {
        i: len(line) - len(line.lstrip())
        for i, line in enumerate(source)
        if line.strip()
    }
12+
13+
14+
def find_parent(index, indents, INDEX_TO_LEVEL):
    """Locate the parent entry for a plan node.

    The parent is the closest earlier line whose indent is exactly one
    level (2 spaces, DataFusion's indent step) shallower.

    index: line index of the current node.
    indents: indent width of the current node.
    INDEX_TO_LEVEL: {line_index: indent_width} mapping (see
        generate_index_to_level_dict).

    Returns the parent's (index, indent_width) pair, or None for a root
    node (no shallower line precedes it).
    """
    candidates = [item for item in INDEX_TO_LEVEL.items()
                  if item[0] < index and item[1] == indents - 2]
    if not candidates:
        return None
    # Nearest preceding candidate = largest line index. (The previous code
    # sorted by level, which only picked the right one because dicts keep
    # insertion order — all candidates share the same level.)
    return max(candidates, key=lambda item: item[0])
22+
23+
24+
def debug(line):
    """Print *line* to stdout when the script runs with --debug; no-op otherwise.

    Relies on `debug_mode`, imported at module level from main.
    NOTE(review): the helpers<->main import is circular; it works because
    main only reads helpers after its own module level finishes — confirm
    before restructuring.
    """
    # The original `if not debug_mode: pass / else: print` inverted the
    # condition just to do nothing in one branch.
    if debug_mode:
        print(line)

helpers/node.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import json
2+
3+
4+
class Node:
    """One execution-plan operator parsed from a line of Explain Analyze.

    Tracks its place in the plan tree (parent / children) and renders
    itself as a Mermaid graph statement.
    """

    def __init__(self, index, name, content, parent=None):
        """Create a node.

        index: zero-based source-line index, used as the Mermaid node id.
        name: operator name, e.g. 'ProjectionExec'.
        content: rest of the line after the name (holds `metrics=[...]`).
        parent: optional parent Node; normally set later via add_parent().
        """
        self.index = index
        self.name = name
        self.content = content
        self.parent = parent
        # Kept as `childrens` (sic): childrens_repr()/pprint() and any
        # external callers rely on this attribute name.
        self.childrens = list()
        # Text inside `metrics=[...]`; commas become a literal backslash-n
        # so Mermaid renders each metric on its own line.
        self.metrics = self.content.split('metrics=')[-1].\
            lstrip('[').rstrip(']').replace(',', '\\n')

    def __repr__(self):
        return str(self.name)

    def content_to_json(self):
        """Best-effort conversion of the raw operator content to JSON.

        Returns the parsed object on success, or a diagnostic string when
        the rewritten text still is not valid JSON. (The previous version
        printed the result and returned None, and left debug prints in.)
        """
        # Replace non-JSON symbols with JSON ones, wrapping bare words in
        # quotes; order of the replacements matters.
        symbols_to_replace = {
            ', ': '", "',
            '=': '":"',
            '[': '"{"',
            ']': '"}"',
            '""': '',
            '}"}': '}}'
        }
        self.content_processor = '{"' + self.content.strip() + '}'
        for src, dst in symbols_to_replace.items():
            self.content_processor = self.content_processor.replace(src, dst)

        # Rewrite every `{ ...no colon... }` span as a JSON list.
        # Start at 1 so the document's own opening brace is never touched.
        def replace_non_dict(start=1):
            dict_start = self.content_processor.find('{', start)
            if dict_start == -1:
                # Base case: no brace left. Without this guard the -1 fell
                # through and the negative slices corrupted the string.
                return
            dict_end = self.content_processor.find('}', dict_start)
            colon = self.content_processor.find(':', dict_start, dict_end)
            if colon == -1:
                self.content_processor = (
                    self.content_processor[:dict_start]
                    + '['
                    + self.content_processor[dict_start + 1:dict_end]
                    + ']'
                    + self.content_processor[dict_end + 1:]
                )
            if dict_start != self.content_processor.rfind('{', start):
                replace_non_dict(start=dict_end)

        replace_non_dict()
        try:
            return json.loads(self.content_processor)
        except ValueError as e:
            # Best-effort contract: report the failure instead of raising.
            return f"""
>>>>>>>>>>>>>>>>ERROR
{self.content_processor}
>>>>>>>>>>>>>>>>{e}
"""

    def add_parent(self, parent):
        """Link this node under *parent* and register it as parent's child."""
        self.parent = parent
        parent.childrens.append(self)

    def mermaid_repr(self):
        """Return the Mermaid statement for this node.

        Format: `<index>[<b><name></b> NEWLINE <metrics>]`, plus
        ` --> <parent index>` when the node has a parent.
        """
        res = f"{self.index}[<b>{self.name}</b>\n{self.metrics}]"
        if self.parent:
            res += f" --> {self.parent.index}"
        return res

    def childrens_repr(self):
        """Return the list of child node indices, or None for a leaf."""
        if not self.childrens:
            return None
        return [child.index for child in self.childrens]

    def pprint(self):
        """Return a multi-line human-readable description (for debug logs)."""
        return f"""
NAME: {self.name}
CONTENT: {self.content}
PARENT: {self.parent.index if self.parent else None}
CHILDRENS: {self.childrens_repr()}
------------------------------------------------\n
"""

main.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/python3
"""Convert DataFusion Explain Analyze output into Mermaid graph text."""
import argparse

from helpers.helpers import *
from helpers.node import Node


parser = argparse.ArgumentParser(
    description='Convert DataFusion Explain Analyze to Mermaid')
parser.add_argument('-f', '--input-file',
                    default='input_file.txt',
                    help='path to file with Explain Analyze data, default '
                         '- `input_file.txt`')
parser.add_argument('-o', '--output-file',
                    default='result.txt',
                    help='path to output file, default - `result.txt`')
parser.add_argument('-d', '--debug',
                    action='store_true',
                    help='print log to stdout')
args = parser.parse_args()

# Read by helpers.debug(); must stay a module-level name.
debug_mode = args.debug


def _split_operator_line(line):
    """Split a stripped plan line into (operator name, remaining content).

    The name ends at the first ':' or ',' — e.g. 'ProjectionExec: expr=...'
    or 'CoalescePartitionsExec, metrics=[]'. The latter has no colon, which
    crashed the previous unconditional split-on-':'.
    """
    cut = min((pos for pos in (line.find(':'), line.find(','))
               if pos != -1), default=len(line))
    return line[:cut], line[cut:]


if __name__ == "__main__":
    with open(args.input_file, 'r') as f:
        source = f.readlines()

    # line index -> indent width; blank lines are omitted.
    INDEX_TO_LEVEL = generate_index_to_level_dict(source)

    nodes = dict()
    # Iterate the actual keys: they are NOT contiguous when the input has
    # blank lines, so the old `range(len(INDEX_TO_LEVEL))` loop mis-indexed.
    for i, level in INDEX_TO_LEVEL.items():
        name, content_raw = _split_operator_line(source[i].strip())
        node = Node(i, name, content_raw)
        nodes[i] = node
        parent = find_parent(i, level, INDEX_TO_LEVEL)
        if parent is not None:  # None -> root node, nothing to link
            node.add_parent(nodes[parent[0]])

    RESULT = ["graph BT\n"]
    for node in nodes.values():
        RESULT.append(node.mermaid_repr())
        debug(node.pprint())

    with open(args.output_file, 'w') as f:
        f.write('\n'.join(RESULT))

    print(f'Result saved to {args.output_file}')

0 commit comments

Comments
 (0)