# Technical review helpers for a Jupyter notebook: check that repository files are
# referenced by the notebook, that linked URLs respond, that cells contain no raw HTML
# (other than <cite> tags), optionally that cells execute, and append "-*" to cell tags
# beginning with "figure".
import os
import re
import sys
from datetime import datetime

import requests
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor


# call a .sh helper script
def call_script(script_name):
    os.system("sh {}".format(script_name))

# simple regex-based URL extractor (currently unused; extract_url_md_html below is used instead)
def extract_urls(text):
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    urls = pattern.findall(text)
    return urls

def extract_url_md_html(text):
    """Extract bare URLs and Markdown-link URLs from a block of text."""
    pattern = re.compile(r'(?P<url>https?://[^\s]+)|\[(?P<label>[^\]]+)\]\((?P<url2>https?://[^\s]+)\)')
    urls = []
    for match in pattern.finditer(text):
        if match.group('url2'):
            # Markdown link: [label](url)
            urls.append(match.group('url2'))
        elif match.group('url'):
            # bare URL
            urls.append(match.group('url'))
    return urls

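# Example (illustrative, not from the original script): on the text
# "see [docs](https://example.com/doc) and https://example.org",
# extract_url_md_html is expected to return
# ['https://example.com/doc', 'https://example.org'].
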
def is_valid_url(url):
    """
    Checks whether a URL is reachable by sending a HEAD request and inspecting the response status code.

    Args:
        url (str): The URL to check.

    Returns:
        True if the URL is valid (i.e. returns a 2xx or 3xx status code), False otherwise.
    """
    try:
        # a timeout keeps the review from hanging on unresponsive servers
        response = requests.head(url, timeout=10)
        return response.status_code in range(200, 400)
    except requests.exceptions.RequestException:
        return False

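# Example (illustrative): is_valid_url("https://example.com") should return True when
# the server answers with a 2xx/3xx status, and False on timeouts or connection errors.
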
def check_urls(notebook):
    """Print a validity line for every URL found in the notebook cells."""
    for cell in notebook.cells:
        urls = extract_url_md_html(cell.source)
        for url in urls:
            # Remove a closing parenthesis or square bracket from the end of the URL, if present
            if url[-1] == ')' or url[-1] == ']':
                url = url[:-1]
            valid = is_valid_url(url)
            print("URL: {} is valid: {}".format(url, valid))

def check_ressource_in_dir_exclude(notebook, dir_path):
    """
    Recursively walks a directory tree, excluding the `.git` and `.ipynb_checkpoints` directories,
    and reports every file that is never mentioned in the notebook.

    Args:
        notebook: The parsed notebook (nbformat node).
        dir_path (str): Path to the directory to search.
    """
    for root, subdirs, filenames in os.walk(dir_path):
        # Exclude files in the `.git` directory
        if '.git' in root.split(os.sep):
            continue
        # Exclude files in the `.ipynb_checkpoints` directory
        if '.ipynb_checkpoints' in root.split(os.sep):
            continue
        for filename in filenames:
            file_path = os.path.join(root, filename)
            # count how many cells reference this file path
            nbfind = 0
            for cell in notebook.cells:
                if file_path in cell.source:
                    nbfind += 1
            if nbfind == 0:
                # write in the log file (stdout is redirected to the log in main)
                print(" file {} is not called".format(file_path))

def check_notebook_executed(nb):
    # Create an ExecutePreprocessor to execute the notebook cells
    # (depending on the nbconvert version, a kernel may first need to be started,
    # e.g. via ep.preprocess(nb, ...), before preprocess_cell can run)
    ep = ExecutePreprocessor(timeout=None)
    # Iterate over the cells in the notebook
    for index, cell in enumerate(nb.cells):
        # Only check code cells
        if cell.cell_type == 'code':
            # Execute the cell using the ExecutePreprocessor
            try:
                ep.preprocess_cell(cell, {'metadata': {'path': '.'}}, index)
            except Exception as e:
                print(f"Error executing cell {cell.execution_count}: {e}")
                return False
            # Check if the cell has output
            if len(cell.outputs) == 0:
                print(f"Cell {cell.execution_count} has no output")
                return False
    # If all cells were executed and had output, return True
    return True

def extract_html(text):
    """Return raw HTML tags found in the text, ignoring <cite> and </cite> tags."""
    html_pattern = r'<(?!\/?(?:cite|\/cite|cite\s)).*?>'
    return re.findall(html_pattern, text)


def check_html(notebook):
    """Report notebook cells that contain raw HTML markup other than <cite> tags."""
    for cell in notebook.cells:
        matches = extract_html(cell.source)
        if matches:
            print("found in the cell: {}".format(cell.source))
            for i, match in enumerate(matches):
                print("  HTML tag {}: {}".format(i + 1, match))

def add_figure_tag(notebook):
    """Append "-*" to the first cell tag beginning with "figure", for every cell."""
    for cell in notebook.cells:
        tags = cell.get("metadata", {}).get("tags", [])
        for i, tag in enumerate(tags):
            if tag.startswith("figure"):
                print("add -* to the tag: {}".format(tag))
                # replace the tag in place instead of mutating the list while iterating over it
                tags[i] = tag + "-*"
                break
    return notebook

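# Example (illustrative): a cell whose metadata tags are ["figure1"] is reported and ends
# up with tags ["figure1-*"]; cells without a tag beginning with "figure" are unchanged.
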
# main function
def main():
    call_script("generate_requirements.sh")
    # take the first argument passed to the script: the notebook to review
    notebook_path = sys.argv[1]
    # log file named output_<timestamp>.txt (filesystem-safe timestamp)
    logFile = "output_{}.txt".format(datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    # redirect stdout to the log file so every print below ends up in it
    sys.stdout = open(logFile, "w")
    # write the notebook path in the log file
    sys.stdout.write("notebook_path: {}\n".format(notebook_path))
    # open the notebook
    notebook = nbformat.read(notebook_path, as_version=4)
    # get the entries (files and folders) at the root of the repository
    folder_list = os.listdir()
    # check that every repository file is referenced by the notebook
    sys.stdout.write("CHECK RESOURCES\n")
    for folder in folder_list:
        check_ressource_in_dir_exclude(notebook, folder)
    sys.stdout.write("CHECK URL\n")
    check_urls(notebook)
    sys.stdout.write("CHECK HTML\n")
    check_html(notebook)
    #sys.stdout.write("CHECK EXECUTED CELL\n")
    #check_notebook_executed(notebook)
    # close the log file
    sys.stdout.close()


# call the main function
if __name__ == '__main__':
    main()

# call the script from the terminal:
#   python technical_review.py paper.ipynb
# call the script from a notebook:
#   %run technical_review.py paper.ipynb
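# Note: add_figure_tag is defined above but never invoked in main(); if needed, one
# possible (untested) way to apply it and save the result would be:
#   notebook = add_figure_tag(notebook)
#   nbformat.write(notebook, notebook_path)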