Skip to content

Commit

Permalink
Merge pull request #8 from C2DH/yaroslav-dev
Browse files Browse the repository at this point in the history
Correct and enrich urls / output tagged
  • Loading branch information
eliselavy authored May 2, 2024
2 parents 0634b5c + 8daf1c6 commit a60b4ed
Show file tree
Hide file tree
Showing 10 changed files with 9,079 additions and 186 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/github-actions-example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
id: preflight
uses: c2dh/journal-of-digital-history-ipynb-preflight-action@master
with:
notebook: 'example/articleMaine.ipynb'
notebook: 'example/article_example_checkoutput.ipynb'
functions: 'checkkernel,checkcitation,checkhtml,checkoutput,checktags,checkurls,checkjavascript'
output_md: 'report.md'
# Use the output from the `preflight` step
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,5 @@ dmypy.json
# Pyre type checker
.pyre/
*.code-workspace
example/article2.ipynb
example/article2 copy.json
13 changes: 11 additions & 2 deletions checks/checkjavascript.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,15 @@ def checkjavascript(contents, preview_url):
result_as_md, result_as_stdout = format_output(warnings, preview_url, cell_numbers)

# plotly check begins from here
# check if language is R
metadata = contents.get("metadata", [])
kernel = metadata.get("language_info", [])
# converting to JSON
kernelJSON = json.loads(json.dumps(kernel))
if kernelJSON["name"] == "R":
result_as_md += "> [!NOTE] \n"
result_as_md += f"> **plotly** is not being checked, as programming language used is **R ({kernelJSON['version']})**\n"
return result_as_md, result_as_stdout
result_as_md += "### Check JavaScript (plotly)\n"

requirementsFileExists = os.path.isfile("./requirements.txt")
Expand Down Expand Up @@ -119,8 +128,8 @@ def checkjavascript(contents, preview_url):
},
"plotly.version": {
"found": False,
"not_found_message": "**\* plotly.js** is missing\n",
"found_message": "**\* plotly.js** is present\n",
"not_found_message": "\n**\\* plotly.js** is missing\n",
"found_message": "\n**\\* plotly.js** is present\n",
},
}

Expand Down
281 changes: 245 additions & 36 deletions checks/checkoutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,35 @@
import nbformat
import re

def checkoutput(notebook_json,preview_url):

def checkoutput(notebook_json, preview_url):
result_as_md = "\n### Check Output Sizes and Rules\n"
result_as_stdout = ""

mimetype_object = {
"text/html": {
"present": False,
},
"text/plain": {
"present": False,
},
"image/png": {
"present": False,
},
"audio": {
"present": False,
},
}

output_type_object = {
"display_data": {
"present": False,
},
"execute_result": {
"present": False,
},
}

try:

# Convert JSON to notebook object
Expand All @@ -15,70 +40,254 @@ def checkoutput(notebook_json,preview_url):
total_size_kb = 0
total_images = 0
total_tables = 0
total_audios = 0

# List of desired MIME types
desired_mimetypes = ['text/html', 'image/png', 'application/vnd.plotly.v1+json', 'application/vdom.v1+json', 'application/geo+json']
# List of accepted MIME types, order is from most important to the least important descending
accepted_mimetypes = [
"text/html",
"image/png",
"text/plain",
"display_data",
"execute_result",
"application/vnd.plotly.v1+json",
"application/vdom.v1+json",
"application/geo+json",
]
tag_analysis_table = "\n| Tag | Cell | Remark |\n| --- | --- | --- |\n"

# Define the regex pattern for a <table> tag
table_tag_pattern = r'<table\b[^>]*>'
table_tag_pattern = r"<table\b[^>]*>"

# Define the regex pattern for a <audio> tag
audio_tag_pattern = r"<audio\b[^>]*>"

# Define the tag pattern
allowed_tag_pattern = r'(table|figure)-(\d+|[\w]+-\*)'
allowed_tag_pattern = r"(table|figure|sound)-(\d+|[\w]+-\*)"

for i, cell in enumerate(notebook.cells):
if cell.get("outputs"):
for j, output in enumerate(cell["outputs"]):

# Serialize the output using pickle
serialized_output = pickle.dumps(output)
size = len(serialized_output) / 1024 # Convert from bytes to KB
# Get the input cell source and extract the first few words
input_cell = " ".join(cell["source"]).strip()
first_words = " ".join(input_cell.split()[:5])
# Check if the output MIME type is in the desired list
output_mimetype = next((mimetype for mimetype in desired_mimetypes if mimetype in output.get("data", {})), None)

# Check the output_type

output_type = output.get("output_type")

if (
output_type in list(output_type_object.keys())
and output_type_object[output_type]["present"] == False
):
output_type_object[output_type]["present"] = True

# Check if the output MIME type is in the accepted mimetypes list
output_mimetype = next(
(
mimetype
for mimetype in accepted_mimetypes
if mimetype in output.get("data", {})
),
None,
)

if output_mimetype:
if output_mimetype == 'text/html':
html_content = output["data"].get(output_mimetype, [])
# Join the list of strings into a single string
html_content = "".join(html_content)
# Use regex to search for the presence of a <table> tag
if re.search(table_tag_pattern, html_content):
total_tables += 1
result_as_md += f"- Table found in output of cell {i + 1}\n"
result_as_md += f"> First words of input cell: {first_words}\n"
elif output_mimetype == 'image/png':
total_images += 1
if 'tags' in cell.metadata and cell.metadata['tags']:
cell_tags = cell.metadata['tags']
for tag in cell_tags:
if re.match(allowed_tag_pattern, tag):
result_as_md += f" - Valid tag: {tag} for image output in cell {i + 1}\n"
if not any(re.match(allowed_tag_pattern, tag) for tag in cell_tags):
if preview_url:
cell_preview_url=preview_url + "?idx=" + str(i + 1)
result_as_md += f" - No valid tags found for image output [Check here ]({cell_preview_url})"
match output_mimetype:
case "text/html":
mimetype_object["text/html"]["present"] = True

html_content = output["data"].get(output_mimetype, [])
# Join the list of strings into a single string
html_content = "".join(html_content)

# Use regex to search for the presence of a <table> tag

if re.search(table_tag_pattern, html_content):
total_tables += 1
tag_message = ""
if "tags" not in cell.metadata or not hasattr(
cell.metadata, "tags"
):
tag_message = (
"Tags are not defined for the cell"
)
tag_analysis_table += (
f"| text/html | {i+1} | {tag_message} |\n"
)
else:
cell_tags = cell.metadata["tags"]

if not any(
re.match(allowed_tag_pattern, tag)
for tag in cell_tags
):
tag_message += (
"Table is not tagged properly"
)
else:
tag_message += "Correct tagging (table)"

if tag_message != "":
tag_analysis_table += f"| text/html | {i+1} | {tag_message} |\n"

result_as_md += (
f"- Table found in output of cell {i + 1}\n"
)
result_as_md += f"> First words of input cell: {first_words}\n"

## check the tag of the table

# search for an audio tag
if re.search(audio_tag_pattern, html_content):
mimetype_object["audio"]["present"] = True
total_audios += 1
tag_message = ""
if "tags" not in cell.metadata or not hasattr(
cell.metadata, "tags"
):
tag_message = (
"Tags are not defined for the cell"
)
tag_analysis_table += (
f"| text/html | {i+1} | {tag_message} |\n"
)
else:
cell_tags = cell.metadata["tags"]

if not any(
re.match(allowed_tag_pattern, tag)
for tag in cell_tags
):
tag_message += (
"Audio is not tagged properly"
)
else:
tag_message += "Correct tagging (audio)"

if tag_message != "":
tag_analysis_table += f"| text/html | {i+1} | {tag_message} |\n"

result_as_md += (
f"- Audio found in output of cell {i + 1}\n"
)
result_as_md += f"> First words of input cell: {first_words}\n"

case "image/png":
mimetype_object["image/png"]["present"] = True

total_images += 1

# defining the message for the tag of the cell
tag_message = ""
if "tags" not in cell.metadata or not hasattr(
cell.metadata, "tags"
):
tag_message = "Tags are not defined for the cell"
tag_analysis_table += (
f"| image/png | {i+1} | {tag_message} |\n"
)
else:
cell_tags = cell.metadata["tags"]

if not any(
re.match(allowed_tag_pattern, tag)
for tag in cell_tags
):
if preview_url:
cell_preview_url = (
preview_url + "?idx=" + str(i + 1)
)
tag_message += f"Image is not tagged properly [Check here ]({cell_preview_url})"
else:
tag_message += (
f"Image is not tagged properly"
)
else:
result_as_md += f" - No valid tags found for image output in cell {i + 1}\n"
else:
result_as_md += f" - No tags found for image output in cell {i + 1}\n"
tag_message += "Correct tagging"

if tag_message != "":
tag_analysis_table += (
f"| image/png | {i+1} | {tag_message} |\n"
)

# Print the output size if it's greater than 1MB
case "text/plain":
mimetype_object["text/plain"]["present"] = True

tag_message = "Should be replaced"

tag_analysis_table += (
f"| text/plain | {i+1} | {tag_message} |\n"
)
if size > 1000:
result_as_md += f"- Output cell {i + 1} size: {size:.2f} KB\n"
result_as_md += f"> First words of input cell: {first_words}\n"
result_as_stdout += f"Output cell {i + 1} size: {size:.2f} KB\n"
result_as_stdout += f"First words of input cell: {first_words}\n"
result_as_md += (
f"- Output cell {i + 1} size: {size:.2f} KB\n"
)
result_as_md += (
f"> First words of input cell: {first_words}\n"
)
result_as_stdout += (
f"Output cell {i + 1} size: {size:.2f} KB\n"
)
result_as_stdout += (
f"First words of input cell: {first_words}\n"
)
total_size_kb += size

## show a warning if the notebook is greater than 10mb

result_as_md += f"\nTotal output size: {total_size_kb:.2f} KB\n"
result_as_md += f"Total number of images: {total_images}\n"
result_as_md += f"Total number of tables: {total_tables}"
result_as_md += f"Total number of tables: {total_tables}\n"
result_as_md += f"Total number of audios: {total_tables}\n"

result_as_stdout += f"Total output size: {total_size_kb:.2f} KB\n"
result_as_stdout += f"Total number of images: {total_images}\n"
result_as_stdout += f"Total number of tables: {total_tables}"

size_warning = ""

if total_size_kb > 10240:
size_warning = "\n> [!CAUTION]\n> Notebook's size exceeds 10mb\n"

# Creating a Mimetype table

table_output = "\n| Mimetype | Presence |\n| --- | --- |\n"

mimetype_keys = list(mimetype_object.keys())

for key in mimetype_keys:
if mimetype_object[key]["present"] == True:
table_output += f"| {key} | True |\n"
if key == "text/plain":
# add a warning
text_plain_warning = "\n> [!WARNING]\n> Notebook's output contains **text/plain** which is deprecated.\n"
result_as_md += text_plain_warning
else:
table_output += f"| {key} | False |\n"

# Creating an output_type table

output_type_table = "\n| Output type | Presence |\n| --- | --- |\n"

output_type_keys = list(output_type_object.keys())

for key in output_type_keys:
if output_type_object[key]["present"] == True:
output_type_table += f"| {key} | True |\n"
else:
output_type_table += f"| {key} | False |\n"

# displaying the tables in the right order
result_as_md += size_warning

result_as_md += table_output
result_as_md += output_type_table
result_as_md += tag_analysis_table

return result_as_md, ""

except Exception as e:
Expand Down
Loading

0 comments on commit a60b4ed

Please sign in to comment.