-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathhugo_util_alt_text_generator.py
169 lines (128 loc) · 6.18 KB
/
hugo_util_alt_text_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import os
import re
import traceback
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
# Configuration is taken from the environment:
#   ACCOUNT_REGION - region used to build the Computer Vision endpoint URL
#   ACCOUNT_KEY    - subscription key passed to CognitiveServicesCredentials
#   SITE_PATH      - root directory that is walked for markdown files
#
# .get() is used instead of indexing so that an unset variable reaches the
# explicit checks below and produces a readable message, rather than a bare
# KeyError raised by os.environ[...] before the checks can run.
region = os.environ.get('ACCOUNT_REGION')
key = os.environ.get('ACCOUNT_KEY')
SITE_PATH = os.environ.get('SITE_PATH')

if not region or not key:
    raise EnvironmentError("ACCOUNT_REGION and/or ACCOUNT_KEY are not set in the environment")
if not SITE_PATH:
    raise EnvironmentError("SITE_PATH is not set in the environment")

# Shared client used by get_image_description() for every image.
credentials = CognitiveServicesCredentials(key)
client = ComputerVisionClient(
    endpoint=f"https://{region}.api.cognitive.microsoft.com/",
    credentials=credentials
)
def get_image_path(relative_path, md_file_path):
    """Resolve an image reference found in a markdown file to a file on disk.

    Three candidate locations are tried, in this order:

    1. ``relative_path`` joined onto the directory of ``md_file_path``
    2. ``relative_path`` (leading slashes removed) under ``SITE_PATH``
    3. ``relative_path`` (leading slashes removed) under ``SITE_PATH/static``

    Args:
        relative_path: Image path exactly as written in the markdown source.
        md_file_path: Path of the markdown file containing the reference.

    Returns:
        The first candidate that is an existing regular file, or ``None``
        when no candidate matches (the candidates are printed in that case).
    """
    md_dir = os.path.dirname(md_file_path)
    site_relative = relative_path.lstrip('/')
    # NOTE: os.path.join discards md_dir when relative_path is absolute, so
    # for a reference such as "/images/a.png" the first candidate is that
    # absolute path itself.
    candidates = [
        os.path.join(md_dir, relative_path),
        os.path.join(SITE_PATH, site_relative),
        os.path.join(SITE_PATH, 'static', site_relative),
    ]

    for candidate in candidates:
        # isfile rather than exists: an empty or directory-like reference
        # would otherwise resolve to a directory, which cannot be opened as
        # an image by the caller.
        if os.path.isfile(candidate):
            return candidate

    print("Image not found in any of these locations:")
    for candidate in candidates:
        print(f" - {candidate}")
    return None
def get_image_description(image_path, md_file_path):
    """Return a caption for an image referenced from a markdown file.

    The image is located with ``get_image_path`` and its bytes are sent to the
    module-level ``client`` via ``describe_image_in_stream``. Every returned
    caption is printed with its confidence; the text of the first caption is
    the result.

    Args:
        image_path: Image path as written in the markdown source.
        md_file_path: Path of the markdown file containing the reference.

    Returns:
        The first caption's text, or ``None`` when the image cannot be found,
        no caption is returned, or any exception occurs (the exception and its
        traceback are printed, not propagated).
    """
    try:
        resolved_path = get_image_path(image_path, md_file_path)
        if not resolved_path:
            print(f"Image not found: {image_path}")
            return None

        with open(resolved_path, 'rb') as stream:
            analysis = client.describe_image_in_stream(stream)
            if not analysis.captions:
                print(f"No description found for {resolved_path}")
                return None
            for caption in analysis.captions:
                print(f"Caption: {caption.text}, Confidence: {caption.confidence}")
            return analysis.captions[0].text
    except Exception as e:
        # Best-effort: one failing image must not abort the whole run.
        print(f"Failed to get description for {image_path}: {e}")
        traceback.print_exc()
        return None
def update_markdown_content(content, image_descriptions):
    """Insert alt text into image references that do not have any.

    Three reference syntaxes are handled:

    * Markdown images: ``![](path)``
    * HTML tags of the form ``<image src="path" ...>``
    * Hugo shortcodes of the form ``{{< img src="path" ... >}}``

    A reference is left untouched when it already carries alt text (a
    non-empty ``[...]`` for markdown, the substring ``alt=`` for the other
    two) or when its path is not a key of ``image_descriptions``.

    For HTML tags and shortcodes the ``alt`` attribute is inserted directly
    after ``src`` so that every other attribute of the tag is preserved.

    Args:
        content: Full text of a markdown file.
        image_descriptions: Mapping of image path (as written in ``content``)
            to the alt text to insert.

    Returns:
        The content with alt text added where applicable.
    """
    # Local import keeps this function self-contained; html is stdlib.
    import html

    def add_markdown_alt(match):
        alt_text, image_path = match.group(1), match.group(2)
        # Existing alt text always wins.
        if alt_text or image_path not in image_descriptions:
            return match.group(0)
        return f'![{image_descriptions[image_path]}]({image_path})'

    def add_html_alt(match):
        head, image_path, tail = match.group(1), match.group(2), match.group(3)
        if 'alt=' in match.group(0) or image_path not in image_descriptions:
            return match.group(0)
        # Escape so quotes or angle brackets in the description cannot
        # terminate the attribute or the tag.
        alt_value = html.escape(image_descriptions[image_path], quote=True)
        return f'{head} alt="{alt_value}"{tail}'

    def add_shortcode_alt(match):
        head, image_path, tail = match.group(1), match.group(2), match.group(3)
        if 'alt=' in match.group(0) or image_path not in image_descriptions:
            return match.group(0)
        return f'{head} alt="{image_descriptions[image_path]}"{tail}'

    # Group 1 = everything up to and including the src attribute,
    # group 2 = the path, group 3 = the remainder of the tag (kept verbatim).
    # NOTE(review): only <image ...> is matched, not the standard <img ...>
    # tag - confirm this is intended.
    markdown_image_pattern = r'!\[(.*?)\]\((.*?)\)'
    html_image_pattern = r'(<image\s+src="([^"]+)")([^>]*>)'
    hugo_shortcode_pattern = r'(\{\{<\s*img\s+src="([^"]+)")([^>]*>\}\})'

    updated_content = re.sub(markdown_image_pattern, add_markdown_alt, content)
    updated_content = re.sub(html_image_pattern, add_html_alt, updated_content)
    updated_content = re.sub(hugo_shortcode_pattern, add_shortcode_alt, updated_content)
    return updated_content
def process_markdown_file(md_file_path):
    """Add generated alt text to the images of one markdown file, in place.

    The file is scanned for markdown images, ``<image src="...">`` tags and
    ``{{< img src="..." >}}`` shortcodes. For each reference that has no alt
    text, a description is requested with ``get_image_description``; the
    collected descriptions are applied with ``update_markdown_content`` and
    the file is rewritten only when its content actually changed.

    Any exception is printed with its traceback and not propagated, so one
    bad file does not stop the caller from processing further files.

    Args:
        md_file_path: Path of the markdown file to process.
    """
    try:
        # newline='' on read and write keeps the file's own line endings.
        with open(md_file_path, 'r', encoding='utf-8', newline='') as file:
            content = file.read()

        markdown_image_pattern = r'!\[(.*?)\]\((.*?)\)'
        html_image_pattern = r'<image\s+src="([^"]+)"[^>]*>'
        hugo_shortcode_pattern = r'{{<\s*img\s+src="([^"]+)"[^>]*>}}'

        # Paths of images that lack alt text, without duplicates, so each
        # image is described (and billed) at most once per file.
        paths_missing_alt = []

        for match in re.finditer(markdown_image_pattern, content):
            alt_text, image_path = match.group(1), match.group(2)
            if not alt_text and image_path not in paths_missing_alt:
                paths_missing_alt.append(image_path)

        # For tags and shortcodes the alt check must look at the matched tag
        # itself, not at the whole file: an alt attribute elsewhere in the
        # document says nothing about this particular image.
        for pattern in (html_image_pattern, hugo_shortcode_pattern):
            for match in re.finditer(pattern, content):
                image_path = match.group(1)
                if 'alt=' not in match.group(0) and image_path not in paths_missing_alt:
                    paths_missing_alt.append(image_path)

        image_descriptions = {}
        for image_path in paths_missing_alt:
            description = get_image_description(image_path, md_file_path)
            if description:
                image_descriptions[image_path] = description

        updated_content = content
        if image_descriptions:
            updated_content = update_markdown_content(content, image_descriptions)

        if updated_content != content:
            with open(md_file_path, 'w', encoding='utf-8', newline='') as file:
                file.write(updated_content)
            print(f"Updated markdown file: {md_file_path}")
        else:
            print(f"No updates needed for: {md_file_path}")
    except Exception as e:
        # Per-file boundary: report and continue with the next file.
        print(f"Failed to process markdown file {md_file_path}: {e}")
        traceback.print_exc()
# Script body: visit every directory below SITE_PATH and process each
# markdown (*.md) file found there.
for root, dirs, files in os.walk(SITE_PATH):
    for file_name in files:
        if not file_name.endswith(".md"):
            continue
        process_markdown_file(os.path.join(root, file_name))