Word document element preservation (I know this is not related to PDFs, but there are no other resources for help) #4465
Unanswered
Prasaderp
asked this question in
Looking for help
Replies: 0 comments
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.
-
Hi,
I know that this issue is not related to PDFs, but there are no other resources for help.
I am currently working on translating Word documents from English to Indic languages. The documents include many elements such as graphs, images, and tables.
I have tried the python-docx and lxml (etree) libraries, but neither of them is able to preserve images as elements.
I have attached a screenshot below that shows the problem.
`import os
import re
import torch
import gc
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from docx import Document
import time
import subprocess
import shutil
# Configuration
# Hugging Face model identifier passed to from_pretrained() in initialize_model().
MODEL_NAME = "facebook/nllb-200-distilled-1.3B"
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Supported target languages, keyed by display name. "code" is the language
# token that translate_batch() converts to the forced BOS token id; "iso" is
# not used in the visible code (presumably an ISO 639-1 code -- confirm).
LANGUAGES = {
"Hindi": {"code": "hin_Deva", "iso": "hi"},
"Tamil": {"code": "tam_Taml", "iso": "ta"},
"Telugu": {"code": "tel_Telu", "iso": "te"}
}
# Ratio of allocated to total GPU memory above which check_memory_and_reset()
# reloads the model.
MEMORY_THRESHOLD = 0.7 # Lowered to trigger memory reset earlier
# Lower bound for the tokenizer/generation max_length used in translate_batch().
MAX_LENGTH_DEFAULT = 256
# Default value of max_tokens in chunk_text_blocks().
MAX_TOKENS_PER_BLOCK = 200
# Initialize the translation model and tokenizer
def initialize_model():
    """Load the tokenizer and seq2seq model named by ``MODEL_NAME``.

    The tokenizer is created with English (``eng_Latn``) as the source
    language. The model is loaded in float16 when ``DEVICE`` is ``"cuda"`` and
    in float32 otherwise, moved to ``DEVICE`` and switched to eval mode.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print("Initializing translation model...")
    t0 = time.time()

    tok = AutoTokenizer.from_pretrained(MODEL_NAME, src_lang="eng_Latn")

    weights_dtype = torch.float16 if DEVICE == "cuda" else torch.float32
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=weights_dtype,
    )
    seq2seq = seq2seq.to(DEVICE)
    seq2seq = seq2seq.eval()

    elapsed = time.time() - t0
    print(f"Model loaded in {elapsed:.2f}s")
    return tok, seq2seq
# Load the tokenizer and model once at module level; translate_batch() and
# reset_gpu_memory() use these two names as module globals.
tokenizer, model = initialize_model()
# Utility Functions
def parse_user_entities(user_input):
    """Parse a comma-separated string of entities that must be preserved.

    Args:
        user_input: Comma-separated entity names. Surrounding whitespace is
            stripped and blank items are ignored.

    Returns:
        The unique entities, longest first. Entities of equal length keep the
        order in which they first appear in ``user_input``.
    """
    entities = [e.strip() for e in user_input.split(',') if e.strip()]
    print(f" Entities to preserve: {', '.join(entities) if entities else 'None'}")
    # dict.fromkeys() de-duplicates while keeping first-seen order. A set would
    # make the relative order of equal-length entities vary between runs
    # (string hashing is randomized), and sorted() is stable, so ties stay in
    # input order.
    return sorted(dict.fromkeys(entities), key=len, reverse=True)
# NOTE(review): only the set-up of this function is visible. The code that
# fills `chunks` and returns a result is missing from the paste, and the
# original indentation has been lost.
def chunk_text_blocks(text, max_tokens=MAX_TOKENS_PER_BLOCK):
"""Split text into blocks of approximately max_tokens, respecting sentence boundaries"""
# End offset of every '.', '!' or '?' character found in the text.
sentence_boundaries = [m.end() for m in re.finditer(r'[.!?]', text)]
chunks = []
last_end = 0
current_chunk = ""
current_tokens = 0
# NOTE(review): the body is cut off after its initialisation. collect_texts()
# unpacks the result as (modified_text, placeholder_map), but the substitution
# logic and the return statement are not visible here.
def replace_with_placeholders(text, entities):
placeholder_map = {}
modified_text = text
def needs_translation(modified_text):
    """Tell whether any ASCII letters remain once placeholders are removed.

    Placeholders of the form ``PRESERVE`` followed by three digits are dropped
    first, so text made up only of placeholders, digits, punctuation or
    non-Latin script yields ``False``.
    """
    without_placeholders = re.sub(r'PRESERVE\d{3}', '', modified_text)
    return any(
        ('a' <= ch <= 'z') or ('A' <= ch <= 'Z')
        for ch in without_placeholders
    )
def restore_entities(text, placeholder_map, original_text):
    """Put the preserved entities back into a translated string.

    For each ``placeholder -> original`` pair:

    * if the placeholder is present, its first occurrence is replaced;
    * otherwise, if the entity is not already in the text, it is inserted at a
      position scaled from its position in ``original_text``;
    * if the entity cannot be found in ``original_text`` either, it is
      prepended to the text.

    Args:
        text: Translated text that may contain placeholders.
        placeholder_map: Mapping of placeholder string to original entity.
        original_text: Source text from before placeholder substitution.

    Returns:
        The text with the entities restored.
    """
    restored_text = text
    for placeholder, original in placeholder_map.items():
        if placeholder in restored_text:
            restored_text = restored_text.replace(placeholder, original, 1)
            print(f" Restored '{original}' at placeholder '{placeholder}'")
            continue

        # The placeholder was lost; do nothing if the entity is already there.
        if original in restored_text:
            continue

        original_pos = original_text.find(original)
        if original_pos != -1:
            # Scale the entity's offset by the length ratio of the two texts
            # to estimate where it belongs in the translation.
            ratio = len(restored_text) / len(original_text) if len(original_text) > 0 else 1
            approx_pos = int(original_pos * ratio)
            restored_text = restored_text[:approx_pos] + original + restored_text[approx_pos:]
            print(f" Placeholder '{placeholder}' not found; inserted '{original}' at estimated position")
        else:
            restored_text = f"{original} {restored_text}"
            print(f" Placeholder '{placeholder}' not found; prepended '{original}'")

    # The visible original never returned the result, so callers received None.
    return restored_text
# NOTE(review): only the guard clause is visible; the code that actually
# divides `text` across `runs` is missing from the paste.
def split_text_by_proportions(text, runs, original_text, placeholder_map):
# With no runs or blank text, return one empty string per run.
if not runs or not text.strip():
return [""] * len(runs)
def get_dynamic_batch_size(num_texts, fast_mode=False):
    """Choose how many texts to translate per batch.

    Args:
        num_texts: Number of texts waiting to be translated.
        fast_mode: When true, allow up to 64 texts per batch instead of 16.

    Returns:
        A batch size that is always at least 1, so it can safely be used as a
        ``range()`` step even when ``num_texts`` is 0.
    """
    if DEVICE != "cuda":
        # The CUDA path already clamps to >= 1; do the same here so an empty
        # input never yields a batch size of 0.
        return max(1, min(16, num_texts))

    total_memory = torch.cuda.get_device_properties(0).total_memory
    free_memory = total_memory - torch.cuda.memory_allocated()
    # Rough estimate of 4 bytes per token of a maximum-length text.
    bytes_per_text = MAX_LENGTH_DEFAULT * 4
    max_batch = max(1, min(free_memory // bytes_per_text, num_texts))
    cap = 64 if fast_mode else 16
    return min(cap, max_batch)
def translate_batch(texts, target_lang="Hindi", fast_mode=False):⚠️ Memory error: {e}. Reducing batch size and retrying...")
if not texts:
return []
batch_size = get_dynamic_batch_size(len(texts), fast_mode)
translated_texts = []
target_lang_code = LANGUAGES[target_lang]["code"]
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
max_length = max(MAX_LENGTH_DEFAULT, max(len(t.split()) for t in batch) * 2)
try:
inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(DEVICE)
with torch.no_grad():
outputs = model.generate(
inputs,
forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang_code),
max_length=max_length,
num_beams=3,
use_cache=True,
early_stopping=True
)
translated = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
translated_texts.extend([re.sub(r'^.+|\s.+$|^\s…', '', t.strip()) for t in translated])
del inputs, outputs
if DEVICE == "cuda":
torch.cuda.empty_cache()
gc.collect()
except RuntimeError as e:
print(f"
if batch_size > 1:
batch_size = max(1, batch_size // 2)
translated_texts.extend(translate_batch(batch, target_lang, fast_mode))
else:
raise
return translated_texts
def reset_gpu_memory():
    """Drop the loaded model, release GPU memory and load a fresh model.

    Does nothing unless ``DEVICE`` is ``"cuda"``. Rebinds the module-level
    ``tokenizer`` and ``model`` to the newly loaded objects.

    NOTE(review): the paste lost its indentation, so it is assumed here that
    the reload belongs inside the CUDA branch together with the clean-up.
    """
    global model, tokenizer
    if DEVICE != "cuda":
        return

    del model
    # Collect garbage first so that anything it frees is already back in the
    # caching allocator when empty_cache() releases the cached blocks.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    print(" Refreshing GPU memory...")
    start = time.time()
    tokenizer, model = initialize_model()
    print(f" GPU memory refreshed in {time.time()-start:.2f}s")
def check_memory_and_reset(total_segments):
    """Reload the model when GPU memory use exceeds ``MEMORY_THRESHOLD``.

    The check only runs on CUDA and only for documents with more than 100
    segments.

    Returns:
        ``True`` if ``reset_gpu_memory()`` was called, ``False`` otherwise.
    """
    on_gpu = DEVICE == "cuda"
    if not (on_gpu and total_segments > 100):
        return False

    capacity = torch.cuda.get_device_properties(0).total_memory
    in_use = torch.cuda.memory_allocated()
    usage_ratio = in_use / capacity
    if usage_ratio <= MEMORY_THRESHOLD:
        return False

    reset_gpu_memory()
    return True
# Document Processing Functions
# NOTE(review): only the body-paragraph loop is visible and no return
# statement is shown; process_document() uses the result as a list, so the
# function presumably returns `texts` -- confirm against the full script.
def collect_texts(doc, entities):
texts = []
# Body paragraphs
for para_idx, para in enumerate(doc.paragraphs):
# Paragraphs that are empty or whitespace-only are skipped.
if para.text.strip():
modified_text, placeholder_map = replace_with_placeholders(para.text, entities)
needs_trans = needs_translation(modified_text)
# Record layout: (kind, index, paragraph, original text, text with
# placeholders, placeholder map, needs-translation flag).
texts.append(("body", para_idx, para, para.text, modified_text, placeholder_map, needs_trans))
# NOTE(review): the function is cut off after the early-exit branch; the
# translation and write-back steps are not visible in the paste.
def process_document(input_path, output_path, entities, target_lang="Hindi"):
# Load the original document
doc = Document(input_path)
texts = collect_texts(doc, entities)
# Nothing collected: save the document as loaded to output_path and stop.
if not texts:
print("No translatable text found in the document.")
doc.save(output_path)
return
# NOTE(review): only the "paragraph has no runs" branch is visible and the
# indentation is lost, so the nesting of the statements below cannot be
# confirmed. The code that distributes text over existing runs is missing.
def assign_translated_text(para, translated_text, original_text, placeholder_map):
runs = [run for run in para.runs]
if not runs:
if para.text.strip():
# Add the whole translation as a single new run.
run = para.add_run(translated_text)
# Preserve basic formatting from the first run if it exists
if para.runs and len(para.runs) > 1:
run.bold = para.runs[0].bold
run.italic = para.runs[0].italic
run.underline = para.runs[0].underline
return
# Conversion Function
# NOTE(review): only the pass-through guard is visible; the actual conversion
# of a .doc file is missing from the paste.
def convert_doc_to_docx(doc_path):
# Paths that do not end in '.doc' (for example '.docx') are returned unchanged.
if not doc_path.endswith('.doc'):
return doc_path
# Main Execution
# Script entry point. The guard must compare __name__ with "__main__"; the
# pasted `name == "main"` had lost its underscores and raised NameError.
# NOTE(review): the rest of the main section is not visible in the paste.
if __name__ == "__main__":
    doc_path = "/content/AIMT2002101_English_Quant.docx"
    output_path = "/content/translated_output.docx"
Beta Was this translation helpful? Give feedback.
All reactions