21 commits
71cb56e
feat: Update spaCy and transformers dependencies for Pydantic v2 comp…
Flyboy990 Oct 4, 2025
3d3cb8e
fix: Update pytorch_transformers import to transformers in opennyai c…
Flyboy990 Oct 4, 2025
9764fde
fix: Update cached_path import in tokenization.py
Flyboy990 Oct 4, 2025
b1d610b
fix: Import cached_path directly from transformers
Flyboy990 Oct 4, 2025
8fb49e2
Accept plain text input for RR preprocessing
Flyboy990 Oct 6, 2025
e1900ba
Fix temp_dir for containerized environments
Flyboy990 Oct 7, 2025
c6ee324
Force HF cache env vars in Bert.__init__ to fix HF Space permissions
Flyboy990 Oct 7, 2025
0654361
Fix Bert class indentation and force HF cache env vars
Flyboy990 Oct 7, 2025
d8d051c
Fix transformers 4.30.0 compatibility - replace cached_path
Flyboy990 Oct 7, 2025
1f17bb4
Fix RhetoricalRole cache path for HF Space permissions
Flyboy990 Oct 7, 2025
423d957
Fix cached_path compatibility with transformers 4.x and huggingface_hub
Flyboy990 Oct 7, 2025
509a96b
Fix indentation in initialize() method
Flyboy990 Oct 7, 2025
679d060
Fix strict=False for position_ids compatibility
Flyboy990 Oct 7, 2025
8bf0fdc
Fix tokenizer vocab loading for modern transformers
Flyboy990 Oct 7, 2025
4cb3cf8
Fix cache directory to use /tmp for HF Spaces
Flyboy990 Oct 7, 2025
bebd154
Fix vocab file creation using get_vocab() method
Flyboy990 Oct 8, 2025
c35ca53
Fix BERT forward call to use keyword arguments
Flyboy990 Oct 8, 2025
ea791d4
Fix indentation in Bert.forward() method
Flyboy990 Oct 8, 2025
34f5b37
Fix BERT output extraction for modern transformers
Flyboy990 Oct 8, 2025
537437c
Remove redundant env vars, ensure cache dir exists
Flyboy990 Oct 8, 2025
622317a
Fix Bert class indentation after env removal
Flyboy990 Oct 8, 2025
37 changes: 20 additions & 17 deletions opennyai/rhetorical_roles/infer_data_prep.py
@@ -34,39 +34,42 @@ def attach_short_sentence_boundries_to_next(revised_sentence_boundries, doc_txt)

def split_into_sentences_tokenize_write(data, custom_processed_data_path,
hsln_format_txt_dirpath='datasets/pubmed-20k', verbose=False):
########## This function accepts the input files in LS format, creates tokens and writes them with label as "NONE" to text file
########## This function accepts plain text input and processes it for rhetorical role prediction

if not os.path.exists(hsln_format_txt_dirpath):
os.makedirs(hsln_format_txt_dirpath)

# Load spaCy model once
nlp = spacy.load("en_core_web_sm")

max_length = 10000
output_json = []
filename_sent_boundries = {} ###### key is the filename and value is dict containing sentence spans {"abc.txt":{"sentence_span":[(1,10),(11,20),...]} , "pqr.txt":{...},...}
filename_sent_boundries = {}

if verbose:
msg.info('Preprocessing rhetorical role model input!!!')

for data_dict in tqdm(data, disable=not verbose):

doc_id = data_dict['file_id']
preamble_doc = data_dict['preamble_doc']
judgment_doc = data_dict['judgement_doc']

if filename_sent_boundries.get(doc_id) is None: ##### Ignore if the file is already present
text = data_dict['text'] # Plain text string now

nlp_doc = spacy.tokens.Doc.from_docs([preamble_doc, judgment_doc])
if filename_sent_boundries.get(doc_id) is None:
# Process text with spaCy
nlp_doc = nlp(text)
doc_txt = nlp_doc.text

sentence_boundries = [(sent.start_char, sent.end_char) for sent in nlp_doc.sents]
revised_sentence_boundries = attach_short_sentence_boundries_to_next(sentence_boundries, doc_txt)

adjudicated_doc = {'id': doc_id,
'data': {'preamble_text': preamble_doc.text,
'judgement_text': judgment_doc.text,
'text': doc_txt}
}
adjudicated_doc = {
'id': doc_id,
'data': {'text': doc_txt}
}

adjudicated_doc['annotations'] = []
adjudicated_doc['annotations'].append({})
adjudicated_doc['annotations'] = []

filename_sent_boundries[doc_id] = {"sentence_span": []}

for sentence_boundry in revised_sentence_boundries:
sentence_txt = doc_txt[sentence_boundry[0]:sentence_boundry[1]]

@@ -79,11 +82,11 @@ def split_into_sentences_tokenize_write(data, custom_processed_data_path,
sent_data['labels'] = []
adjudicated_doc['annotations'].append(sent_data)

output_json.append(adjudicated_doc)
output_json.append(adjudicated_doc)

with open(custom_processed_data_path, 'w+') as f:
json.dump(output_json, f)


def write_in_hsln_format(input_json, hsln_format_txt_dirpath, tokenizer):
# tokenizer = BertTokenizer.from_pretrained(BERT_VOCAB, do_lower_case=True)
json_format = json.load(open(input_json))
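Note: with this change `split_into_sentences_tokenize_write` accepts plain text rather than pre-built preamble/judgement spaCy Docs. A minimal sketch of how a caller might now assemble the input, assuming `en_core_web_sm` is installed; the output path is only an example, not taken from this PR:

```python
from opennyai.rhetorical_roles.infer_data_prep import split_into_sentences_tokenize_write

# Each entry now carries the raw judgment text instead of 'preamble_doc'/'judgement_doc' spaCy Docs.
data = [
    {"file_id": "judgment_001.txt",
     "text": "The appeal is allowed. The order of the High Court is set aside."},
    {"file_id": "judgment_002.txt",
     "text": "Heard learned counsel for the parties. Leave granted."},
]

# hsln_format_txt_dirpath keeps its default; the JSON path below is an example.
split_into_sentences_tokenize_write(
    data,
    custom_processed_data_path="/tmp/opennyai_temp/processed_input.json",
    verbose=True,
)
```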
13 changes: 10 additions & 3 deletions opennyai/rhetorical_roles/rhetorical_roles.py
@@ -14,7 +14,11 @@
from .eval import infer_model
from .models import BertHSLN
from .task import pubmed_task

# Ensure cache directory exists and is writable
import os
if 'OPENNYAI_CACHE_DIR' not in os.environ:
os.environ['OPENNYAI_CACHE_DIR'] = '/tmp/opennyai_temp'
os.makedirs('/tmp/opennyai_temp', exist_ok=True)

class RhetoricalRolePredictor():

@@ -48,8 +52,11 @@ def initialize(self):
Instantiates Tokenizer for preprocessor to use
Loads labels to name mapping file for post-processing inference response
"""
self.CACHE_DIR = os.path.join(str(Path.home()), '.opennyai')
self.hsln_format_txt_dirpath = os.path.join(self.CACHE_DIR, 'temp_hsln/pubmed-20k', )
# Force writable cache directory for HF Space
self.CACHE_DIR = os.getenv('OPENNYAI_CACHE_DIR', '/tmp/opennyai_temp')
os.makedirs(self.CACHE_DIR, exist_ok=True)

self.hsln_format_txt_dirpath = os.path.join(self.CACHE_DIR, 'temp_hsln/pubmed-20k')
os.makedirs(self.hsln_format_txt_dirpath, exist_ok=True)

if self.use_gpu:
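Note: the cache location is now driven by `OPENNYAI_CACHE_DIR`, falling back to `/tmp/opennyai_temp`. A small sketch of how a deployment could point it elsewhere; the override has to happen before the import because the module-level default is applied at import time. The import path mirrors the file path in this diff, and the target directory is only an example:

```python
import os

# Set a writable cache location *before* importing opennyai; if the variable is
# unset, the module-level code above falls back to /tmp/opennyai_temp.
os.environ["OPENNYAI_CACHE_DIR"] = "/data/opennyai_cache"  # example path

from opennyai.rhetorical_roles.rhetorical_roles import RhetoricalRolePredictor

# initialize() will now create temp_hsln/pubmed-20k under the override.
```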
23 changes: 17 additions & 6 deletions opennyai/summarizer/models/model_builder.py
@@ -3,7 +3,7 @@

import torch
import torch.nn as nn
from pytorch_transformers import BertModel, BertConfig
from transformers import BertModel, BertConfig
from torch.nn.init import xavier_uniform_

from opennyai.utils.download import CACHE_DIR
@@ -120,20 +120,31 @@ def get_generator(vocab_size, dec_hidden_size, device):
class Bert(nn.Module):
def __init__(self, large, temp_dir=EXTRACTIVE_SUMMARIZER_CACHE_PATH, finetune=False):
super(Bert, self).__init__()

# Ensure cache directory exists and is writable
os.makedirs(temp_dir, exist_ok=True)

if (large):
self.model = BertModel.from_pretrained('bert-large-uncased', cache_dir=temp_dir)
else:
self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)

self.finetune = finetune

def forward(self, x, segs, mask):
if (self.finetune):
top_vec, _ = self.model(x, segs, attention_mask=mask)
outputs = self.model(input_ids=x, token_type_ids=segs, attention_mask=mask)
else:
self.eval()
with torch.no_grad():
top_vec, _ = self.model(x, segs, attention_mask=mask)
outputs = self.model(input_ids=x, token_type_ids=segs, attention_mask=mask)

# Extract last_hidden_state from outputs
if hasattr(outputs, 'last_hidden_state'):
top_vec = outputs.last_hidden_state
else:
top_vec = outputs[0]

return top_vec


@@ -166,7 +177,7 @@ def __init__(self, args, device, checkpoint):
self.bert.model.embeddings.position_embeddings = my_pos_embeddings

if checkpoint is not None:
self.load_state_dict(checkpoint['model'], strict=True)
self.load_state_dict(checkpoint['model'], strict=False)
else:
if args.param_init != 0.0:
for p in self.ext_layer.parameters():
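Context for the `forward()` rewrite: since transformers v4, `BertModel` returns a `ModelOutput` object by default rather than a `(sequence_output, pooled_output)` tuple, so the old `top_vec, _ = self.model(...)` unpacking breaks. A standalone sketch of the same extraction pattern, using only standard transformers calls and independent of this repo:

```python
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

enc = tokenizer("The appeal is allowed.", return_tensors="pt")
with torch.no_grad():
    outputs = model(
        input_ids=enc["input_ids"],
        token_type_ids=enc["token_type_ids"],
        attention_mask=enc["attention_mask"],
    )

# Same pattern as the patched forward(): prefer the attribute, fall back to
# positional indexing for return_dict=False configurations.
top_vec = outputs.last_hidden_state if hasattr(outputs, "last_hidden_state") else outputs[0]
print(top_vec.shape)  # (batch_size, sequence_length, hidden_size)
```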
2 changes: 1 addition & 1 deletion opennyai/summarizer/others/args.py
@@ -18,7 +18,7 @@ def __setargs__():
parser.bert_data_path = EXTRACTIVE_SUMMARIZER_CACHE_PATH
parser.model_path = EXTRACTIVE_SUMMARIZER_CACHE_PATH
parser.result_path = EXTRACTIVE_SUMMARIZER_CACHE_PATH
parser.temp_dir = EXTRACTIVE_SUMMARIZER_CACHE_PATH
parser.temp_dir = '/tmp/huggingface'
parser.batch_size = 5000
parser.test_batch_size = 1
parser.max_pos = 512
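Hard-coding `/tmp/huggingface` is fine for HF Spaces but Linux-specific. If portability ever matters, a possible variant (not part of this PR) would honour the standard `HF_HOME` override and otherwise fall back to the platform temp directory:

```python
import os
import tempfile

# Prefer an explicit override, otherwise use the system temp dir.
temp_dir = os.getenv("HF_HOME", os.path.join(tempfile.gettempdir(), "huggingface"))
os.makedirs(temp_dir, exist_ok=True)
```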
66 changes: 35 additions & 31 deletions opennyai/summarizer/others/tokenization.py
@@ -21,7 +21,6 @@
import unicodedata
from io import open

from pytorch_transformers import cached_path
from wasabi import msg

from opennyai.utils.download import CACHE_DIR
@@ -137,38 +136,43 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=EXTRACTIVE_SUM
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = pretrained_model_name_or_path
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
# redirect to the cache, if necessary
from transformers import BertTokenizer as ModernBertTokenizer

try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
msg.fail(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
# Let transformers handle the download and caching
modern_tokenizer = ModernBertTokenizer.from_pretrained(
pretrained_model_name_or_path,
do_lower_case=kwargs.get('do_lower_case', True),
cache_dir=cache_dir
)

# Create vocab file from the modern tokenizer's vocab
os.makedirs(cache_dir, exist_ok=True)
vocab_file = os.path.join(cache_dir, f'{pretrained_model_name_or_path.replace("/", "_")}_vocab.txt')

# Write vocab to file if it doesn't exist
if not os.path.isfile(vocab_file):
with open(vocab_file, 'w', encoding='utf-8') as f:
# Get vocab dict and sort by token id
vocab_dict = modern_tokenizer.get_vocab()
sorted_vocab = sorted(vocab_dict.items(), key=lambda x: x[1])
for token, idx in sorted_vocab:
f.write(token + '\n')

# Set max_len if specified
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)

# Instantiate our custom tokenizer with the vocab file
tokenizer = cls(vocab_file, *inputs, **kwargs)
return tokenizer

except Exception as e:
import traceback
traceback.print_exc()
msg.fail(f"Could not load tokenizer '{pretrained_model_name_or_path}': {e}")
return None
# if resolved_vocab_file == vocab_file:
# msg.info("loading vocabulary file {}".format(vocab_file))
# else:
# msg.info("loading vocabulary file {} from cache at {}".format(
# vocab_file, resolved_vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
return tokenizer


class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
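The vocab file is now rebuilt from `get_vocab()` instead of being fetched with `cached_path`. A quick sanity-check sketch, using only standard transformers calls, showing that writing the vocab sorted by token id preserves the line-number == token-id layout the legacy loader expects:

```python
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
vocab = tok.get_vocab()  # token -> id mapping

# Sort by id, exactly as in the patched from_pretrained() above.
tokens_by_id = [token for token, _ in sorted(vocab.items(), key=lambda kv: kv[1])]

# Line number in the rebuilt file equals the token id.
assert tokens_by_id[tok.convert_tokens_to_ids("[PAD]")] == "[PAD]"
assert tokens_by_id[tok.convert_tokens_to_ids("[CLS]")] == "[CLS]"
```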
51 changes: 41 additions & 10 deletions opennyai/utils/download.py
@@ -2,21 +2,38 @@
import subprocess
import sys
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download

"""Functions for downloading opennyai ner models."""

PIP_INSTALLER_URLS = {
"en_legal_ner_trf": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/en_legal_ner_trf-any-py3-none-any.whl",
"en_legal_ner_sm": "https://huggingface.co/opennyaiorg/en_legal_ner_sm/resolve/main/en_legal_ner_sm-any-py3-none-any.whl",
"en_core_web_md": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_md-3.2.0-py3-none-any.whl",
"en_core_web_sm": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_sm-3.2.0-py3-none-any.whl",
"en_core_web_trf": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_trf-3.2.0-py3-none-any.whl"}
"en_core_web_trf": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_trf-3.2.0-py3-none-any.whl"
}

TORCH_PT_MODEL_URLS = {
"RhetoricalRole": "https://huggingface.co/opennyaiorg/InRhetoricalRoles/resolve/main/InRhetoricalRoleModel.pt",
"ExtractiveSummarizer": "https://huggingface.co/opennyaiorg/InExtractiveSummarizer/resolve/main/InExtractiveSummarizerModel.pt"
}
CACHE_DIR = os.path.join(str(Path.home()), '.opennyai')

# Model repo info for new hf_hub_download API
HF_MODEL_REPOS = {
"RhetoricalRole": {
"repo_id": "opennyaiorg/InRhetoricalRoles",
"filename": "InRhetoricalRoleModel.pt"
},
"ExtractiveSummarizer": {
"repo_id": "opennyaiorg/InExtractiveSummarizer",
"filename": "InExtractiveSummarizerModel.pt"
}
}

# Use /tmp for HuggingFace Spaces (writable), fallback to home directory
CACHE_DIR = os.getenv('OPENNYAI_CACHE_DIR', os.path.join('/tmp', 'opennyai'))


def install(package: str):
@@ -26,7 +43,8 @@ def install(package: str):
package (string): wheel file url
"""
subprocess.check_call(
[sys.executable, "-m", "pip", "install", package, "--no-deps"], stdout=subprocess.DEVNULL
[sys.executable, "-m", "pip", "install", package, "--no-deps"],
stdout=subprocess.DEVNULL
)


@@ -36,10 +54,23 @@ def load_model_from_cache(model_name: str):
Args:
model_name (string): model name to download and save
"""
if TORCH_PT_MODEL_URLS.get(model_name) is None:
if model_name not in HF_MODEL_REPOS:
raise RuntimeError(f'{model_name} is not supported by opennyai, please check the name!')
else:
model_url = TORCH_PT_MODEL_URLS[model_name]
os.makedirs(os.path.join(CACHE_DIR, model_name.lower()), exist_ok=True)
return torch.hub.load_state_dict_from_url(model_url, model_dir=os.path.join(CACHE_DIR, model_name.lower()),
check_hash=True, map_location=torch.device('cpu'))

model_info = HF_MODEL_REPOS[model_name]
cache_dir = os.path.join(CACHE_DIR, model_name.lower())
os.makedirs(cache_dir, exist_ok=True)

# Download using new huggingface_hub API
print(f'Downloading: "{model_info["repo_id"]}/{model_info["filename"]}" to {cache_dir}')

model_path = hf_hub_download(
repo_id=model_info["repo_id"],
filename=model_info["filename"],
cache_dir=cache_dir
)

# Load the model
state_dict = torch.load(model_path, map_location=torch.device('cpu'))

return state_dict
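Usage stays the same from the caller's side; a short sketch of the new code path, with the import path mirroring the file path in this diff:

```python
from opennyai.utils.download import load_model_from_cache

# Fetches the checkpoint from the Hub on first use (cached afterwards) and
# returns whatever torch.load() yields on CPU, mirroring the old
# torch.hub.load_state_dict_from_url behaviour.
state_dict = load_model_from_cache("RhetoricalRole")
print(type(state_dict))
```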
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -8,19 +8,19 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8 || ^3.9 || ^3.10"

spacy = ">=3.2.2,<3.3.0"
spacy = ">=3.7.0,<3.8.0" # CHANGED: Updated for Pydantic v2 compatibility
pandas = ">1.2.4"
beautifulsoup4 = ">=4.10.0"
torch = ">=1.12.1"
levenshtein = "^0.23.0"
multiprocess = "^0.70.15"
spacy-transformers = ">=1.1.4"
spacy-transformers = ">=1.3.0,<1.4.0" # CHANGED: Updated for spaCy 3.7.x compatibility
scikit-learn = "^1.3.2"
pytest = "^7.4.3"
prettytable = ">=3.1.1"
nltk = ">=3.6"
pytorch-transformers = "^1.2.0"
transformers = ">=4.30.0,<4.40.0" # CHANGED: Replaced 'pytorch-transformers' with 'transformers'

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
build-backend = "poetry.core.masonry.api"
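A quick post-install sanity check for the new pins could look like the sketch below; the expected versions are taken from the constraints above:

```python
# Verify the updated dependency set resolved together after reinstalling.
import pydantic
import spacy
import transformers

print("spacy        ", spacy.__version__)         # expect 3.7.x
print("transformers ", transformers.__version__)  # expect >=4.30,<4.40
print("pydantic     ", pydantic.__version__)      # v2 is now usable via spaCy 3.7
```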