21 commits
71cb56e
feat: Update spaCy and transformers dependencies for Pydantic v2 comp…
Flyboy990 Oct 4, 2025
3d3cb8e
fix: Update pytorch_transformers import to transformers in opennyai c…
Flyboy990 Oct 4, 2025
9764fde
fix: Update cached_path import in tokenization.py
Flyboy990 Oct 4, 2025
b1d610b
fix: Import cached_path directly from transformers
Flyboy990 Oct 4, 2025
8fb49e2
Accept plain text input for RR preprocessing
Flyboy990 Oct 6, 2025
e1900ba
Fix temp_dir for containerized environments
Flyboy990 Oct 7, 2025
c6ee324
Force HF cache env vars in Bert.__init__ to fix HF Space permissions
Flyboy990 Oct 7, 2025
0654361
Fix Bert class indentation and force HF cache env vars
Flyboy990 Oct 7, 2025
d8d051c
Fix transformers 4.30.0 compatibility - replace cached_path
Flyboy990 Oct 7, 2025
1f17bb4
Fix RhetoricalRole cache path for HF Space permissions
Flyboy990 Oct 7, 2025
423d957
Fix cached_path compatibility with transformers 4.x and huggingface_hub
Flyboy990 Oct 7, 2025
509a96b
Fix indentation in initialize() method
Flyboy990 Oct 7, 2025
679d060
Fix strict=False for position_ids compatibility
Flyboy990 Oct 7, 2025
8bf0fdc
Fix tokenizer vocab loading for modern transformers
Flyboy990 Oct 7, 2025
4cb3cf8
Fix cache directory to use /tmp for HF Spaces
Flyboy990 Oct 7, 2025
bebd154
Fix vocab file creation using get_vocab() method
Flyboy990 Oct 8, 2025
c35ca53
Fix BERT forward call to use keyword arguments
Flyboy990 Oct 8, 2025
ea791d4
Fix indentation in Bert.forward() method
Flyboy990 Oct 8, 2025
34f5b37
Fix BERT output extraction for modern transformers
Flyboy990 Oct 8, 2025
537437c
Remove redundant env vars, ensure cache dir exists
Flyboy990 Oct 8, 2025
622317a
Fix Bert class indentation after env removal
Flyboy990 Oct 8, 2025
37 changes: 20 additions & 17 deletions opennyai/rhetorical_roles/infer_data_prep.py
@@ -34,39 +34,42 @@ def attach_short_sentence_boundries_to_next(revised_sentence_boundries, doc_txt)

def split_into_sentences_tokenize_write(data, custom_processed_data_path,
hsln_format_txt_dirpath='datasets/pubmed-20k', verbose=False):
########## This function accepts the input files in LS format, creates tokens and writes them with label as "NONE" to text file
########## This function accepts plain text input and processes it for rhetorical role prediction

if not os.path.exists(hsln_format_txt_dirpath):
os.makedirs(hsln_format_txt_dirpath)

# Load spaCy model once
nlp = spacy.load("en_core_web_sm")

max_length = 10000
output_json = []
filename_sent_boundries = {} ###### key is the filename and value is dict containing sentence spans {"abc.txt":{"sentence_span":[(1,10),(11,20),...]} , "pqr.txt":{...},...}
filename_sent_boundries = {}

if verbose:
msg.info('Preprocessing rhetorical role model input!!!')

for data_dict in tqdm(data, disable=not verbose):

doc_id = data_dict['file_id']
preamble_doc = data_dict['preamble_doc']
judgment_doc = data_dict['judgement_doc']

if filename_sent_boundries.get(doc_id) is None: ##### Ignore if the file is already present
text = data_dict['text'] # Plain text string now

nlp_doc = spacy.tokens.Doc.from_docs([preamble_doc, judgment_doc])
if filename_sent_boundries.get(doc_id) is None:
# Process text with spaCy
nlp_doc = nlp(text)
doc_txt = nlp_doc.text

sentence_boundries = [(sent.start_char, sent.end_char) for sent in nlp_doc.sents]
revised_sentence_boundries = attach_short_sentence_boundries_to_next(sentence_boundries, doc_txt)

adjudicated_doc = {'id': doc_id,
'data': {'preamble_text': preamble_doc.text,
'judgement_text': judgment_doc.text,
'text': doc_txt}
}
adjudicated_doc = {
'id': doc_id,
'data': {'text': doc_txt}
}

adjudicated_doc['annotations'] = []
adjudicated_doc['annotations'].append({})
adjudicated_doc['annotations'] = []

filename_sent_boundries[doc_id] = {"sentence_span": []}

for sentence_boundry in revised_sentence_boundries:
sentence_txt = doc_txt[sentence_boundry[0]:sentence_boundry[1]]

@@ -79,11 +82,11 @@ def split_into_sentences_tokenize_write(data, custom_processed_data_path,
sent_data['labels'] = []
adjudicated_doc['annotations'].append(sent_data)

output_json.append(adjudicated_doc)
output_json.append(adjudicated_doc)

with open(custom_processed_data_path, 'w+') as f:
json.dump(output_json, f)


def write_in_hsln_format(input_json, hsln_format_txt_dirpath, tokenizer):
# tokenizer = BertTokenizer.from_pretrained(BERT_VOCAB, do_lower_case=True)
json_format = json.load(open(input_json))
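Note: with this change `split_into_sentences_tokenize_write` accepts plain text rather than pre-built preamble/judgement spaCy Docs. A minimal sketch of how a caller might now assemble the input, assuming `en_core_web_sm` is installed; the output path is only an example, not taken from this PR:

```python
from opennyai.rhetorical_roles.infer_data_prep import split_into_sentences_tokenize_write

# Each entry now carries the raw judgment text instead of 'preamble_doc'/'judgement_doc' spaCy Docs.
data = [
    {"file_id": "judgment_001.txt",
     "text": "The appeal is allowed. The order of the High Court is set aside."},
    {"file_id": "judgment_002.txt",
     "text": "Heard learned counsel for the parties. Leave granted."},
]

# hsln_format_txt_dirpath keeps its default; the JSON path below is an example.
split_into_sentences_tokenize_write(
    data,
    custom_processed_data_path="/tmp/opennyai_temp/processed_input.json",
    verbose=True,
)
```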
13 changes: 10 additions & 3 deletions opennyai/rhetorical_roles/rhetorical_roles.py
@@ -14,7 +14,11 @@
from .eval import infer_model
from .models import BertHSLN
from .task import pubmed_task

# Ensure cache directory exists and is writable
import os
if 'OPENNYAI_CACHE_DIR' not in os.environ:
os.environ['OPENNYAI_CACHE_DIR'] = '/tmp/opennyai_temp'
os.makedirs('/tmp/opennyai_temp', exist_ok=True)

class RhetoricalRolePredictor():

@@ -48,8 +52,11 @@ def initialize(self):
Instantiates Tokenizer for preprocessor to use
Loads labels to name mapping file for post-processing inference response
"""
self.CACHE_DIR = os.path.join(str(Path.home()), '.opennyai')
self.hsln_format_txt_dirpath = os.path.join(self.CACHE_DIR, 'temp_hsln/pubmed-20k', )
# Force writable cache directory for HF Space
self.CACHE_DIR = os.getenv('OPENNYAI_CACHE_DIR', '/tmp/opennyai_temp')
os.makedirs(self.CACHE_DIR, exist_ok=True)

self.hsln_format_txt_dirpath = os.path.join(self.CACHE_DIR, 'temp_hsln/pubmed-20k')
os.makedirs(self.hsln_format_txt_dirpath, exist_ok=True)

if self.use_gpu:
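Note: the cache location is now driven by `OPENNYAI_CACHE_DIR`, falling back to `/tmp/opennyai_temp`. A small sketch of how a deployment could point it elsewhere; the override has to happen before the import because the module-level default is applied at import time. The import path mirrors the file path in this diff, and the target directory is only an example:

```python
import os

# Set a writable cache location *before* importing opennyai; if the variable is
# unset, the module-level code above falls back to /tmp/opennyai_temp.
os.environ["OPENNYAI_CACHE_DIR"] = "/data/opennyai_cache"  # example path

from opennyai.rhetorical_roles.rhetorical_roles import RhetoricalRolePredictor

# initialize() will now create temp_hsln/pubmed-20k under the override.
```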
23 changes: 17 additions & 6 deletions opennyai/summarizer/models/model_builder.py
@@ -3,7 +3,7 @@

import torch
import torch.nn as nn
from pytorch_transformers import BertModel, BertConfig
from transformers import BertModel, BertConfig
from torch.nn.init import xavier_uniform_

from opennyai.utils.download import CACHE_DIR
@@ -120,20 +120,31 @@ def get_generator(vocab_size, dec_hidden_size, device):
class Bert(nn.Module):
def __init__(self, large, temp_dir=EXTRACTIVE_SUMMARIZER_CACHE_PATH, finetune=False):
super(Bert, self).__init__()

# Ensure cache directory exists and is writable
os.makedirs(temp_dir, exist_ok=True)

if (large):
self.model = BertModel.from_pretrained('bert-large-uncased', cache_dir=temp_dir)
else:
self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)

self.finetune = finetune

def forward(self, x, segs, mask):
if (self.finetune):
top_vec, _ = self.model(x, segs, attention_mask=mask)
outputs = self.model(input_ids=x, token_type_ids=segs, attention_mask=mask)
else:
self.eval()
with torch.no_grad():
top_vec, _ = self.model(x, segs, attention_mask=mask)
outputs = self.model(input_ids=x, token_type_ids=segs, attention_mask=mask)

# Extract last_hidden_state from outputs
if hasattr(outputs, 'last_hidden_state'):
top_vec = outputs.last_hidden_state
else:
top_vec = outputs[0]

return top_vec


@@ -166,7 +177,7 @@ def __init__(self, args, device, checkpoint):
self.bert.model.embeddings.position_embeddings = my_pos_embeddings

if checkpoint is not None:
self.load_state_dict(checkpoint['model'], strict=True)
self.load_state_dict(checkpoint['model'], strict=False)
else:
if args.param_init != 0.0:
for p in self.ext_layer.parameters():
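Context for the `forward()` rewrite: since transformers v4, `BertModel` returns a `ModelOutput` object by default rather than a `(sequence_output, pooled_output)` tuple, so the old `top_vec, _ = self.model(...)` unpacking breaks. A standalone sketch of the same extraction pattern, using only standard transformers calls and independent of this repo:

```python
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

enc = tokenizer("The appeal is allowed.", return_tensors="pt")
with torch.no_grad():
    outputs = model(
        input_ids=enc["input_ids"],
        token_type_ids=enc["token_type_ids"],
        attention_mask=enc["attention_mask"],
    )

# Same pattern as the patched forward(): prefer the attribute, fall back to
# positional indexing for return_dict=False configurations.
top_vec = outputs.last_hidden_state if hasattr(outputs, "last_hidden_state") else outputs[0]
print(top_vec.shape)  # (batch_size, sequence_length, hidden_size)
```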
2 changes: 1 addition & 1 deletion opennyai/summarizer/others/args.py
@@ -18,7 +18,7 @@ def __setargs__():
parser.bert_data_path = EXTRACTIVE_SUMMARIZER_CACHE_PATH
parser.model_path = EXTRACTIVE_SUMMARIZER_CACHE_PATH
parser.result_path = EXTRACTIVE_SUMMARIZER_CACHE_PATH
parser.temp_dir = EXTRACTIVE_SUMMARIZER_CACHE_PATH
parser.temp_dir = '/tmp/huggingface'
parser.batch_size = 5000
parser.test_batch_size = 1
parser.max_pos = 512
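Hard-coding `/tmp/huggingface` is fine for HF Spaces but Linux-specific. If portability ever matters, a possible variant (not part of this PR) would honour the standard `HF_HOME` override and otherwise fall back to the platform temp directory:

```python
import os
import tempfile

# Prefer an explicit override, otherwise use the system temp dir.
temp_dir = os.getenv("HF_HOME", os.path.join(tempfile.gettempdir(), "huggingface"))
os.makedirs(temp_dir, exist_ok=True)
```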
66 changes: 35 additions & 31 deletions opennyai/summarizer/others/tokenization.py
@@ -21,7 +21,6 @@
import unicodedata
from io import open

from pytorch_transformers import cached_path
from wasabi import msg

from opennyai.utils.download import CACHE_DIR
@@ -137,38 +136,43 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=EXTRACTIVE_SUM
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = pretrained_model_name_or_path
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
# redirect to the cache, if necessary
from transformers import BertTokenizer as ModernBertTokenizer

try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
msg.fail(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
# Let transformers handle the download and caching
modern_tokenizer = ModernBertTokenizer.from_pretrained(
pretrained_model_name_or_path,
do_lower_case=kwargs.get('do_lower_case', True),
cache_dir=cache_dir
)

# Create vocab file from the modern tokenizer's vocab
os.makedirs(cache_dir, exist_ok=True)
vocab_file = os.path.join(cache_dir, f'{pretrained_model_name_or_path.replace("/", "_")}_vocab.txt')

# Write vocab to file if it doesn't exist
if not os.path.isfile(vocab_file):
with open(vocab_file, 'w', encoding='utf-8') as f:
# Get vocab dict and sort by token id
vocab_dict = modern_tokenizer.get_vocab()
sorted_vocab = sorted(vocab_dict.items(), key=lambda x: x[1])
for token, idx in sorted_vocab:
f.write(token + '\n')

# Set max_len if specified
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)

# Instantiate our custom tokenizer with the vocab file
tokenizer = cls(vocab_file, *inputs, **kwargs)
return tokenizer

except Exception as e:
import traceback
traceback.print_exc()
msg.fail(f"Could not load tokenizer '{pretrained_model_name_or_path}': {e}")
return None
# if resolved_vocab_file == vocab_file:
# msg.info("loading vocabulary file {}".format(vocab_file))
# else:
# msg.info("loading vocabulary file {} from cache at {}".format(
# vocab_file, resolved_vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
return tokenizer


class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
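The vocab file is now rebuilt from `get_vocab()` instead of being fetched with `cached_path`. A quick sanity-check sketch, using only standard transformers calls, showing that writing the vocab sorted by token id preserves the line-number == token-id layout the legacy loader expects:

```python
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
vocab = tok.get_vocab()  # token -> id mapping

# Sort by id, exactly as in the patched from_pretrained() above.
tokens_by_id = [token for token, _ in sorted(vocab.items(), key=lambda kv: kv[1])]

# Line number in the rebuilt file equals the token id.
assert tokens_by_id[tok.convert_tokens_to_ids("[PAD]")] == "[PAD]"
assert tokens_by_id[tok.convert_tokens_to_ids("[CLS]")] == "[CLS]"
```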
51 changes: 41 additions & 10 deletions opennyai/utils/download.py
@@ -2,21 +2,38 @@
import subprocess
import sys
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download

"""Functions for downloading opennyai ner models."""

PIP_INSTALLER_URLS = {
"en_legal_ner_trf": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/en_legal_ner_trf-any-py3-none-any.whl",
"en_legal_ner_sm": "https://huggingface.co/opennyaiorg/en_legal_ner_sm/resolve/main/en_legal_ner_sm-any-py3-none-any.whl",
"en_core_web_md": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_md-3.2.0-py3-none-any.whl",
"en_core_web_sm": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_sm-3.2.0-py3-none-any.whl",
"en_core_web_trf": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_trf-3.2.0-py3-none-any.whl"}
"en_core_web_trf": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_trf-3.2.0-py3-none-any.whl"
}

TORCH_PT_MODEL_URLS = {
"RhetoricalRole": "https://huggingface.co/opennyaiorg/InRhetoricalRoles/resolve/main/InRhetoricalRoleModel.pt",
"ExtractiveSummarizer": "https://huggingface.co/opennyaiorg/InExtractiveSummarizer/resolve/main/InExtractiveSummarizerModel.pt"
}
CACHE_DIR = os.path.join(str(Path.home()), '.opennyai')

# Model repo info for new hf_hub_download API
HF_MODEL_REPOS = {
"RhetoricalRole": {
"repo_id": "opennyaiorg/InRhetoricalRoles",
"filename": "InRhetoricalRoleModel.pt"
},
"ExtractiveSummarizer": {
"repo_id": "opennyaiorg/InExtractiveSummarizer",
"filename": "InExtractiveSummarizerModel.pt"
}
}

# Use /tmp for HuggingFace Spaces (writable), fallback to home directory
CACHE_DIR = os.getenv('OPENNYAI_CACHE_DIR', os.path.join('/tmp', 'opennyai'))


def install(package: str):
@@ -26,7 +43,8 @@ def install(package: str):
package (string): wheel file url
"""
subprocess.check_call(
[sys.executable, "-m", "pip", "install", package, "--no-deps"], stdout=subprocess.DEVNULL
[sys.executable, "-m", "pip", "install", package, "--no-deps"],
stdout=subprocess.DEVNULL
)


@@ -36,10 +54,23 @@ def load_model_from_cache(model_name: str):
Args:
model_name (string): model name to download and save
"""
if TORCH_PT_MODEL_URLS.get(model_name) is None:
if model_name not in HF_MODEL_REPOS:
raise RuntimeError(f'{model_name} is not supported by opennyai, please check the name!')
else:
model_url = TORCH_PT_MODEL_URLS[model_name]
os.makedirs(os.path.join(CACHE_DIR, model_name.lower()), exist_ok=True)
return torch.hub.load_state_dict_from_url(model_url, model_dir=os.path.join(CACHE_DIR, model_name.lower()),
check_hash=True, map_location=torch.device('cpu'))

model_info = HF_MODEL_REPOS[model_name]
cache_dir = os.path.join(CACHE_DIR, model_name.lower())
os.makedirs(cache_dir, exist_ok=True)

# Download using new huggingface_hub API
print(f'Downloading: "{model_info["repo_id"]}/{model_info["filename"]}" to {cache_dir}')

model_path = hf_hub_download(
repo_id=model_info["repo_id"],
filename=model_info["filename"],
cache_dir=cache_dir
)

# Load the model
state_dict = torch.load(model_path, map_location=torch.device('cpu'))

return state_dict
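Usage stays the same from the caller's side; a short sketch of the new code path, with the import path mirroring the file path in this diff:

```python
from opennyai.utils.download import load_model_from_cache

# Fetches the checkpoint from the Hub on first use (cached afterwards) and
# returns whatever torch.load() yields on CPU, mirroring the old
# torch.hub.load_state_dict_from_url behaviour.
state_dict = load_model_from_cache("RhetoricalRole")
print(type(state_dict))
```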
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -8,19 +8,19 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8 || ^3.9 || ^3.10"

spacy = ">=3.2.2,<3.3.0"
spacy = ">=3.7.0,<3.8.0" # CHANGED: Updated for Pydantic v2 compatibility
pandas = ">1.2.4"
beautifulsoup4 = ">=4.10.0"
torch = ">=1.12.1"
levenshtein = "^0.23.0"
multiprocess = "^0.70.15"
spacy-transformers = ">=1.1.4"
spacy-transformers = ">=1.3.0,<1.4.0" # CHANGED: Updated for spaCy 3.7.x compatibility
scikit-learn = "^1.3.2"
pytest = "^7.4.3"
prettytable = ">=3.1.1"
nltk = ">=3.6"
pytorch-transformers = "^1.2.0"
transformers = ">=4.30.0,<4.40.0" # CHANGED: Replaced 'pytorch-transformers' with 'transformers'

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
build-backend = "poetry.core.masonry.api"
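A quick post-install sanity check for the new pins could look like the sketch below; the expected versions are taken from the constraints above:

```python
# Verify the updated dependency set resolved together after reinstalling.
import pydantic
import spacy
import transformers

print("spacy        ", spacy.__version__)         # expect 3.7.x
print("transformers ", transformers.__version__)  # expect >=4.30,<4.40
print("pydantic     ", pydantic.__version__)      # v2 is now usable via spaCy 3.7
```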