diff --git a/opennyai/rhetorical_roles/infer_data_prep.py b/opennyai/rhetorical_roles/infer_data_prep.py
index 11e8e64..13da4f2 100644
--- a/opennyai/rhetorical_roles/infer_data_prep.py
+++ b/opennyai/rhetorical_roles/infer_data_prep.py
@@ -34,39 +34,42 @@ def attach_short_sentence_boundries_to_next(revised_sentence_boundries, doc_txt)
 def split_into_sentences_tokenize_write(data, custom_processed_data_path,
                                         hsln_format_txt_dirpath='datasets/pubmed-20k', verbose=False):
-    ########## This function accepts the input files in LS format, creates tokens and writes them with label as "NONE" to text file
+    ########## This function accepts plain text input and processes it for rhetorical role prediction
     if not os.path.exists(hsln_format_txt_dirpath):
         os.makedirs(hsln_format_txt_dirpath)
+
+    # Load spaCy model once
+    nlp = spacy.load("en_core_web_sm")
+    max_length = 10000
     output_json = []
-    filename_sent_boundries = {}  ###### key is the filename and value is dict containing sentence spans {"abc.txt":{"sentence_span":[(1,10),(11,20),...]} , "pqr.txt":{...},...}
+    filename_sent_boundries = {}
+
     if verbose:
         msg.info('Preprocessing rhetorical role model input!!!')
+
     for data_dict in tqdm(data, disable=not verbose):
-        doc_id = data_dict['file_id']
-        preamble_doc = data_dict['preamble_doc']
-        judgment_doc = data_dict['judgement_doc']
-
-        if filename_sent_boundries.get(doc_id) is None:  ##### Ignore if the file is already present
+        doc_id = data_dict['file_id']
+        text = data_dict['text']  # Plain text string now
 
-            nlp_doc = spacy.tokens.Doc.from_docs([preamble_doc, judgment_doc])
+        if filename_sent_boundries.get(doc_id) is None:  # Skip files that were already processed
+            # Process text with spaCy
+            nlp_doc = nlp(text)
             doc_txt = nlp_doc.text
+
             sentence_boundries = [(sent.start_char, sent.end_char) for sent in nlp_doc.sents]
             revised_sentence_boundries = attach_short_sentence_boundries_to_next(sentence_boundries, doc_txt)
 
-            adjudicated_doc = {'id': doc_id,
-                               'data': {'preamble_text': preamble_doc.text,
-                                        'judgement_text': judgment_doc.text,
-                                        'text': doc_txt}
-                               }
+            adjudicated_doc = {
+                'id': doc_id,
+                'data': {'text': doc_txt}
+            }
 
             adjudicated_doc['annotations'] = []
-            adjudicated_doc['annotations'].append({})
-            adjudicated_doc['annotations'] = []
             filename_sent_boundries[doc_id] = {"sentence_span": []}
+
             for sentence_boundry in revised_sentence_boundries:
                 sentence_txt = doc_txt[sentence_boundry[0]:sentence_boundry[1]]
@@ -79,11 +82,11 @@ def split_into_sentences_tokenize_write(data, custom_processed_data_path,
                     sent_data['labels'] = []
                     adjudicated_doc['annotations'].append(sent_data)
 
-            output_json.append(adjudicated_doc)
+            output_json.append(adjudicated_doc)
+
     with open(custom_processed_data_path, 'w+') as f:
         json.dump(output_json, f)
 
-
 def write_in_hsln_format(input_json, hsln_format_txt_dirpath, tokenizer):
     # tokenizer = BertTokenizer.from_pretrained(BERT_VOCAB, do_lower_case=True)
     json_format = json.load(open(input_json))
diff --git a/opennyai/rhetorical_roles/rhetorical_roles.py b/opennyai/rhetorical_roles/rhetorical_roles.py
index 58df5e9..fd30f6d 100644
--- a/opennyai/rhetorical_roles/rhetorical_roles.py
+++ b/opennyai/rhetorical_roles/rhetorical_roles.py
@@ -14,7 +14,11 @@ from .eval import infer_model
 from .models import BertHSLN
 from .task import pubmed_task
 
-
+# Ensure the cache directory exists and is writable (needed on HF Spaces)
+import os
+if 'OPENNYAI_CACHE_DIR' not in os.environ:
+    os.environ['OPENNYAI_CACHE_DIR'] = '/tmp/opennyai_temp'
+os.makedirs(os.environ['OPENNYAI_CACHE_DIR'], exist_ok=True)
 
 class RhetoricalRolePredictor():
 
@@ -48,8 +52,11 @@ def initialize(self):
         Instantiates Tokenizer for preprocessor to use
         Loads labels to name mapping file for post-processing inference response
         """
-        self.CACHE_DIR = os.path.join(str(Path.home()), '.opennyai')
-        self.hsln_format_txt_dirpath = os.path.join(self.CACHE_DIR, 'temp_hsln/pubmed-20k', )
+        # Force a writable cache directory for the HF Space
+        self.CACHE_DIR = os.getenv('OPENNYAI_CACHE_DIR', '/tmp/opennyai_temp')
+        os.makedirs(self.CACHE_DIR, exist_ok=True)
+
+        self.hsln_format_txt_dirpath = os.path.join(self.CACHE_DIR, 'temp_hsln/pubmed-20k')
         os.makedirs(self.hsln_format_txt_dirpath, exist_ok=True)
 
         if self.use_gpu:
diff --git a/opennyai/summarizer/models/model_builder.py b/opennyai/summarizer/models/model_builder.py
index 7eccd4b..b146408 100644
--- a/opennyai/summarizer/models/model_builder.py
+++ b/opennyai/summarizer/models/model_builder.py
@@ -3,7 +3,7 @@
 import torch
 import torch.nn as nn
-from pytorch_transformers import BertModel, BertConfig
+from transformers import BertModel, BertConfig
 from torch.nn.init import xavier_uniform_
 
 from opennyai.utils.download import CACHE_DIR
@@ -120,20 +120,31 @@ def get_generator(vocab_size, dec_hidden_size, device):
 class Bert(nn.Module):
     def __init__(self, large, temp_dir=EXTRACTIVE_SUMMARIZER_CACHE_PATH, finetune=False):
         super(Bert, self).__init__()
+
+        # Ensure the cache directory exists and is writable
+        os.makedirs(temp_dir, exist_ok=True)
+
         if (large):
             self.model = BertModel.from_pretrained('bert-large-uncased', cache_dir=temp_dir)
         else:
             self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
-
+
         self.finetune = finetune
-
+
     def forward(self, x, segs, mask):
         if (self.finetune):
-            top_vec, _ = self.model(x, segs, attention_mask=mask)
+            outputs = self.model(input_ids=x, token_type_ids=segs, attention_mask=mask)
         else:
             self.eval()
             with torch.no_grad():
-                top_vec, _ = self.model(x, segs, attention_mask=mask)
+                outputs = self.model(input_ids=x, token_type_ids=segs, attention_mask=mask)
+
+        # transformers returns a ModelOutput object; pytorch_transformers returned a tuple
+        if hasattr(outputs, 'last_hidden_state'):
+            top_vec = outputs.last_hidden_state
+        else:
+            top_vec = outputs[0]
+
         return top_vec
@@ -166,7 +177,7 @@ def __init__(self, args, device, checkpoint):
             self.bert.model.embeddings.position_embeddings = my_pos_embeddings
 
         if checkpoint is not None:
-            self.load_state_dict(checkpoint['model'], strict=True)
+            self.load_state_dict(checkpoint['model'], strict=False)  # tolerate key renames from the transformers migration
         else:
             if args.param_init != 0.0:
                 for p in self.ext_layer.parameters():
diff --git a/opennyai/summarizer/others/args.py b/opennyai/summarizer/others/args.py
index 4d93822..27f39fa 100644
--- a/opennyai/summarizer/others/args.py
+++ b/opennyai/summarizer/others/args.py
@@ -18,7 +18,7 @@ def __setargs__():
     parser.bert_data_path = EXTRACTIVE_SUMMARIZER_CACHE_PATH
     parser.model_path = EXTRACTIVE_SUMMARIZER_CACHE_PATH
     parser.result_path = EXTRACTIVE_SUMMARIZER_CACHE_PATH
-    parser.temp_dir = EXTRACTIVE_SUMMARIZER_CACHE_PATH
+    parser.temp_dir = '/tmp/huggingface'
     parser.batch_size = 5000
     parser.test_batch_size = 1
     parser.max_pos = 512
diff --git a/opennyai/summarizer/others/tokenization.py b/opennyai/summarizer/others/tokenization.py
index 116f1a9..46564dc 100644
--- a/opennyai/summarizer/others/tokenization.py
+++ b/opennyai/summarizer/others/tokenization.py
@@ -21,7 +21,6 @@
 import unicodedata
 from io import open
 
-from pytorch_transformers import cached_path
 from wasabi import msg
 
 from opennyai.utils.download import CACHE_DIR
@@ -137,38 +136,43 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=EXTRACTIVE_SUM
         Instantiate a PreTrainedBertModel from a pre-trained model file.
         Download and cache the pre-trained model file if needed.
         """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            vocab_file = pretrained_model_name_or_path
-        if os.path.isdir(vocab_file):
-            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
-        # redirect to the cache, if necessary
+        from transformers import BertTokenizer as ModernBertTokenizer
+
         try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            msg.fail(
-                "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find any file "
-                "associated to this path or url.".format(
-                    pretrained_model_name_or_path,
-                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                    vocab_file))
+            # Let transformers handle the download and caching
+            modern_tokenizer = ModernBertTokenizer.from_pretrained(
+                pretrained_model_name_or_path,
+                do_lower_case=kwargs.get('do_lower_case', True),
+                cache_dir=cache_dir
+            )
+
+            # Materialize a vocab file for this legacy tokenizer from the modern tokenizer's vocab
+            os.makedirs(cache_dir, exist_ok=True)
+            vocab_file = os.path.join(cache_dir, f'{pretrained_model_name_or_path.replace("/", "_")}_vocab.txt')
+
+            # Write the vocab to file if it doesn't exist; BERT vocab ids are contiguous,
+            # so writing tokens sorted by id preserves the token -> id mapping
+            if not os.path.isfile(vocab_file):
+                with open(vocab_file, 'w', encoding='utf-8') as f:
+                    vocab_dict = modern_tokenizer.get_vocab()
+                    for token, _ in sorted(vocab_dict.items(), key=lambda x: x[1]):
+                        f.write(token + '\n')
+
+            # Cap max_len so sequences never exceed the positional embedding table
+            if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+                max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
+                kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
+
+            # Instantiate our custom tokenizer with the vocab file
+            tokenizer = cls(vocab_file, *inputs, **kwargs)
+            return tokenizer
+
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            msg.fail(f"Could not load tokenizer '{pretrained_model_name_or_path}': {e}")
             return None
-        # if resolved_vocab_file == vocab_file:
-        #     msg.info("loading vocabulary file {}".format(vocab_file))
-        # else:
-        #     msg.info("loading vocabulary file {} from cache at {}".format(
-        #         vocab_file, resolved_vocab_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
-        return tokenizer
-
 
 class BasicTokenizer(object):
     """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
diff --git a/opennyai/utils/download.py b/opennyai/utils/download.py
index 6b33fbc..3f941ca 100644
--- a/opennyai/utils/download.py
+++ b/opennyai/utils/download.py
@@ -2,21 +2,38 @@
 import subprocess
 import sys
 from pathlib import Path
-
 import torch
+from huggingface_hub import hf_hub_download
 
 """Functions for downloading opennyai ner models."""
+
 PIP_INSTALLER_URLS = {
     "en_legal_ner_trf": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/en_legal_ner_trf-any-py3-none-any.whl",
     "en_legal_ner_sm": "https://huggingface.co/opennyaiorg/en_legal_ner_sm/resolve/main/en_legal_ner_sm-any-py3-none-any.whl",
     "en_core_web_md": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_md-3.2.0-py3-none-any.whl",
     "en_core_web_sm": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_sm-3.2.0-py3-none-any.whl",
-    "en_core_web_trf": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_trf-3.2.0-py3-none-any.whl"}
+    "en_core_web_trf": "https://huggingface.co/opennyaiorg/en_legal_ner_trf/resolve/main/STOCK_SPACY_MODELS/en_core_web_trf-3.2.0-py3-none-any.whl"
+}
+
 TORCH_PT_MODEL_URLS = {
     "RhetoricalRole": "https://huggingface.co/opennyaiorg/InRhetoricalRoles/resolve/main/InRhetoricalRoleModel.pt",
     "ExtractiveSummarizer": "https://huggingface.co/opennyaiorg/InExtractiveSummarizer/resolve/main/InExtractiveSummarizerModel.pt"
 }
-CACHE_DIR = os.path.join(str(Path.home()), '.opennyai')
+
+# Model repo info for the new hf_hub_download API
+HF_MODEL_REPOS = {
+    "RhetoricalRole": {
+        "repo_id": "opennyaiorg/InRhetoricalRoles",
+        "filename": "InRhetoricalRoleModel.pt"
+    },
+    "ExtractiveSummarizer": {
+        "repo_id": "opennyaiorg/InExtractiveSummarizer",
+        "filename": "InExtractiveSummarizerModel.pt"
+    }
+}
+
+# Use /tmp for HuggingFace Spaces (writable); override via OPENNYAI_CACHE_DIR
+CACHE_DIR = os.getenv('OPENNYAI_CACHE_DIR', os.path.join('/tmp', 'opennyai'))
 
 
 def install(package: str):
@@ -26,7 +43,8 @@ def install(package: str):
         package (string): wheel file url
     """
     subprocess.check_call(
-        [sys.executable, "-m", "pip", "install", package, "--no-deps"], stdout=subprocess.DEVNULL
+        [sys.executable, "-m", "pip", "install", package, "--no-deps"],
+        stdout=subprocess.DEVNULL
     )
 
 
@@ -36,10 +54,23 @@ def load_model_from_cache(model_name: str):
     Args:
         model_name (string): model name to download and save
     """
-    if TORCH_PT_MODEL_URLS.get(model_name) is None:
+    if model_name not in HF_MODEL_REPOS:
         raise RuntimeError(f'{model_name} is not supported by opennyai, please check the name!')
-    else:
-        model_url = TORCH_PT_MODEL_URLS[model_name]
-        os.makedirs(os.path.join(CACHE_DIR, model_name.lower()), exist_ok=True)
-        return torch.hub.load_state_dict_from_url(model_url, model_dir=os.path.join(CACHE_DIR, model_name.lower()),
-                                                  check_hash=True, map_location=torch.device('cpu'))
+
+    model_info = HF_MODEL_REPOS[model_name]
+    cache_dir = os.path.join(CACHE_DIR, model_name.lower())
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # Download via the huggingface_hub API (reuses the cached copy if present)
+    print(f'Downloading: "{model_info["repo_id"]}/{model_info["filename"]}" to {cache_dir}')
+
+    model_path = hf_hub_download(
+        repo_id=model_info["repo_id"],
+        filename=model_info["filename"],
+        cache_dir=cache_dir
+    )
+
+    # Load the checkpoint onto CPU
+    state_dict = torch.load(model_path, map_location=torch.device('cpu'))
+
+    return state_dict
diff --git a/pyproject.toml b/pyproject.toml
index 0d75289..21a0924 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,19 +8,19 @@ readme = "README.md"
 
 [tool.poetry.dependencies]
 python = "^3.8 || ^3.9 || ^3.10"
-spacy = ">=3.2.2,<3.3.0"
+spacy = ">=3.7.0,<3.8.0"  # CHANGED: Updated for Pydantic v2 compatibility
 pandas = ">1.2.4"
 beautifulsoup4 = ">=4.10.0"
 torch = ">=1.12.1"
 levenshtein = "^0.23.0"
 multiprocess = "^0.70.15"
-spacy-transformers = ">=1.1.4"
+spacy-transformers = ">=1.3.0,<1.4.0"  # CHANGED: Updated for spaCy 3.7.x compatibility
 scikit-learn = "^1.3.2"
 pytest = "^7.4.3"
 prettytable = ">=3.1.1"
 nltk = ">=3.6"
-pytorch-transformers = "^1.2.0"
+transformers = ">=4.30.0,<4.40.0"  # CHANGED: Replaced 'pytorch-transformers' with 'transformers'
 
 [build-system]
 requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
+build-backend = "poetry.core.masonry.api"
\ No newline at end of file
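
A note on the model_builder.py change: in transformers (unlike pytorch_transformers), BertModel returns a ModelOutput object rather than a (sequence_output, pooled_output) tuple, which is what the new forward() accounts for. A minimal standalone sketch of that behaviour; the model name matches what the summarizer loads, but the input sentence is illustrative:

    # Why forward() now extracts last_hidden_state instead of tuple-unpacking.
    import torch
    from transformers import BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    enc = tokenizer("A short legal sentence.", return_tensors='pt')
    outputs = model(input_ids=enc['input_ids'],
                    token_type_ids=enc['token_type_ids'],
                    attention_mask=enc['attention_mask'])

    # Modern API: attribute access on a ModelOutput...
    top_vec = outputs.last_hidden_state     # shape (1, seq_len, 768)
    # ...while integer indexing still works, hence the hasattr() fallback
    assert torch.equal(top_vec, outputs[0])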
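
Similarly, the tokenization.py bridge relies on get_vocab() returning the full token-to-id mapping: because BERT vocab ids are contiguous from 0, writing tokens sorted by id reproduces the one-token-per-line vocab.txt format the legacy WordPiece tokenizer expects. A self-contained sketch, with an illustrative output path:

    # Rebuild a legacy vocab.txt from a modern tokenizer's vocabulary.
    from transformers import BertTokenizer

    tok = BertTokenizer.from_pretrained('bert-base-uncased')
    vocab = tok.get_vocab()  # dict: token -> id

    with open('/tmp/rebuilt_vocab.txt', 'w', encoding='utf-8') as f:  # illustrative path
        for token, _ in sorted(vocab.items(), key=lambda kv: kv[1]):
            f.write(token + '\n')

    # Sanity check: line number equals token id
    with open('/tmp/rebuilt_vocab.txt', encoding='utf-8') as f:
        lines = f.read().splitlines()
    assert lines[vocab['[CLS]']] == '[CLS]'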
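
Finally, on the download.py migration: hf_hub_download fetches the file into its own cache layout under cache_dir and returns the resolved local path, so repeat calls reuse the cached copy; torch.load then reads the checkpoint from that path. A quick sketch using the real repo and filename from HF_MODEL_REPOS; the cache_dir value is illustrative:

    # Exercising the migrated download path outside opennyai.
    import torch
    from huggingface_hub import hf_hub_download

    model_path = hf_hub_download(
        repo_id="opennyaiorg/InRhetoricalRoles",
        filename="InRhetoricalRoleModel.pt",
        cache_dir="/tmp/opennyai/rhetoricalrole",  # illustrative cache location
    )
    state_dict = torch.load(model_path, map_location=torch.device('cpu'))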