Merge pull request #23 from decodingml/feat/inference-pipeline-tweaks
Feat/inference pipeline tweaks
Showing 13 changed files with 199 additions and 28 deletions.
New file: finetuning/logger_utils.py
@@ -0,0 +1,5 @@
import structlog


def get_logger(cls: str):
    return structlog.get_logger().bind(cls=cls)
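For context, a minimal usage sketch of this helper (the bound class name and event fields below are illustrative, not from the diff):

from finetuning.logger_utils import get_logger

# Illustrative: every event from this logger carries cls="InferenceService".
logger = get_logger("InferenceService")
logger.info("model_loaded", model_id="llm_twin")  # structlog emits the event plus bound key-value pairs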
Requirements file:
@@ -10,3 +10,4 @@ bitsandbytes==0.42.0
 pydantic_settings==2.2.1
 scikit-learn==1.4.2
 qwak-sdk==0.5.68
+structlog==24.2.0
New file in the finetuning package (QLoRA model utilities):
@@ -0,0 +1,131 @@
import os
from pathlib import Path
from typing import Optional, Tuple

import torch
from comet_ml import API
from peft import LoraConfig, PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from finetuning import logger_utils
from finetuning.settings import settings


logger = logger_utils.get_logger(__name__)


def build_qlora_model(
    pretrained_model_name_or_path: str,
    peft_pretrained_model_name_or_path: Optional[str] = None,
    bnb_config: Optional[BitsAndBytesConfig] = None,
    lora_config: Optional[LoraConfig] = None,
    cache_dir: Optional[Path] = None,
) -> Tuple[AutoModelForCausalLM, AutoTokenizer, PeftConfig]:
    """
    Build a QLoRA LLM based on the given HuggingFace model name:
    1. Create and prepare the bitsandbytes configuration for QLoRA's quantization.
    2. Download, load, and quantize Falcon-7B on the fly.
    3. Create and prepare the LoRA configuration.
    4. Load and configure Falcon-7B's tokenizer.
    """

    if bnb_config is None:
        # Default to 4-bit NF4 quantization with double quantization and bfloat16 compute.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path,
        token=settings.HUGGINGFACE_ACCESS_TOKEN,
        device_map=torch.cuda.current_device(),
        quantization_config=bnb_config,
        use_cache=False,
        torchscript=True,
        cache_dir=str(cache_dir) if cache_dir else None,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        token=settings.HUGGINGFACE_ACCESS_TOKEN,
        cache_dir=str(cache_dir) if cache_dir else None,
    )
    # The model has no dedicated pad token, so reuse EOS and pad on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    if peft_pretrained_model_name_or_path:
        # A registry ID (rather than a local directory) must be downloaded first.
        is_model_name = not os.path.isdir(peft_pretrained_model_name_or_path)
        if is_model_name:
            logger.info(
                f"Downloading {peft_pretrained_model_name_or_path} from CometML Model Registry:"
            )
            peft_pretrained_model_name_or_path = download_from_model_registry(
                model_id=peft_pretrained_model_name_or_path,
                cache_dir=cache_dir,
            )

        logger.info(f"Loading LoRA config from: {peft_pretrained_model_name_or_path}")
        lora_config = LoraConfig.from_pretrained(peft_pretrained_model_name_or_path)
        assert (
            lora_config.base_model_name_or_path == pretrained_model_name_or_path
        ), f"The LoRA adapter was trained on a different base model than the one requested: \
            {lora_config.base_model_name_or_path} != {pretrained_model_name_or_path}"

        logger.info(f"Loading PEFT model from: {peft_pretrained_model_name_or_path}")
        model = PeftModel.from_pretrained(model, peft_pretrained_model_name_or_path)
    else:
        if lora_config is None:
            lora_config = LoraConfig(
                lora_alpha=16,
                lora_dropout=0.1,
                r=64,
                bias="none",
                task_type="CAUSAL_LM",
            )

    return model, tokenizer, lora_config
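A minimal sketch of calling build_qlora_model (Falcon-7B matches the docstring; the registry ID and cache path are hypothetical):

model, tokenizer, lora_config = build_qlora_model(
    pretrained_model_name_or_path="tiiuae/falcon-7b",
    peft_pretrained_model_name_or_path="my-workspace/llm-twin:1.0.0",  # hypothetical "workspace/model_name:version" registry ID
    cache_dir=Path("./model_cache"),  # hypothetical cache location
)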
def download_from_model_registry(
    model_id: str, cache_dir: Optional[Path] = None
) -> Path:
    """
    Download a model from the Comet ML model registry.

    Args:
        model_id (str): The ID of the model to download, in the format "workspace/model_name:version".
        cache_dir (Optional[Path]): The directory to cache the downloaded model in. Defaults to
            `settings.CACHE_DIR`.

    Returns:
        Path: The path to the downloaded model directory.
    """

    if cache_dir is None:
        cache_dir = settings.CACHE_DIR
    output_folder = cache_dir / "models" / model_id

    already_downloaded = output_folder.exists()
    if not already_downloaded:
        # Split "workspace/model_name:version" into its registry coordinates.
        workspace, model_id = model_id.split("/")
        model_name, version = model_id.split(":")

        api = API()
        model = api.get_model(workspace=workspace, model_name=model_name)
        model.download(version=version, output_folder=output_folder, expand=True)
    else:
        logger.info(f"Model {model_id=} already downloaded to: {output_folder}")

    # The expanded archive should contain exactly one directory: the model itself.
    subdirs = [d for d in output_folder.iterdir() if d.is_dir()]
    if len(subdirs) == 1:
        model_dir = subdirs[0]
    else:
        raise RuntimeError(
            f"There should be only one directory inside the model folder. \
            Check the downloaded model at: {output_folder}"
        )

    logger.info(f"Model {model_id=} downloaded from the registry to: {model_dir}")

    return model_dir
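The registry helper can also be used on its own; a minimal sketch with a hypothetical model ID:

model_dir = download_from_model_registry(
    model_id="my-workspace/llm-twin:1.0.0",  # hypothetical "workspace/model_name:version" ID
)
# model_dir points at the single expanded directory under <cache_dir>/models/.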
Makefile:
@@ -1,6 +1,12 @@
 help:
 	@grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m:$$(echo $$l | cut -f 2- -d'#')\n"; done

-call-inference-pipeline: # Test the inference pipeline.
+deploy-llm-microservice: # Deploy the Qwak model.
+	qwak models deploy realtime --model-id "llm_twin" --instance "gpu.a10.2xl" --timeout 50000 --replicas 2 --server-workers 2
+
+undeploy-llm-microservice: # Undeploy the Qwak model.
+	qwak models undeploy --model-id "llm_twin"
+
+call-inference-pipeline: # Call the inference pipeline.
 	poetry run python main.py
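Assuming the qwak CLI is installed and authenticated, the new targets are invoked as usual:

make deploy-llm-microservice    # deploy llm_twin on two gpu.a10.2xl replicas
make call-inference-pipeline    # run main.py against the deployed model
make undeploy-llm-microservice  # tear the deployment down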