Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 40 additions & 26 deletions bitNet_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,46 +27,60 @@
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from llama_cpp import Llama # Import Llama for direct model usage

# ── Paths & CLI helper ─────────────────────────────────────────────────────────
# Absolute path to the quantized BitNet GGUF model weights.
MODEL_PATH = os.path.abspath("models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf")
# Path to the standalone BitNet CLI binary.
# NOTE(review): appears unused now that generation goes through llama_cpp — confirm before removing.
CLI_PATH = os.path.abspath("bin/bitnet-cli")

# Debug output in the Streamlit UI so a missing model file is immediately visible.
st.write(f"Loading model from: `{MODEL_PATH}`")
st.write(f"Exists? {os.path.exists(MODEL_PATH)}")

@st.cache_resource
def load_model():
    """
    Load the BitNet model once and cache it in memory for the Streamlit session.

    Returns the Llama instance on success, or None (after showing a Streamlit
    error) when the GGUF file is missing.

    n_gpu_layers=-1 enables full GPU offloading on Mac (Metal) if supported.
    """
    if not os.path.exists(MODEL_PATH):
        st.error(f"Model not found at `{MODEL_PATH}`.\n\nPlease run `python download_model.py` to download it.")
        return None

    return Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,      # Context window size
        n_threads=4,     # Number of threads
        n_gpu_layers=-1  # Offload all layers to GPU (Metal) if available
    )

# Load the model once at import time; st.cache_resource makes this a no-op on
# subsequent Streamlit reruns. May be None when the GGUF file is missing.
llm = load_model()

def generate_with_bitnet_stream(
    prompt: str,
    threads: int = 4,
    n_predict: int = 128,
    temp: float = 0.2,
    top_p: float = 0.9,
    max_tokens: int = 512
):
    """
    Stream generated text from the persistent (cached) Llama object, yielding
    each text chunk as soon as it is produced.

    Parameters
    ----------
    prompt : str
        The full prompt to feed the model.
    threads : int
        Unused since the move from the BitNet CLI to llama_cpp; kept so
        existing callers that still pass it keep working.
    n_predict : int
        Unused (CLI-era parameter); kept for backward compatibility.
    temp : float
        Sampling temperature.
    top_p : float
        Nucleus-sampling cutoff.
    max_tokens : int
        Maximum number of tokens to generate.

    Yields
    ------
    str
        Successive text chunks, or a single error message when the model
        failed to load.
    """
    if llm is None:
        yield "Error: Model not loaded."
        return

    stream = llm.create_completion(
        prompt,
        temperature=temp,
        top_p=top_p,
        max_tokens=max_tokens,
        stream=True
    )

    for output in stream:
        # Each streamed chunk follows the OpenAI-style completion schema:
        # {'choices': [{'text': ...}, ...], ...}
        text_chunk = output['choices'][0]['text']
        yield text_chunk

# ── Session state initialization ───────────────────────────────────────────────
# Keep the running conversation across Streamlit reruns; start with an empty history.
st.session_state.setdefault('chat_history', [])
Expand Down Expand Up @@ -145,7 +159,7 @@ def handle_query(question: str) -> str:
placeholder = st.empty()
output = ""

for ch in generate_with_bitnet_stream(prompt, threads=4, n_predict=128):
for ch in generate_with_bitnet_stream(prompt, max_tokens=256):
output += ch
placeholder.markdown(output)

Expand Down
23 changes: 23 additions & 0 deletions download_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

import os
from huggingface_hub import hf_hub_download

# Hugging Face Hub repository holding the quantized BitNet GGUF weights.
MODEL_REPO = "microsoft/bitnet-b1.58-2B-4T-gguf"
# The specific GGUF file to fetch from that repository.
MODEL_FILENAME = "ggml-model-i2_s.gguf"
# Local directory the app (bitNet_rag.py) expects the model to live in.
LOCAL_DIR = "models/BitNet-b1.58-2B-4T"

def download_model():
    """
    Download the BitNet GGUF model file from the Hugging Face Hub into LOCAL_DIR.

    Returns
    -------
    str | None
        Absolute path of the downloaded file on success, or None on failure.
        Errors are printed rather than raised so the script stays a
        best-effort helper.
    """
    print(f"Downloading {MODEL_FILENAME} from {MODEL_REPO}...")
    try:
        # local_dir_use_symlinks is deprecated (and ignored) in recent
        # huggingface_hub releases, so it is intentionally not passed.
        model_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILENAME,
            local_dir=LOCAL_DIR,
        )
    except Exception as e:
        print(f"Failed to download model: {e}")
        return None
    print(f"Model downloaded successfully to: {model_path}")
    return model_path

# Allow the module to be run directly as a one-off download script.
if __name__ == "__main__":
    download_model()
12 changes: 8 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
pip install --upgrade pip
pip install llama-cpp-python streamlit langchain langchain-text-splitters \
langchain-embeddings-transformers langchain-huggingface \
faiss-cpu sentence-transformers pymupdf
llama-cpp-python
streamlit
langchain
langchain-text-splitters
langchain-huggingface
faiss-cpu
sentence-transformers
pymupdf