Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 40 additions & 26 deletions bitNet_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,46 +27,60 @@
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from llama_cpp import Llama # Import Llama for direct model usage

# ── Paths & CLI helper ─────────────────────────────────────────────────────────
# Absolute path to the quantized BitNet GGUF model weights.
MODEL_PATH = os.path.abspath("models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf")
# Path to the standalone BitNet CLI binary.
# NOTE(review): appears unused now that generation goes through llama_cpp — confirm before removing.
CLI_PATH = os.path.abspath("bin/bitnet-cli")

# Debug output in the Streamlit UI so a missing model file is immediately visible.
st.write(f"Loading model from: `{MODEL_PATH}`")
st.write(f"Exists? {os.path.exists(MODEL_PATH)}")

@st.cache_resource
def load_model():
    """
    Load the BitNet model once and cache it in memory for the Streamlit session.

    Returns the Llama instance on success, or None (after showing a Streamlit
    error) when the GGUF file is missing.

    n_gpu_layers=-1 enables full GPU offloading on Mac (Metal) if supported.
    """
    if not os.path.exists(MODEL_PATH):
        st.error(f"Model not found at `{MODEL_PATH}`.\n\nPlease run `python download_model.py` to download it.")
        return None

    return Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,      # Context window size
        n_threads=4,     # Number of threads
        n_gpu_layers=-1  # Offload all layers to GPU (Metal) if available
    )

# Load the model once at import time; st.cache_resource makes this a no-op on
# subsequent Streamlit reruns. May be None when the GGUF file is missing.
llm = load_model()

def generate_with_bitnet_stream(
    prompt: str,
    threads: int = 4,
    n_predict: int = 128,
    temp: float = 0.2,
    top_p: float = 0.9,
    max_tokens: int = 512
):
    """
    Stream generated text from the persistent (cached) Llama object, yielding
    each text chunk as soon as it is produced.

    Parameters
    ----------
    prompt : str
        The full prompt to feed the model.
    threads : int
        Unused since the move from the BitNet CLI to llama_cpp; kept so
        existing callers that still pass it keep working.
    n_predict : int
        Unused (CLI-era parameter); kept for backward compatibility.
    temp : float
        Sampling temperature.
    top_p : float
        Nucleus-sampling cutoff.
    max_tokens : int
        Maximum number of tokens to generate.

    Yields
    ------
    str
        Successive text chunks, or a single error message when the model
        failed to load.
    """
    if llm is None:
        yield "Error: Model not loaded."
        return

    stream = llm.create_completion(
        prompt,
        temperature=temp,
        top_p=top_p,
        max_tokens=max_tokens,
        stream=True
    )

    for output in stream:
        # Each streamed chunk follows the OpenAI-style completion schema:
        # {'choices': [{'text': ...}, ...], ...}
        text_chunk = output['choices'][0]['text']
        yield text_chunk

# ── Session state initialization ───────────────────────────────────────────────
# Keep the running conversation across Streamlit reruns; start with an empty history.
st.session_state.setdefault('chat_history', [])
Expand Down Expand Up @@ -145,7 +159,7 @@ def handle_query(question: str) -> str:
placeholder = st.empty()
output = ""

for ch in generate_with_bitnet_stream(prompt, threads=4, n_predict=128):
for ch in generate_with_bitnet_stream(prompt, max_tokens=256):
output += ch
placeholder.markdown(output)

Expand Down
23 changes: 23 additions & 0 deletions download_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

import os
from huggingface_hub import hf_hub_download

# Hugging Face Hub repository holding the quantized BitNet GGUF weights.
MODEL_REPO = "microsoft/bitnet-b1.58-2B-4T-gguf"
# The specific GGUF file to fetch from that repository.
MODEL_FILENAME = "ggml-model-i2_s.gguf"
# Local directory the app (bitNet_rag.py) expects the model to live in.
LOCAL_DIR = "models/BitNet-b1.58-2B-4T"

def download_model():
    """
    Download the BitNet GGUF model file from the Hugging Face Hub into LOCAL_DIR.

    Returns
    -------
    str | None
        Absolute path of the downloaded file on success, or None on failure.
        Errors are printed rather than raised so the script stays a
        best-effort helper.
    """
    print(f"Downloading {MODEL_FILENAME} from {MODEL_REPO}...")
    try:
        # local_dir_use_symlinks is deprecated (and ignored) in recent
        # huggingface_hub releases, so it is intentionally not passed.
        model_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILENAME,
            local_dir=LOCAL_DIR,
        )
    except Exception as e:
        print(f"Failed to download model: {e}")
        return None
    print(f"Model downloaded successfully to: {model_path}")
    return model_path

# Allow the module to be run directly as a one-off download script.
if __name__ == "__main__":
    download_model()
12 changes: 8 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
pip install --upgrade pip
pip install llama-cpp-python streamlit langchain langchain-text-splitters \
langchain-embeddings-transformers langchain-huggingface \
faiss-cpu sentence-transformers pymupdf
llama-cpp-python
streamlit
langchain
langchain-text-splitters
langchain-huggingface
faiss-cpu
sentence-transformers
pymupdf