In Part 1 of this series, we installed MLX and ran a local large language model (LLM) like Mistral 7B Instruct directly on macOS.
Now, let’s take it a step further and build a web-based chat interface using Streamlit — so you can interact with your local model just like ChatGPT, right from your browser.
Why Streamlit?
Streamlit is a powerful Python library for turning scripts into web apps — no JavaScript required.
It’s perfect for creating an intuitive and fast chat UI that connects to your MLX model.
You’ll be able to:
- Chat with your local LLM through a browser
- Stream model responses in real time
- (Optional) Add support for PDFs, image inputs, or parameter tuning later
Step 1: Install Streamlit and Dependencies
If you already have your MLX environment set up (from Part 1), make sure you’re inside your virtual environment:
cd ~/dev/mlx-ui
source .venv/bin/activate
Then install the necessary packages:
pip install streamlit mlx torch torchvision sentencepiece
To test your Streamlit installation:
streamlit hello
This will open a demo app at http://localhost:8501.
Once you see the sample dashboard, close it with Ctrl+C.
Step 2: Create Your Streamlit App File
Inside your ~/dev/mlx-ui folder, create a new file named:
streamlit_app.py
Open it in your favorite code editor and add the following code. Remember to set the path to your local model in model_repo.
Find your local MLX model stored in the hidden cache folder by navigating to it in Terminal. You need the path to the folder that contains the config.json file, which is a subfolder within the snapshots folder:
open ~/.cache/huggingface/hub
import sys
import inspect
import streamlit as st
from mlx_lm import load, stream_generate
# ---------------------- Page & Sidebar --------------------
st.set_page_config(page_title="Local MLX Chat", layout="centered")

# All generation settings live in the sidebar so the chat area stays clean.
with st.sidebar:
    st.header("⚙️ Settings")
    model_repo = st.text_input(
        label="Model (HF repo or local path)",
        value="/Users/amanyadav/.cache/huggingface/hub/models--mlx-community--Mistral-7B-Instruct-v0.3-4bit/snapshots/a4b8f870474b0eb527f466a03fbc187830d271f5",
        help="Any MLX-compatible HF repo or a local path to an MLX-converted model.",
    )
    # Sampling controls passed through to stream_generate on every turn.
    temperature = st.slider(label="Temperature", min_value=0.0, max_value=1.5, value=0.7, step=0.05)
    max_tokens = st.slider(label="Max tokens", min_value=64, max_value=4096, value=512, step=32)
    top_p = st.slider(label="Top-p", min_value=0.0, max_value=1.0, value=0.95, step=0.01)
    system_prompt = st.text_area(
        label="System prompt",
        value="You are a helpful, concise assistant.",
        height=100,
    )
    trust_remote = st.checkbox(
        label="Trust remote tokenizer code (needed for some models, e.g., Qwen)",
        value=False,
    )
# ---------------------- Model Loading (cached) ----------------------
@st.cache_resource(show_spinner=True)
def load_model(repo: str, trust_remote_code: bool):
    """Load the MLX model and tokenizer once and cache them across reruns."""
    cfg = {"trust_remote_code": bool(trust_remote_code)}
    return load(repo, tokenizer_config=cfg)


with st.spinner("Loading model…"):
    model, tokenizer = load_model(model_repo, trust_remote)
# ---------------------- Chat State Helpers ----------------------
def ensure_history_initialized(current_system_prompt: str):
    """Create the chat history if needed and keep the latest system prompt first.

    The system prompt is editable in the sidebar on every rerun, so the first
    history entry (index 0) is refreshed each time rather than set only once.
    """
    messages = st.session_state.get("messages")
    # Guard against a missing OR empty list: the previous version indexed
    # messages[0] unconditionally, which raises IndexError on an empty list.
    if not messages:
        st.session_state.messages = [
            {"role": "system", "content": current_system_prompt}
        ]
    else:
        # Always reflect the latest system prompt in the first message.
        messages[0] = {"role": "system", "content": current_system_prompt}


ensure_history_initialized(system_prompt)
st.title("MLX Local Chat")

# Replay the conversation so far; index 0 is the system prompt and stays hidden.
history = st.session_state.messages
for past in history[1:]:
    with st.chat_message(past["role"]):
        st.markdown(past["content"])
# ---------------------- Compatibility Streaming Wrapper ----------------------
def _stream_generate_compat(model, tokenizer, prompt, max_tokens, temperature, top_p):
    """Stream tokens from mlx_lm.stream_generate across library versions.

    Different mlx-lm releases name their sampling kwargs differently
    (temperature vs temp, max_tokens vs max_new_tokens, optional top_p),
    so we inspect the installed signature and pass only what it accepts.
    """
    try:
        accepted = set(inspect.signature(stream_generate).parameters)
    except Exception:
        accepted = set()

    kwargs = {}
    # Token-budget keyword: prefer the modern name, fall back to the older one.
    for name in ("max_tokens", "max_new_tokens"):
        if name in accepted:
            kwargs[name] = max_tokens
            break
    # Temperature keyword: same priority dance.
    for name in ("temperature", "temp"):
        if name in accepted:
            kwargs[name] = temperature
            break
    # top_p is only present in some versions.
    if "top_p" in accepted:
        kwargs["top_p"] = top_p

    # Sensible fallback if the signature could not be inspected at all.
    if not kwargs:
        kwargs = {"max_tokens": max_tokens, "temperature": temperature, "top_p": top_p}

    yield from stream_generate(model, tokenizer, prompt, **kwargs)
def stream_reply(messages):
    """Yield the assistant's reply text chunk by chunk (for st.write_stream)."""
    # Prefer the tokenizer's chat template; fall back to a plain transcript.
    try:
        prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    except Exception:
        sys_msg = ""
        for m in messages:
            if m["role"] == "system":
                sys_msg = m["content"]
                break
        lines = [
            f"{m['role'].upper()}: {m['content']}"
            for m in messages
            if m["role"] != "system"
        ]
        convo = "\n".join(lines)
        if sys_msg:
            prompt = f"System: {sys_msg}\n{convo}\nASSISTANT:"
        else:
            prompt = f"{convo}\nASSISTANT:"

    stream = _stream_generate_compat(
        model, tokenizer, prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    for resp in stream:
        # Newer mlx-lm yields objects with .text; older versions yield strings.
        text = getattr(resp, "text", None)
        yield str(resp) if text is None else text
# ---------------------- UI Actions: Regenerate & Clear ----------------------
def do_assistant_turn():
    """Stream one assistant reply into the UI and record it in chat history."""
    with st.chat_message("assistant"):
        reply_text = st.write_stream(stream_reply(st.session_state.messages))
    st.session_state.messages.append({"role": "assistant", "content": reply_text})
col_regen, col_clear = st.columns(2)
# Column objects expose the same element API as st, so render directly on them.
regen_clicked = col_regen.button("🔁 Regenerate response", use_container_width=True)
clear_clicked = col_clear.button("🗑️ Clear chat", use_container_width=True)

# Clear chat: drop everything except a fresh system message.
if clear_clicked:
    st.session_state.messages = [{"role": "system", "content": system_prompt}]
    st.rerun()

# Regenerate: drop the last assistant reply (if any) and answer again.
if regen_clicked:
    if any(m["role"] == "user" for m in st.session_state.messages):
        if st.session_state.messages[-1]["role"] == "assistant":
            st.session_state.messages.pop()
        do_assistant_turn()
        st.rerun()
    else:
        st.warning("Nothing to regenerate yet — send a message first.")
# ---------------------- Chat Input ----------------------
# On submit: record the user's message, then stream the model's answer.
if user_input := st.chat_input("Ask me anything…"):
    st.session_state.messages.append({"role": "user", "content": user_input})
    do_assistant_turn()
▶️ Step 3: Run the WebUI
In your terminal, make sure you’re still in the virtual environment and run:
streamlit run streamlit_app.py
Your browser should open automatically to http://localhost:8501.
You’ll now see a clean chat interface — type a message and your MLX-powered model will respond right from your Mac!

Step 4: Stop and Deactivate
When you’re done chatting:
- Stop the Streamlit app in Terminal with Ctrl + C
- Deactivate the virtual environment:
deactivate
🧾 Summary
| Step | Task | Command |
|---|---|---|
| 1 | Activate environment | source .venv/bin/activate |
| 2 | Install dependencies | pip install streamlit mlx torch torchvision sentencepiece |
| 3 | Create the Streamlit app | streamlit_app.py |
| 4 | Run the app | streamlit run streamlit_app.py |
| 5 | Open browser | http://localhost:8501 |