#!/usr/bin/env python3
"""Interactive chat with base Gemma3-4B-IT (no LEM training).

Loads the MLX weights found at ``MODEL_PATH`` and runs a terminal chat loop.
The full conversation is kept in memory and re-rendered through the
tokenizer's chat template on every turn, so each reply sees earlier turns.

Controls:
    /clear                      forget the conversation so far
    Ctrl-D or Ctrl-C at prompt  end the session
    Ctrl-C during generation    abandon the reply and return to the prompt
"""
import sys

import mlx.core as mx
from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler

# Flush stdout on every newline.
sys.stdout.reconfigure(line_buffering=True)

MODEL_PATH = '/Volumes/Data/lem/gemma-3-4b-it-mlx'
MEMORY_LIMIT_BYTES = 24 * 1024**3  # 24 GiB
CACHE_LIMIT_BYTES = 8 * 1024**3    # 8 GiB
TEMPERATURE = 0.7
MAX_TOKENS = 512

# Top-level memory API, matching the mx.clear_cache() call used below; the
# mx.metal.* spellings of these two functions are deprecated aliases.
mx.set_memory_limit(MEMORY_LIMIT_BYTES)
mx.set_cache_limit(CACHE_LIMIT_BYTES)

print('Loading Gemma3-4B-IT (base)...')
model, tokenizer = load(MODEL_PATH)
model.eval()
print('Ready.\n')

sampler = make_sampler(temp=TEMPERATURE)

# Alternating {'role': ..., 'content': ...} messages, oldest first.
history = []

while True:
    try:
        user_input = input('You: ').strip()
    except (EOFError, KeyboardInterrupt):
        print('\nBye.')
        break

    if not user_input:
        continue

    if user_input.lower() == '/clear':
        history = []
        print('History cleared.\n')
        continue

    history.append({'role': 'user', 'content': user_input})
    prompt_text = tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,
    )

    try:
        response = generate(
            model,
            tokenizer,
            prompt=prompt_text,
            max_tokens=MAX_TOKENS,
            sampler=sampler,
        )
    except KeyboardInterrupt:
        # Drop the unanswered user turn so the next prompt does not contain
        # two consecutive user messages.
        history.pop()
        print('\n(Generation interrupted.)\n')
        continue
    finally:
        # Release cached buffers after every generation attempt.
        mx.clear_cache()

    history.append({'role': 'assistant', 'content': response})
    print(f'\nGemma: {response}\n')