!pip install -q unsloth transformers datasets trl accelerate peft bitsandbytes sentencepiece

Fine-tuning an LLM with Unsloth and Serving with Ollama
A hands-on guide to fine-tuning and deploying LLMs efficiently
Article: Fine-tuning LLM with Unsloth and serving it with Ollama
Table of Contents
1. Setup & Installation
2. Load Model
Load a pre-trained model with 4-bit quantization using Unsloth for memory-efficient fine-tuning.
from unsloth import FastLanguageModel
max_seq_length = 2048
dtype = None # Auto-detect
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Qwen2.5-0.5B-Instruct",
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
)
print(f"Model loaded: {model.config._name_or_path}")
print(f"Parameters: {model.num_parameters():,}")3. Add LoRA Adapters
Apply LoRA (Low-Rank Adaptation) to efficiently fine-tune only a small subset of parameters.
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=[
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"
],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
)
# Print trainable parameters
model.print_trainable_parameters()4. Load & Format Dataset
Load the Alpaca dataset and format it for instruction fine-tuning using the chat template.
from datasets import load_dataset
# Load a subset of the Alpaca dataset
# Only the first 200 rows -- enough for a quick fine-tuning demo run.
dataset = load_dataset("yahma/alpaca-cleaned", split="train[:200]")
print(f"Dataset size: {len(dataset)}")
print(f"Sample: {dataset[0]}")

def format_example(example):
    """Format one Alpaca record into chat-template text.

    Args:
        example: A dict with "instruction" and "output" keys and an
            optional "input" key (Alpaca schema).

    Returns:
        A dict with a single "text" key holding the rendered chat
        transcript (user turn + assistant turn).
    """
    instruction = example["instruction"]
    input_text = example.get("input", "")
    output = example["output"]
    # Fold the optional input field into the user turn.
    if input_text:
        user_content = f"{instruction}\n\nInput: {input_text}"
    else:
        user_content = instruction
    messages = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": output}
    ]
    # Render with the model's chat template. The assistant turn is kept as
    # the training target, so no trailing generation prompt is added.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )
    return {"text": text}
# Apply formatting
dataset = dataset.map(format_example)
print(f"\nFormatted example:\n{dataset[0]['text'][:500]}")5. Fine-tuning
Train the model using SFTTrainer (Supervised Fine-Tuning) from the TRL library.
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset,
args=SFTConfig(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_steps=5,
max_steps=60,
learning_rate=2e-4,
fp16=True,
logging_steps=10,
output_dir="outputs",
seed=3407,
dataset_text_field="text",
max_seq_length=max_seq_length,
),
)
# Train
trainer_stats = trainer.train()
print(f"\nTraining completed!")
print(f"Training loss: {trainer_stats.training_loss:.4f}")
print(f"Training time: {trainer_stats.metrics['train_runtime']:.1f}s")6. Test Model
Run inference with the fine-tuned model to verify it works correctly.
# Switch to inference mode
FastLanguageModel.for_inference(model)
# Prepare test input
messages = [
{"role": "user", "content": "Explain the concept of transfer learning in simple terms."}
]
inputs = tokenizer.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_tensors="pt"
).to(model.device)
# Generate response
outputs = model.generate(
input_ids=inputs,
max_new_tokens=256,
temperature=0.7,
top_p=0.9,
do_sample=True,
)
response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
print(f"Response:\n{response}")7. Save & Export GGUF
Save the fine-tuned model and export it to GGUF format for use with Ollama.
# Save LoRA adapters
model.save_pretrained("finetuned_model")
tokenizer.save_pretrained("finetuned_model")
print("LoRA adapters saved to 'finetuned_model/'")
# Export to GGUF format (q4_k_m quantization)
model.save_pretrained_gguf(
"finetuned_model_gguf",
tokenizer,
quantization_method="q4_k_m"
)
print("GGUF model exported to 'finetuned_model_gguf/'")8. Run with Ollama
Create a Modelfile and use Ollama to serve the fine-tuned model locally.
Step 1: Create a Modelfile
FROM ./finetuned_model_gguf/unsloth.Q4_K_M.gguf
TEMPLATE """{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}<|im_start|>user
{{ .Prompt }}<|im_end|>
<|im_start|>assistant
"""
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER stop "<|im_end|>"

Step 2: Create and run the model
# Create the model in Ollama
ollama create my-finetuned-model -f Modelfile
# Run the model interactively
ollama run my-finetuned-model
# Or start the server
ollama serve

9. API Usage
Once the model is served via Ollama, you can interact with it through the API.
import requests
import json
def query_ollama(prompt: str, model: str = "my-finetuned-model",
                 timeout: float = 120.0) -> str:
    """Query the Ollama API with the fine-tuned model.

    Args:
        prompt: User prompt sent to the model.
        model: Name of the model registered in Ollama.
        timeout: Seconds to wait for the HTTP response before raising.

    Returns:
        The model's generated text.

    Raises:
        requests.HTTPError: If the server returns a non-2xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model,
            "prompt": prompt,
            "stream": False,  # one complete JSON object instead of a stream
            "options": {
                "temperature": 0.7,
                "top_p": 0.9
            }
        },
        # Without a timeout, requests waits forever on an unresponsive server.
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["response"]
# Test the API
# NOTE: requires `ollama serve` to be running locally with the model created above.
result = query_ollama("What is the difference between supervised and unsupervised learning?")
print(f"Response:\n{result}")