Fine-tuning an LLM with Unsloth and Serving with Ollama

A hands-on guide to fine-tuning and deploying LLMs efficiently

Open In Colab

Article: Fine-tuning LLM with Unsloth and serving it with Ollama

Table of Contents

  1. Setup & Installation
  2. Load Model
  3. Add LoRA Adapters
  4. Load & Format Dataset
  5. Fine-tuning
  6. Test Model
  7. Save & Export GGUF
  8. Run with Ollama
  9. API Usage

1. Setup & Installation

!pip install -q unsloth transformers datasets trl accelerate peft bitsandbytes sentencepiece

2. Load Model

Load a pre-trained model with 4-bit quantization using Unsloth for memory-efficient fine-tuning.

from unsloth import FastLanguageModel

# Maximum context length; shared with the trainer config later in the guide.
max_seq_length = 2048
dtype = None  # Auto-detect (None lets Unsloth pick the dtype for the GPU)
load_in_4bit = True  # Quantize base weights to 4-bit to reduce VRAM usage

# Download the base model and tokenizer, wrapped with Unsloth's optimized
# training/inference kernels.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-0.5B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print(f"Model loaded: {model.config._name_or_path}")
print(f"Parameters: {model.num_parameters():,}")

3. Add LoRA Adapters

Apply LoRA (Low-Rank Adaptation) to efficiently fine-tune only a small subset of parameters.

# Attach LoRA adapters: only the small low-rank adapter matrices are trained,
# while the (4-bit quantized) base weights stay frozen.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank — dimensionality of the low-rank update matrices
    target_modules=[
        # All attention projections plus the MLP projections of each block.
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha=16,  # scaling factor; alpha/r = 1 here
    lora_dropout=0,
    bias="none",  # bias terms are not trained
    use_gradient_checkpointing="unsloth",  # Unsloth's memory-saving variant
    random_state=3407,
)

# Print trainable parameters (should be a small fraction of the total)
model.print_trainable_parameters()

4. Load & Format Dataset

Load the Alpaca dataset and format it for instruction fine-tuning using the chat template.

from datasets import load_dataset

# Load a subset of the Alpaca dataset.
# "train[:200]" slices just the first 200 rows to keep the demo fast.
dataset = load_dataset("yahma/alpaca-cleaned", split="train[:200]")
print(f"Dataset size: {len(dataset)}")
print(f"Sample: {dataset[0]}")
def format_example(example):
    """Convert one Alpaca record into a chat-templated training string.

    Builds a two-turn conversation (user instruction, assistant output)
    and renders it with the model's chat template.

    Args:
        example: A dataset row with "instruction", "output", and an
            optional "input" field.

    Returns:
        A dict with a single "text" key holding the rendered string.
    """
    prompt = example["instruction"]
    extra = example.get("input", "")

    # Fold any supplementary input into the user turn.
    user_content = f"{prompt}\n\nInput: {extra}" if extra else prompt

    conversation = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": example["output"]},
    ]

    # No generation prompt: both turns are complete in the training text.
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )

    return {"text": rendered}

# Apply formatting: map() adds a "text" column holding the chat-templated
# string for each row, which SFTTrainer consumes below.
dataset = dataset.map(format_example)
print(f"\nFormatted example:\n{dataset[0]['text'][:500]}")

5. Fine-tuning

Train the model using SFTTrainer (Supervised Fine-Tuning) from the TRL library.

from trl import SFTTrainer, SFTConfig

# Supervised fine-tuning over the formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # effective batch size = 2 * 4 = 8
        warmup_steps=5,
        max_steps=60,  # short demo run; raise (or use epochs) for real training
        learning_rate=2e-4,
        fp16=True,  # NOTE(review): assumes an fp16-capable GPU; bf16 may be preferable on newer hardware — confirm
        logging_steps=10,
        output_dir="outputs",
        seed=3407,
        dataset_text_field="text",  # column produced by format_example
        max_seq_length=max_seq_length,
    ),
)

# Train and capture the stats object for the summary below.
trainer_stats = trainer.train()
print(f"\nTraining completed!")
print(f"Training loss: {trainer_stats.training_loss:.4f}")
print(f"Training time: {trainer_stats.metrics['train_runtime']:.1f}s")

6. Test Model

Run inference with the fine-tuned model to verify it works correctly.

# Switch to inference mode (enables Unsloth's faster generation path).
FastLanguageModel.for_inference(model)

# Prepare test input
messages = [
    {"role": "user", "content": "Explain the concept of transfer learning in simple terms."}
]

# Tokenize with the generation prompt appended so the model continues
# as the assistant; move the tensor to the model's device.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# Generate response
outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,  # sampling (not greedy) so temperature/top_p take effect
)

# Slice off the prompt tokens and decode only the newly generated ones.
response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
print(f"Response:\n{response}")

7. Save & Export GGUF

Save the fine-tuned model and export it to GGUF format for use with Ollama.

# Save LoRA adapters — only the small adapter weights, not the base model.
model.save_pretrained("finetuned_model")
tokenizer.save_pretrained("finetuned_model")
print("LoRA adapters saved to 'finetuned_model/'")

# Export to GGUF format (q4_k_m quantization).
# Produces a llama.cpp-compatible file that Ollama can load directly
# (the FROM line of the Modelfile below points at it).
model.save_pretrained_gguf(
    "finetuned_model_gguf",
    tokenizer,
    quantization_method="q4_k_m"
)
print("GGUF model exported to 'finetuned_model_gguf/'")

8. Run with Ollama

Create a Modelfile and use Ollama to serve the fine-tuned model locally.

Step 1: Create a Modelfile

# Point Ollama at the GGUF file exported in the previous section.
FROM ./finetuned_model_gguf/unsloth.Q4_K_M.gguf

# ChatML-style prompt template using <|im_start|>/<|im_end|> turn markers
# (matches the Qwen2.5 chat format used during fine-tuning).
TEMPLATE """{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}<|im_start|>user
{{ .Prompt }}<|im_end|>
<|im_start|>assistant
"""

# Sampling defaults matching the Python test above.
PARAMETER temperature 0.7
PARAMETER top_p 0.9
# Stop generating when the model emits the end-of-turn marker.
PARAMETER stop "<|im_end|>"

Step 2: Create and run the model

# Create the model in Ollama (registers it under the given name)
ollama create my-finetuned-model -f Modelfile

# Run the model interactively in the terminal
ollama run my-finetuned-model

# Or start the server
# Serves the HTTP API on http://localhost:11434, used in the next section.
ollama serve

9. API Usage

Once the model is served via Ollama, you can interact with it through the API.

import requests
import json

def query_ollama(prompt: str, model: str = "my-finetuned-model", timeout: float = 120.0) -> str:
    """Query the Ollama API with the fine-tuned model.

    Sends a single non-streaming generation request to the local
    Ollama server and returns the full generated text.

    Args:
        prompt: The user prompt to send to the model.
        model: Name of the Ollama model created via ``ollama create``.
        timeout: Seconds to wait for the HTTP response. Without an
            explicit timeout, ``requests.post`` can block forever if the
            server hangs or is unreachable.

    Returns:
        The model's generated text.

    Raises:
        requests.HTTPError: If the server returns an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model,
            "prompt": prompt,
            "stream": False,  # ask for one complete JSON payload, not chunks
            "options": {
                "temperature": 0.7,
                "top_p": 0.9
            }
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["response"]

# Test the API
# Requires the Ollama server to be running locally (`ollama serve`).
result = query_ollama("What is the difference between supervised and unsupervised learning?")
print(f"Response:\n{result}")