/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef mozilla_llama_backend_h
#define mozilla_llama_backend_h
#include <functional>
#include "llama/llama.h"
#include "mozilla/dom/LlamaRunnerBinding.h"
#include "mozilla/Result.h"
#include "mozilla/UniquePtr.h"
namespace mozilla::llama {
struct Error {
nsCString mMessage;
};
using ChatMessageResult = mozilla::Result<nsCString, Error>;
using ResultStatus = mozilla::Result<mozilla::Ok, Error>;
using LlamaChatResponse = mozilla::dom::LlamaChatResponse;
using LlamaChatPhase = mozilla::dom::LlamaChatPhase;
using LlamaModelOptions = mozilla::dom::LlamaModelOptions;
using LlamaKVCacheDtype = mozilla::dom::LlamaKVCacheDtype;
using LlamaChatOptions = mozilla::dom::LlamaChatOptions;
using LlamaSamplerType = mozilla::dom::LlamaSamplerType;
using LlamaContextOptions = mozilla::dom::LlamaContextOptions;
using LlamaSamplerConfig = mozilla::dom::LlamaSamplerConfig;
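// Maps a WebIDL KV cache dtype to the corresponding ggml storage type
// (for example, an f16 dtype would map to GGML_TYPE_F16).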
ggml_type GgmlTypeFromKVCacheDtype(LlamaKVCacheDtype aDtype);
// LlamaBackend is a low-level, internal interface to the llama.cpp engine.
// It encapsulates model loading, prompt formatting, context setup, and
// token-by-token generation with streaming callbacks.
//
// This class is **not** exposed to JS or WebIDL — it's intended for internal
// orchestration only, typically via LlamaRunner or LlamaGenerateTask.
//
// Usage Pattern (see the example sketch below):
// 1. Construct the class using the default constructor.
// 2. Call `Reinitialize(...)` to load a model.
// 3. (Optional) Use `FormatChat(...)` to build a prompt from chat messages.
// 4. Call `Generate(...)` to start streaming token output.
// 5. To update **context-only** settings (not the model), call
// `ReinitializeContext(...)`.
// 6. To update both the model and context, call `Reinitialize(...)`.
//
// At the moment, this class is not thread-safe: it holds onto context state
// (`mCtx`, `mThreadpool`, `mThreadpoolBatch`) across calls instead of
// re-initializing it at each generation.
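//
// Illustrative sketch of the pattern above (hypothetical call site; the
// option values, file handle, and error handling are assumptions, not part
// of this header's contract):
//
//   RefPtr<LlamaBackend> backend = new LlamaBackend();
//   LlamaModelOptions modelOptions;  // populated by the caller
//   FILE* modelFile = nullptr;       // open handle to the model data
//   if (backend->Reinitialize(modelOptions, modelFile).isErr()) {
//     // handle the load failure
//   }
//   mozilla::dom::LlamaFormatChatOptions formatOptions;  // chat messages
//   ChatMessageResult prompt = backend->FormatChat(formatOptions);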
class LlamaBackend {
public:
NS_INLINE_DECL_THREADSAFE_REFCOUNTING(LlamaBackend)
// Default constructor. Call Reinitialize before use.
LlamaBackend() = default;
// Reinitializes the entire backend (model + context).
// Use this if you need to load a different model.
ResultStatus Reinitialize(const LlamaModelOptions& aOptions, FILE* aFp);
// Converts structured chat messages into a flat prompt string.
// Useful for models expecting a plain-text prompt.
ChatMessageResult FormatChat(
const mozilla::dom::LlamaFormatChatOptions& aOptions);
// Generates a sequence of tokens using the current model/context.
// Calls `aTokenCallback` with each token; supports early termination via
// `aCancelCallback`. Generation has a prompt phase followed by a generation
// phase. Each message sent to `aTokenCallback` carries an identifier for the
// current phase along with a boolean indicating whether that phase is
// complete. Note that the callbacks are called synchronously. `Reinitialize`
// must be called at least once before calling this function.
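//
// Illustrative callback wiring (a sketch; the response handling and the
// cancellation flag are assumptions, not prescribed by this header):
//
//   backend->Generate(
//       chatOptions,
//       [&](const LlamaChatResponse& aResponse) -> ResultStatus {
//         // Forward the streamed piece; aResponse identifies the phase and
//         // whether that phase is complete.
//         return mozilla::Ok();
//       },
//       [&]() { return cancelRequested; });  // checked during generation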
ResultStatus Generate(
const LlamaChatOptions& aOptions,
std::function<ResultStatus(const LlamaChatResponse&)> aTokenCallback,
std::function<bool()> aCancelCallback);
// Reinitializes the context only (same model).
// Use this to change generation parameters like context size, temperature,
// etc.
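//
// For instance (sketch; the context size value is an arbitrary assumption):
//
//   LlamaContextOptions ctxOptions;  // updated generation parameters
//   if (backend->ReinitializeContext(ctxOptions, /* aNumContext */ 4096)
//           .isErr()) {
//     // handle the failure
//   }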
ResultStatus ReinitializeContext(const LlamaContextOptions& aOptions,
int aNumContext);
// Custom deleters for managing llama.cpp and ggml resources
struct GgmlThreadpoolDeleter {
void operator()(ggml_threadpool* aTp) const { ggml_threadpool_free(aTp); }
};
struct LlamaModelDeleter {
void operator()(llama_model* aModel) const { llama_model_free(aModel); }
};
struct LlamaContextDeleter {
void operator()(llama_context* aCtx) const { llama_free(aCtx); }
};
struct LlamaSamplerDeleter {
void operator()(llama_sampler* aSmpl) const { llama_sampler_free(aSmpl); }
};
// Smart pointer types for safe resource cleanup
using GgmlThreadpoolUPtr =
mozilla::UniquePtr<ggml_threadpool, GgmlThreadpoolDeleter>;
using LlamaModelUPtr = mozilla::UniquePtr<llama_model, LlamaModelDeleter>;
using LlamaContextUPtr =
mozilla::UniquePtr<llama_context, LlamaContextDeleter>;
using LlamaSamplerUPtr =
mozilla::UniquePtr<llama_sampler, LlamaSamplerDeleter>;
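// For example, a freshly loaded model can be owned directly (sketch;
// assumes llama.cpp's llama_model_load_from_file API):
//
//   LlamaModelUPtr model(llama_model_load_from_file(
//       "/path/to/model.gguf", llama_model_default_params()));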
// Sampler result alias.
using SamplerResult = mozilla::Result<LlamaSamplerUPtr, Error>;
// Protected: refcounted objects must be destroyed via Release(), not deleted
// directly.
protected:
~LlamaBackend();
private:
SamplerResult InitializeSampler(
const mozilla::dom::Sequence<LlamaSamplerConfig>& aSamplers);
// Holds the model data. Initialized once & reused across generation sessions.
LlamaModelUPtr mModel;
// Generation context. Initialized once & reused across generation sessions.
// However, it is automatically re-initialized if the configured context
// length is smaller than what a generation session requires.
LlamaContextUPtr mCtx;
// Threadpool used for processing single tokens, usually during decoding.
// Initialized once & reused across generation sessions.
GgmlThreadpoolUPtr mThreadpool;
// Used for processing multiple tokens at a time during prompt processing.
// Initialized once & reused across generation sessions. This reuses
// mThreadpool if the configured number of threads is identical.
GgmlThreadpoolUPtr mThreadpoolBatch;
LlamaModelOptions mModelOptions;
// Cached model name/info string for logging (e.g. "Smollm2 360M 8k ...")
nsCString mModelGeneralName;
};
// Alias for the refcounted pointer type
using LlamaBackendSPtr = RefPtr<LlamaBackend>;
} // namespace mozilla::llama
#endif  // mozilla_llama_backend_h