Skip to content

Optimized-native Inference API

These helpers expose the low-level optimized-native path directly. For new high-level application code, prefer RuntimeClient.

Download only runtime-critical Hugging Face artifacts into a local model directory.

Source code in src/ollm/runtime/materialization.py
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def download_hf_snapshot(
    repo_id: str,
    model_dir: str,
    force_download: bool = False,
    revision: str | None = None,
) -> None:
    """Download only runtime-critical Hugging Face artifacts into a local model directory.

    Args:
        repo_id: Hugging Face repository identifier to fetch from.
        model_dir: Local directory that will hold the runtime artifacts.
        force_download: Whether to re-fetch files even when already present.
        revision: Optional repository revision (branch, tag, or commit).

    Raises:
        ManagedModelDownloadError: When required runtime artifacts are still
            missing after the snapshot download finishes or fails.
    """
    destination = Path(model_dir).expanduser().resolve()
    destination.mkdir(parents=True, exist_ok=True)
    # Drop non-runtime leftovers before fetching so gap detection is accurate.
    prune_hf_runtime_artifacts(destination)
    LOGGER.info("Downloading runtime artifacts for %s.", repo_id)
    fetch = cast(_SnapshotDownloadCallable, snapshot_download)
    try:
        fetch(
            repo_id=repo_id,
            local_dir=str(destination),
            local_dir_use_symlinks=False,
            force_download=force_download,
            revision=revision,
            allow_patterns=list(HF_RUNTIME_ARTIFACT_PATTERNS),
        )
    except Exception as err:
        # A failed call may still have left the directory complete (e.g. a
        # transient error after every required file landed), so re-check.
        prune_hf_runtime_artifacts(destination)
        gaps = _runtime_artifact_gaps(destination)
        if gaps:
            raise ManagedModelDownloadError(repo_id, destination, gaps) from err
        LOGGER.warning(
            "Hugging Face download for %s raised %s after materialization completed.",
            repo_id,
            type(err).__name__,
        )
        return
    prune_hf_runtime_artifacts(destination)
    remaining = _runtime_artifact_gaps(destination)
    if remaining:
        raise ManagedModelDownloadError(repo_id, destination, remaining)

Direct optimized-native helper for built-in aliases.

This class is the low-level optimized-native entry point. It is best suited for direct model control in scripts that intentionally want to bypass the higher-level RuntimeClient surface.

Parameters:

Name Type Description Default
model_id str

Built-in optimized alias to load.

required
device str

Target device string such as "cuda:0" or "mps".

'cuda:0'
logging bool

Whether to collect runtime stats.

True
multimodality bool

Whether the runtime should plan for multimodal execution.

False
specialization_registry SpecializationRegistry | None

Optional specialization registry override.

None
resolver ModelResolver | None

Optional model resolver override.

None
Source code in src/ollm/inference.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
class Inference:
    """Direct optimized-native helper for built-in aliases.

    This class is the low-level optimized-native entry point. It is best suited
    for direct model control in scripts that intentionally want to bypass the
    higher-level ``RuntimeClient`` surface.

    Args:
        model_id (str): Built-in optimized alias to load.
        device (str): Target device string such as ``"cuda:0"`` or ``"mps"``.
        logging (bool): Whether to collect runtime stats.
        multimodality (bool): Whether the runtime should plan for multimodal
            execution.
        specialization_registry (SpecializationRegistry | None): Optional
            specialization registry override.
        resolver (ModelResolver | None): Optional model resolver override.
    """

    def __init__(
        self,
        model_id: str,
        device: str = "cuda:0",
        logging: bool = True,
        multimodality: bool = False,
        specialization_registry: SpecializationRegistry | None = None,
        resolver: ModelResolver | None = None,
    ):
        # model_id doubles as the raw model reference and the optimized alias
        # here; subclasses (e.g. AutoInference) overwrite the latter two after
        # calling this initializer.
        self.model_id = model_id
        self.model_reference = model_id
        self.optimized_model_id = model_id
        self.device = torch.device(device)
        self.multimodality = multimodality
        # NOTE: the `logging` parameter shadows the stdlib module name; it only
        # toggles stats collection, it does not configure log output.
        self.stats = Stats() if logging else None
        self._specialization_registry = (
            build_default_specialization_registry()
            if specialization_registry is None
            else specialization_registry
        )
        self._resolver = resolver or ModelResolver()
        self._specialization_pipeline = SpecializationPipeline()
        # The following are populated by _load_optimized_model once a model is
        # actually loaded; until then offload/cache features are unavailable.
        self._cache_factory = None
        self._apply_cpu_offload = None
        self._apply_gpu_offload = None
        self.loaded_resolved_model = None
        self.loaded_specialization_provider_id = None
        self.loaded_applied_specialization_pass_ids = ()

    def hf_download(self, model_dir: str, force_download: bool = False) -> None:
        """Download the built-in optimized alias into a local directory.

        Args:
            model_dir (str): Target local model directory.
            force_download (bool): Whether to force a fresh snapshot download.

        Raises:
            ValueError: Raised when the current ``optimized_model_id`` is not a
                built-in optimized alias.
        """
        # Only catalogued aliases map to a known Hugging Face repo.
        entry = find_model_catalog_entry(self.optimized_model_id)
        if entry is None:
            raise ValueError(
                f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
            )
        download_hf_snapshot(entry.repo_id, model_dir, force_download=force_download)

    def ini_model(
        self, models_dir: str = "./models/", force_download: bool = False
    ) -> None:
        """Download if needed and then load the optimized-native runtime.

        Args:
            models_dir (str): Parent directory that will contain the managed
                model directory.
            force_download (bool): Whether to force a fresh snapshot download.

        Raises:
            ValueError: Raised when the current optimized alias is invalid or the
                local model directory cannot be prepared.
        """
        entry = find_model_catalog_entry(self.optimized_model_id)
        if entry is None:
            raise ValueError(
                f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
            )

        model_dir = Path(models_dir).expanduser().resolve() / entry.model_id
        if force_download or not model_dir.exists():
            self.hf_download(str(model_dir), force_download=force_download)
        prune_hf_runtime_artifacts(model_dir)
        # One repair attempt: if required artifacts are still missing after
        # pruning, re-download; hf_download raises when gaps remain after that.
        if not hf_runtime_artifacts_complete(model_dir):
            self.hf_download(str(model_dir), force_download=force_download)
            prune_hf_runtime_artifacts(model_dir)

        self.load_model(str(model_dir))

    def load_model(self, model_dir: str) -> None:
        """Load an optimized-native runtime from a local directory.

        Args:
            model_dir (str): Local model directory for the optimized alias.

        Raises:
            ValueError: Raised when the path does not exist or the current
                optimized alias is invalid.
        """
        model_path = Path(model_dir).expanduser().resolve()
        if not model_path.exists() or not model_path.is_dir():
            raise ValueError(f"Model directory does not exist: {model_path}")
        entry = find_model_catalog_entry(self.optimized_model_id)
        if entry is None:
            raise ValueError(
                f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
            )
        self._load_optimized_model(
            model_path,
            ModelSourceKind.BUILTIN,
            self.model_reference,
            catalog_entry=entry,
        )

    def _load_optimized_model(
        self,
        model_path: Path,
        source_kind: ModelSourceKind,
        raw_reference: str,
        catalog_entry: ModelCatalogEntry | None = None,
    ) -> None:
        # Core load pipeline shared by Inference and AutoInference:
        # resolve -> select specialization -> plan -> load artifacts -> apply.
        entry = catalog_entry
        if source_kind is ModelSourceKind.BUILTIN:
            # Built-in loads always re-validate the alias against the catalog,
            # even when a catalog_entry was passed in.
            entry = find_model_catalog_entry(self.optimized_model_id)
            if entry is None:
                raise ValueError(
                    f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
                )

        LOGGER.info("Loading optimized model from %s.", model_path)
        runtime_config = RuntimeConfig(
            model_reference=raw_reference,
            device=str(self.device),
            multimodal=self.multimodality,
            stats=self.stats is not None,
        )
        # Inspect the materialized directory; entry may be None for local-path
        # loads, in which case repo metadata is unavailable.
        resolved_model = self._resolver.inspect_materialized_model(
            ModelReference.parse(raw_reference),
            model_path,
            source_kind=source_kind,
            repo_id=None if entry is None else entry.repo_id,
            revision=None,
            catalog_entry=entry,
        )
        specialization_match = self._specialization_registry.select(
            resolved_model, runtime_config
        )
        if specialization_match is None:
            raise ValueError(
                f"No optimized specialization provider is available for {self.model_id!r} at {model_path}"
            )
        # Plan first, then load artifacts, then apply the planned passes to
        # the loaded artifacts.
        planned_specialization = self._specialization_pipeline.plan(
            resolved_model,
            runtime_config,
            specialization_match.provider_id,
        )
        artifacts = self._specialization_registry.load(
            specialization_match.provider_id,
            resolved_model,
            runtime_config,
            self.stats,
        )
        applied_specialization = apply_specialization(
            planned_specialization,
            artifacts,
            runtime_config,
        )
        # Record what was loaded/applied for introspection by callers.
        self.loaded_resolved_model = resolved_model
        self.loaded_specialization_provider_id = specialization_match.provider_id
        self.loaded_applied_specialization_pass_ids = (
            applied_specialization.applied_pass_ids
        )
        self.model = artifacts.model
        self.tokenizer = artifacts.tokenizer
        if artifacts.processor is None:
            # Drop a stale processor from a previous load when the new
            # artifacts do not provide one.
            if hasattr(self, "processor"):
                delattr(self, "processor")
        else:
            self.processor = artifacts.processor
        self._cache_factory = artifacts.create_cache
        self._apply_cpu_offload = artifacts.apply_cpu_offload
        self._apply_gpu_offload = artifacts.apply_gpu_offload

    def offload_layers_to_cpu(self, layers_num: int, policy: str = "prefix") -> None:
        """Apply CPU layer offload through the selected specialization.

        Args:
            layers_num (int): Number of layers to place on CPU.
            policy (str): Placement policy such as ``"prefix"``,
                ``"suffix"``, or ``"middle-band"``.

        Raises:
            ValueError: Raised when the model is not loaded or the selected
                specialization does not expose CPU offload support.
        """
        if self._apply_cpu_offload is None:
            raise ValueError(f"{self.model_id} does not support CPU layer offload")
        model = getattr(self, "model", None)
        if model is None:
            raise ValueError(f"{self.model_id} is not loaded")
        # Translate (count, policy) into concrete layer indices before
        # delegating to the specialization hook.
        placement = plan_cpu_offload_placement(
            requested_layers=layers_num,
            total_layers=require_hidden_layer_count(model),
            policy=policy,
        )
        self._apply_cpu_offload(placement.layer_indices)

    def offload_layers_to_gpu_cpu(
        self, gpu_layers_num: int = 0, cpu_layers_num: int = 0
    ) -> None:
        """Apply mixed GPU/CPU layer placement when the specialization exposes it.

        Args:
            gpu_layers_num (int): Number of layers to keep on the accelerator.
            cpu_layers_num (int): Number of layers to move to CPU.

        Raises:
            ValueError: Raised when the specialization does not expose mixed
                placement support.
        """
        # No-op when nothing is requested, so this never raises for defaults.
        if gpu_layers_num == 0 and cpu_layers_num == 0:
            return
        if self._apply_gpu_offload is None:
            raise ValueError(f"{self.model_id} does not support GPU layer offload")
        self._apply_gpu_offload(gpu_layers_num, cpu_layers_num)

    def DiskCache(
        self,
        cache_dir: str = "./kvcache",
        cache_strategy: str | None = None,
        cache_lifecycle: str | None = None,
        cache_window_tokens: int | None = None,
    ) -> object | None:
        """Create the specialization-backed KV cache when supported.

        Args:
            cache_dir (str): Base cache directory for the cache instance.
            cache_strategy (str | None): Optional explicit KV strategy override.
            cache_lifecycle (str | None): Optional cache lifecycle override.
            cache_window_tokens (int | None): Optional sliding-window token
                budget override.

        Returns:
            object | None: Specialization-backed cache object, or ``None`` when
            the loaded specialization does not expose a cache factory.
        """
        # NOTE(review): non-PEP8 CamelCase method name — presumably kept for
        # public-API compatibility; confirm before renaming.
        if self._cache_factory is None:
            return None
        return self._cache_factory(
            Path(cache_dir).expanduser().resolve(),
            cache_strategy,
            cache_lifecycle,
            cache_window_tokens,
        )

hf_download

hf_download(
    model_dir: str, force_download: bool = False
) -> None

Download the built-in optimized alias into a local directory.

Parameters:

Name Type Description Default
model_dir str

Target local model directory.

required
force_download bool

Whether to force a fresh snapshot download.

False

Raises:

Type Description
ValueError

Raised when the current optimized_model_id is not a built-in optimized alias.

Source code in src/ollm/inference.py
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def hf_download(self, model_dir: str, force_download: bool = False) -> None:
    """Download the built-in optimized alias into a local directory.

    Args:
        model_dir (str): Target local model directory.
        force_download (bool): Whether to force a fresh snapshot download.

    Raises:
        ValueError: Raised when the current ``optimized_model_id`` is not a
            built-in optimized alias.
    """
    # Only catalogued aliases map to a known Hugging Face repo.
    entry = find_model_catalog_entry(self.optimized_model_id)
    if entry is None:
        raise ValueError(
            f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
        )
    download_hf_snapshot(entry.repo_id, model_dir, force_download=force_download)

ini_model

ini_model(
    models_dir: str = "./models/",
    force_download: bool = False,
) -> None

Download if needed and then load the optimized-native runtime.

Parameters:

Name Type Description Default
models_dir str

Parent directory that will contain the managed model directory.

'./models/'
force_download bool

Whether to force a fresh snapshot download.

False

Raises:

Type Description
ValueError

Raised when the current optimized alias is invalid or the local model directory cannot be prepared.

Source code in src/ollm/inference.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def ini_model(
    self, models_dir: str = "./models/", force_download: bool = False
) -> None:
    """Download if needed and then load the optimized-native runtime.

    Args:
        models_dir (str): Parent directory that will contain the managed
            model directory.
        force_download (bool): Whether to force a fresh snapshot download.

    Raises:
        ValueError: Raised when the current optimized alias is invalid or the
            local model directory cannot be prepared.
    """
    entry = find_model_catalog_entry(self.optimized_model_id)
    if entry is None:
        raise ValueError(
            f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
        )

    model_dir = Path(models_dir).expanduser().resolve() / entry.model_id
    if force_download or not model_dir.exists():
        self.hf_download(str(model_dir), force_download=force_download)
    prune_hf_runtime_artifacts(model_dir)
    # One repair attempt: re-download if required artifacts are still missing.
    if not hf_runtime_artifacts_complete(model_dir):
        self.hf_download(str(model_dir), force_download=force_download)
        prune_hf_runtime_artifacts(model_dir)

    self.load_model(str(model_dir))

load_model

load_model(model_dir: str) -> None

Load an optimized-native runtime from a local directory.

Parameters:

Name Type Description Default
model_dir str

Local model directory for the optimized alias.

required

Raises:

Type Description
ValueError

Raised when the path does not exist or the current optimized alias is invalid.

Source code in src/ollm/inference.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def load_model(self, model_dir: str) -> None:
    """Load an optimized-native runtime from a local directory.

    Args:
        model_dir (str): Local model directory for the optimized alias.

    Raises:
        ValueError: Raised when the path does not exist or the current
            optimized alias is invalid.
    """
    model_path = Path(model_dir).expanduser().resolve()
    if not model_path.exists() or not model_path.is_dir():
        raise ValueError(f"Model directory does not exist: {model_path}")
    # Re-validate the alias before delegating to the shared load pipeline.
    entry = find_model_catalog_entry(self.optimized_model_id)
    if entry is None:
        raise ValueError(
            f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
        )
    self._load_optimized_model(
        model_path,
        ModelSourceKind.BUILTIN,
        self.model_reference,
        catalog_entry=entry,
    )

offload_layers_to_cpu

offload_layers_to_cpu(
    layers_num: int, policy: str = "prefix"
) -> None

Apply CPU layer offload through the selected specialization.

Parameters:

Name Type Description Default
layers_num int

Number of layers to place on CPU.

required
policy str

Placement policy such as "prefix", "suffix", or "middle-band".

'prefix'

Raises:

Type Description
ValueError

Raised when the model is not loaded or the selected specialization does not expose CPU offload support.

Source code in src/ollm/inference.py
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
def offload_layers_to_cpu(self, layers_num: int, policy: str = "prefix") -> None:
    """Apply CPU layer offload through the selected specialization.

    Args:
        layers_num (int): Number of layers to place on CPU.
        policy (str): Placement policy such as ``"prefix"``,
            ``"suffix"``, or ``"middle-band"``.

    Raises:
        ValueError: Raised when the model is not loaded or the selected
            specialization does not expose CPU offload support.
    """
    if self._apply_cpu_offload is None:
        raise ValueError(f"{self.model_id} does not support CPU layer offload")
    model = getattr(self, "model", None)
    if model is None:
        raise ValueError(f"{self.model_id} is not loaded")
    # Translate (count, policy) into concrete layer indices before delegating.
    placement = plan_cpu_offload_placement(
        requested_layers=layers_num,
        total_layers=require_hidden_layer_count(model),
        policy=policy,
    )
    self._apply_cpu_offload(placement.layer_indices)

offload_layers_to_gpu_cpu

offload_layers_to_gpu_cpu(
    gpu_layers_num: int = 0, cpu_layers_num: int = 0
) -> None

Apply mixed GPU/CPU layer placement when the specialization exposes it.

Parameters:

Name Type Description Default
gpu_layers_num int

Number of layers to keep on the accelerator.

0
cpu_layers_num int

Number of layers to move to CPU.

0

Raises:

Type Description
ValueError

Raised when the specialization does not expose mixed placement support.

Source code in src/ollm/inference.py
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
def offload_layers_to_gpu_cpu(
    self, gpu_layers_num: int = 0, cpu_layers_num: int = 0
) -> None:
    """Apply mixed GPU/CPU layer placement when the specialization exposes it.

    Args:
        gpu_layers_num (int): Number of layers to keep on the accelerator.
        cpu_layers_num (int): Number of layers to move to CPU.

    Raises:
        ValueError: Raised when the specialization does not expose mixed
            placement support.
    """
    # No-op when nothing is requested, so this never raises for defaults.
    if gpu_layers_num == 0 and cpu_layers_num == 0:
        return
    if self._apply_gpu_offload is None:
        raise ValueError(f"{self.model_id} does not support GPU layer offload")
    self._apply_gpu_offload(gpu_layers_num, cpu_layers_num)

DiskCache

DiskCache(
    cache_dir: str = "./kvcache",
    cache_strategy: str | None = None,
    cache_lifecycle: str | None = None,
    cache_window_tokens: int | None = None,
) -> object | None

Create the specialization-backed KV cache when supported.

Parameters:

Name Type Description Default
cache_dir str

Base cache directory for the cache instance.

'./kvcache'
cache_strategy str | None

Optional explicit KV strategy override.

None
cache_lifecycle str | None

Optional cache lifecycle override.

None
cache_window_tokens int | None

Optional sliding-window token budget override.

None

Returns:

Type Description
object | None

Specialization-backed cache object, or None when the loaded specialization does not expose a cache factory.

Source code in src/ollm/inference.py
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
def DiskCache(
    self,
    cache_dir: str = "./kvcache",
    cache_strategy: str | None = None,
    cache_lifecycle: str | None = None,
    cache_window_tokens: int | None = None,
) -> object | None:
    """Create the specialization-backed KV cache when supported.

    Args:
        cache_dir (str): Base cache directory for the cache instance.
        cache_strategy (str | None): Optional explicit KV strategy override.
        cache_lifecycle (str | None): Optional cache lifecycle override.
        cache_window_tokens (int | None): Optional sliding-window token
            budget override.

    Returns:
        object | None: Specialization-backed cache object, or ``None`` when
        the loaded specialization does not expose a cache factory.
    """
    # The factory is set during model load; without one there is no cache.
    if self._cache_factory is None:
        return None
    return self._cache_factory(
        Path(cache_dir).expanduser().resolve(),
        cache_strategy,
        cache_lifecycle,
        cache_window_tokens,
    )

Bases: Inference

Optimized-native helper for compatible local model directories.

AutoInference inspects a local model directory, infers the matching optimized-native family, and then loads the same optimized path that Inference uses for built-in aliases.

Parameters:

Name Type Description Default
model_dir str

Local model directory to inspect and load.

required
adapter_dir str | None

Optional LoRA adapter directory.

None
device str

Target device string such as "cuda:0" or "mps".

'cuda:0'
logging bool

Whether to collect runtime stats.

True
multimodality bool

Whether the runtime should plan for multimodal execution.

False
specialization_registry SpecializationRegistry | None

Optional specialization registry override.

None
resolver ModelResolver | None

Optional model resolver override.

None

Raises:

Type Description
ValueError

Raised when the local model directory is missing, uses an unsupported architecture, or references an invalid adapter directory.

Source code in src/ollm/inference.py
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
class AutoInference(Inference):
    """Optimized-native helper for compatible local model directories.

    ``AutoInference`` inspects a local model directory, infers the matching
    optimized-native family, and then loads the same optimized path that
    ``Inference`` uses for built-in aliases.

    Args:
        model_dir (str): Local model directory to inspect and load.
        adapter_dir (str | None): Optional LoRA adapter directory.
        device (str): Target device string such as ``"cuda:0"`` or ``"mps"``.
        logging (bool): Whether to collect runtime stats.
        multimodality (bool): Whether the runtime should plan for multimodal
            execution.
        specialization_registry (SpecializationRegistry | None): Optional
            specialization registry override.
        resolver (ModelResolver | None): Optional model resolver override.

    Raises:
        ValueError: Raised when the local model directory is missing, uses an
            unsupported architecture, or references an invalid adapter
            directory.
    """

    def __init__(
        self,
        model_dir: str,
        adapter_dir: str | None = None,
        device: str = "cuda:0",
        logging: bool = True,
        multimodality: bool = False,
        specialization_registry: SpecializationRegistry | None = None,
        resolver: ModelResolver | None = None,
    ):
        model_path = Path(model_dir).expanduser().resolve()
        if not model_path.exists() or not model_path.is_dir():
            raise ValueError(f"Local model directory does not exist: {model_path}")

        # Sniff the architecture from the directory's config to map it onto a
        # built-in optimized family; only Llama and Gemma3 are supported.
        config = AutoConfig.from_pretrained(model_path)
        architectures = getattr(config, "architectures", None) or ()
        architecture = architectures[0] if architectures else None
        if architecture == "LlamaForCausalLM":
            optimized_model_id = "llama3-1B-chat"
        elif architecture in {"Gemma3ForConditionalGeneration", "Gemma3ForCausalLM"}:
            optimized_model_id = "gemma3-12B"
        else:
            raise ValueError(
                f"The current optimized path cannot run architecture {architecture!r}. "
                "Use a built-in optimized alias or a compatible local Llama/Gemma3 model directory."
            )

        # Base init stores the local path as model_id; the optimized alias and
        # reference are overridden right after.
        super().__init__(
            str(model_path),
            device=device,
            logging=logging,
            multimodality=multimodality,
            specialization_registry=specialization_registry,
            resolver=resolver,
        )
        self.optimized_model_id = optimized_model_id
        self.model_reference = str(model_path)
        # Load immediately (unlike Inference, which waits for ini_model/load_model).
        self._load_optimized_model(
            model_path,
            ModelSourceKind.LOCAL_PATH,
            self.model_reference,
        )
        if adapter_dir:
            # Optional LoRA adapter: validate the directory, then wrap the
            # loaded model with PEFT and load adapter weights from safetensors.
            adapter_path = Path(adapter_dir).expanduser().resolve()
            if not adapter_path.exists() or not adapter_path.is_dir():
                raise ValueError(f"Adapter directory does not exist: {adapter_path}")
            validate_safe_adapter_artifacts(adapter_path)
            lora_config_cls, get_peft_model = _load_peft_symbols()
            peft_config = lora_config_cls.from_pretrained(str(adapter_path))
            peft_model = cast(
                _PeftModelProtocol, get_peft_model(self.model, peft_config)
            )
            peft_model.load_adapter(
                str(adapter_path), adapter_name="default", use_safetensors=True
            )
            self.model = peft_model