Skip to content

Optimized-native Inference API

These helpers expose the low-level optimized-native path directly. For new high-level application code, prefer RuntimeClient.

Download only runtime-critical Hugging Face artifacts into a local model directory.

Source code in src/ollm/runtime/materialization.py
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def download_hf_snapshot(
    repo_id: str,
    model_dir: str,
    force_download: bool = False,
    revision: str | None = None,
) -> None:
    """Download only runtime-critical Hugging Face artifacts into a local model directory.

    Args:
        repo_id: Hugging Face repository identifier to fetch from.
        model_dir: Local directory that will hold the runtime artifacts.
        force_download: Whether to re-fetch files even when already present.
        revision: Optional repository revision (branch, tag, or commit).

    Raises:
        ManagedModelDownloadError: When required runtime artifacts are still
            missing after the snapshot download finishes or fails.
    """
    destination = Path(model_dir).expanduser().resolve()
    destination.mkdir(parents=True, exist_ok=True)
    # Drop non-runtime leftovers before fetching so gap detection is accurate.
    prune_hf_runtime_artifacts(destination)
    LOGGER.info("Downloading runtime artifacts for %s.", repo_id)
    fetch = cast(_SnapshotDownloadCallable, snapshot_download)
    try:
        fetch(
            repo_id=repo_id,
            local_dir=str(destination),
            local_dir_use_symlinks=False,
            force_download=force_download,
            revision=revision,
            allow_patterns=list(HF_RUNTIME_ARTIFACT_PATTERNS),
        )
    except Exception as err:
        # A failed call may still have left the directory complete (e.g. a
        # transient error after every required file landed), so re-check.
        prune_hf_runtime_artifacts(destination)
        gaps = _runtime_artifact_gaps(destination)
        if gaps:
            raise ManagedModelDownloadError(repo_id, destination, gaps) from err
        LOGGER.warning(
            "Hugging Face download for %s raised %s after materialization completed.",
            repo_id,
            type(err).__name__,
        )
        return
    prune_hf_runtime_artifacts(destination)
    remaining = _runtime_artifact_gaps(destination)
    if remaining:
        raise ManagedModelDownloadError(repo_id, destination, remaining)

Direct optimized-native helper for built-in aliases.

This class is the low-level optimized-native entry point. It is best suited for direct model control in scripts that intentionally want to bypass the higher-level RuntimeClient surface.

Parameters:

Name Type Description Default
model_id str

Built-in optimized alias to load.

required
device str

Target device string such as "cuda:0" or "mps".

'cuda:0'
logging bool

Whether to collect runtime stats.

True
multimodality bool

Whether the runtime should plan for multimodal execution.

False
specialization_registry SpecializationRegistry | None

Optional specialization registry override.

None
resolver ModelResolver | None

Optional model resolver override.

None
Source code in src/ollm/inference.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
class Inference:
    """Direct optimized-native helper for built-in aliases.

    This class is the low-level optimized-native entry point. It is best suited
    for direct model control in scripts that intentionally want to bypass the
    higher-level ``RuntimeClient`` surface.

    Args:
        model_id (str): Built-in optimized alias to load.
        device (str): Target device string such as ``"cuda:0"`` or ``"mps"``.
        logging (bool): Whether to collect runtime stats.
        multimodality (bool): Whether the runtime should plan for multimodal
            execution.
        specialization_registry (SpecializationRegistry | None): Optional
            specialization registry override.
        resolver (ModelResolver | None): Optional model resolver override.
    """

    def __init__(
        self,
        model_id: str,
        device: str = "cuda:0",
        logging: bool = True,
        multimodality: bool = False,
        specialization_registry: SpecializationRegistry | None = None,
        resolver: ModelResolver | None = None,
    ):
        # model_id doubles as the raw model reference and the optimized alias
        # here; subclasses (e.g. AutoInference) overwrite the latter two after
        # calling this initializer.
        self.model_id = model_id
        self.model_reference = model_id
        self.optimized_model_id = model_id
        self.device = torch.device(device)
        self.multimodality = multimodality
        # NOTE: the `logging` parameter shadows the stdlib module name; it only
        # toggles stats collection, it does not configure log output.
        self.stats = Stats() if logging else None
        self._specialization_registry = (
            build_default_specialization_registry()
            if specialization_registry is None
            else specialization_registry
        )
        self._resolver = resolver or ModelResolver()
        self._specialization_pipeline = SpecializationPipeline()
        # The following are populated by _load_optimized_model once a model is
        # actually loaded; until then offload/cache features are unavailable.
        self._cache_factory = None
        self._apply_cpu_offload = None
        self._apply_gpu_offload = None
        self.loaded_resolved_model = None
        self.loaded_specialization_provider_id = None
        self.loaded_applied_specialization_pass_ids = ()

    def hf_download(self, model_dir: str, force_download: bool = False) -> None:
        """Download the built-in optimized alias into a local directory.

        Args:
            model_dir (str): Target local model directory.
            force_download (bool): Whether to force a fresh snapshot download.

        Raises:
            ValueError: Raised when the current ``optimized_model_id`` is not a
                built-in optimized alias.
        """
        # Only catalogued aliases map to a known Hugging Face repo.
        entry = find_model_catalog_entry(self.optimized_model_id)
        if entry is None:
            raise ValueError(
                f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
            )
        download_hf_snapshot(entry.repo_id, model_dir, force_download=force_download)

    def ini_model(
        self, models_dir: str = "./models/", force_download: bool = False
    ) -> None:
        """Download if needed and then load the optimized-native runtime.

        Args:
            models_dir (str): Parent directory that will contain the managed
                model directory.
            force_download (bool): Whether to force a fresh snapshot download.

        Raises:
            ValueError: Raised when the current optimized alias is invalid or the
                local model directory cannot be prepared.
        """
        entry = find_model_catalog_entry(self.optimized_model_id)
        if entry is None:
            raise ValueError(
                f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
            )

        model_dir = Path(models_dir).expanduser().resolve() / entry.model_id
        if force_download or not model_dir.exists():
            self.hf_download(str(model_dir), force_download=force_download)
        prune_hf_runtime_artifacts(model_dir)
        # One repair attempt: if required artifacts are still missing after
        # pruning, re-download; hf_download raises when gaps remain after that.
        if not hf_runtime_artifacts_complete(model_dir):
            self.hf_download(str(model_dir), force_download=force_download)
            prune_hf_runtime_artifacts(model_dir)

        self.load_model(str(model_dir))

    def load_model(self, model_dir: str) -> None:
        """Load an optimized-native runtime from a local directory.

        Args:
            model_dir (str): Local model directory for the optimized alias.

        Raises:
            ValueError: Raised when the path does not exist or the current
                optimized alias is invalid.
        """
        model_path = Path(model_dir).expanduser().resolve()
        if not model_path.exists() or not model_path.is_dir():
            raise ValueError(f"Model directory does not exist: {model_path}")
        entry = find_model_catalog_entry(self.optimized_model_id)
        if entry is None:
            raise ValueError(
                f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
            )
        self._load_optimized_model(
            model_path,
            ModelSourceKind.BUILTIN,
            self.model_reference,
            catalog_entry=entry,
        )

    def _load_optimized_model(
        self,
        model_path: Path,
        source_kind: ModelSourceKind,
        raw_reference: str,
        catalog_entry: ModelCatalogEntry | None = None,
    ) -> None:
        # Core load pipeline shared by Inference and AutoInference:
        # resolve -> select specialization -> plan -> load artifacts -> apply.
        entry = catalog_entry
        if source_kind is ModelSourceKind.BUILTIN:
            # Built-in loads always re-validate the alias against the catalog,
            # even when a catalog_entry was passed in.
            entry = find_model_catalog_entry(self.optimized_model_id)
            if entry is None:
                raise ValueError(
                    f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
                )

        LOGGER.info("Loading optimized model from %s.", model_path)
        runtime_config = RuntimeConfig(
            model_reference=raw_reference,
            device=str(self.device),
            multimodal=self.multimodality,
            stats=self.stats is not None,
        )
        # Inspect the materialized directory; entry may be None for local-path
        # loads, in which case repo metadata is unavailable.
        resolved_model = self._resolver.inspect_materialized_model(
            ModelReference.parse(raw_reference),
            model_path,
            source_kind=source_kind,
            repo_id=None if entry is None else entry.repo_id,
            revision=None,
            catalog_entry=entry,
        )
        specialization_match = self._specialization_registry.select(
            resolved_model, runtime_config
        )
        if specialization_match is None:
            raise ValueError(
                f"No optimized specialization provider is available for {self.model_id!r} at {model_path}"
            )
        # Plan first, then load artifacts, then apply the planned passes to
        # the loaded artifacts.
        planned_specialization = self._specialization_pipeline.plan(
            resolved_model,
            runtime_config,
            specialization_match.provider_id,
        )
        artifacts = self._specialization_registry.load(
            specialization_match.provider_id,
            resolved_model,
            runtime_config,
            self.stats,
        )
        applied_specialization = apply_specialization(
            planned_specialization,
            artifacts,
            runtime_config,
        )
        # Record what was loaded/applied for introspection by callers.
        self.loaded_resolved_model = resolved_model
        self.loaded_specialization_provider_id = specialization_match.provider_id
        self.loaded_applied_specialization_pass_ids = (
            applied_specialization.applied_pass_ids
        )
        self.model = artifacts.model
        self.tokenizer = artifacts.tokenizer
        if artifacts.processor is None:
            # Drop a stale processor from a previous load when the new
            # artifacts do not provide one.
            if hasattr(self, "processor"):
                delattr(self, "processor")
        else:
            self.processor = artifacts.processor
        self._cache_factory = artifacts.create_cache
        self._apply_cpu_offload = artifacts.apply_cpu_offload
        self._apply_gpu_offload = artifacts.apply_gpu_offload

    def offload_layers_to_cpu(self, layers_num: int, policy: str = "prefix") -> None:
        """Apply CPU layer offload through the selected specialization.

        Args:
            layers_num (int): Number of layers to place on CPU.
            policy (str): Placement policy such as ``"prefix"``,
                ``"suffix"``, or ``"middle-band"``.

        Raises:
            ValueError: Raised when the model is not loaded or the selected
                specialization does not expose CPU offload support.
        """
        if self._apply_cpu_offload is None:
            raise ValueError(f"{self.model_id} does not support CPU layer offload")
        model = getattr(self, "model", None)
        if model is None:
            raise ValueError(f"{self.model_id} is not loaded")
        # Translate (count, policy) into concrete layer indices before
        # delegating to the specialization hook.
        placement = plan_cpu_offload_placement(
            requested_layers=layers_num,
            total_layers=require_hidden_layer_count(model),
            policy=policy,
        )
        self._apply_cpu_offload(placement.layer_indices)

    def offload_layers_to_gpu_cpu(
        self, gpu_layers_num: int = 0, cpu_layers_num: int = 0
    ) -> None:
        """Apply mixed GPU/CPU layer placement when the specialization exposes it.

        Args:
            gpu_layers_num (int): Number of layers to keep on the accelerator.
            cpu_layers_num (int): Number of layers to move to CPU.

        Raises:
            ValueError: Raised when the specialization does not expose mixed
                placement support.
        """
        # No-op when nothing is requested, so this never raises for defaults.
        if gpu_layers_num == 0 and cpu_layers_num == 0:
            return
        if self._apply_gpu_offload is None:
            raise ValueError(f"{self.model_id} does not support GPU layer offload")
        self._apply_gpu_offload(gpu_layers_num, cpu_layers_num)

    def DiskCache(
        self,
        cache_dir: str = "./kvcache",
        cache_strategy: str | None = None,
        cache_lifecycle: str | None = None,
        cache_window_tokens: int | None = None,
    ) -> object | None:
        """Create the specialization-backed KV cache when supported.

        Args:
            cache_dir (str): Base cache directory for the cache instance.
            cache_strategy (str | None): Optional explicit KV strategy override.
            cache_lifecycle (str | None): Optional cache lifecycle override.
            cache_window_tokens (int | None): Optional sliding-window token
                budget override.

        Returns:
            object | None: Specialization-backed cache object, or ``None`` when
            the loaded specialization does not expose a cache factory.
        """
        # NOTE(review): non-PEP8 CamelCase method name — presumably kept for
        # public-API compatibility; confirm before renaming.
        if self._cache_factory is None:
            return None
        return self._cache_factory(
            Path(cache_dir).expanduser().resolve(),
            cache_strategy,
            cache_lifecycle,
            cache_window_tokens,
        )

hf_download

hf_download(
    model_dir: str, force_download: bool = False
) -> None

Download the built-in optimized alias into a local directory.

Parameters:

Name Type Description Default
model_dir str

Target local model directory.

required
force_download bool

Whether to force a fresh snapshot download.

False

Raises:

Type Description
ValueError

Raised when the current optimized_model_id is not a built-in optimized alias.

Source code in src/ollm/inference.py
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def hf_download(self, model_dir: str, force_download: bool = False) -> None:
    """Download the built-in optimized alias into a local directory.

    Args:
        model_dir (str): Target local model directory.
        force_download (bool): Whether to force a fresh snapshot download.

    Raises:
        ValueError: Raised when the current ``optimized_model_id`` is not a
            built-in optimized alias.
    """
    # Only catalogued aliases map to a known Hugging Face repo.
    entry = find_model_catalog_entry(self.optimized_model_id)
    if entry is None:
        raise ValueError(
            f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
        )
    download_hf_snapshot(entry.repo_id, model_dir, force_download=force_download)

ini_model

ini_model(
    models_dir: str = "./models/",
    force_download: bool = False,
) -> None

Download if needed and then load the optimized-native runtime.

Parameters:

Name Type Description Default
models_dir str

Parent directory that will contain the managed model directory.

'./models/'
force_download bool

Whether to force a fresh snapshot download.

False

Raises:

Type Description
ValueError

Raised when the current optimized alias is invalid or the local model directory cannot be prepared.

Source code in src/ollm/inference.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def ini_model(
    self, models_dir: str = "./models/", force_download: bool = False
) -> None:
    """Download if needed and then load the optimized-native runtime.

    Args:
        models_dir (str): Parent directory that will contain the managed
            model directory.
        force_download (bool): Whether to force a fresh snapshot download.

    Raises:
        ValueError: Raised when the current optimized alias is invalid or the
            local model directory cannot be prepared.
    """
    entry = find_model_catalog_entry(self.optimized_model_id)
    if entry is None:
        raise ValueError(
            f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
        )

    model_dir = Path(models_dir).expanduser().resolve() / entry.model_id
    if force_download or not model_dir.exists():
        self.hf_download(str(model_dir), force_download=force_download)
    prune_hf_runtime_artifacts(model_dir)
    # One repair attempt: re-download if required artifacts are still missing.
    if not hf_runtime_artifacts_complete(model_dir):
        self.hf_download(str(model_dir), force_download=force_download)
        prune_hf_runtime_artifacts(model_dir)

    self.load_model(str(model_dir))

load_model

load_model(model_dir: str) -> None

Load an optimized-native runtime from a local directory.

Parameters:

Name Type Description Default
model_dir str

Local model directory for the optimized alias.

required

Raises:

Type Description
ValueError

Raised when the path does not exist or the current optimized alias is invalid.

Source code in src/ollm/inference.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def load_model(self, model_dir: str) -> None:
    """Load an optimized-native runtime from a local directory.

    Args:
        model_dir (str): Local model directory for the optimized alias.

    Raises:
        ValueError: Raised when the path does not exist or the current
            optimized alias is invalid.
    """
    model_path = Path(model_dir).expanduser().resolve()
    if not model_path.exists() or not model_path.is_dir():
        raise ValueError(f"Model directory does not exist: {model_path}")
    # Re-validate the alias before delegating to the shared load pipeline.
    entry = find_model_catalog_entry(self.optimized_model_id)
    if entry is None:
        raise ValueError(
            f"Inference only supports built-in optimized aliases. Received {self.optimized_model_id!r}."
        )
    self._load_optimized_model(
        model_path,
        ModelSourceKind.BUILTIN,
        self.model_reference,
        catalog_entry=entry,
    )

offload_layers_to_cpu

offload_layers_to_cpu(
    layers_num: int, policy: str = "prefix"
) -> None

Apply CPU layer offload through the selected specialization.

Parameters:

Name Type Description Default
layers_num int

Number of layers to place on CPU.

required
policy str

Placement policy such as "prefix", "suffix", or "middle-band".

'prefix'

Raises:

Type Description
ValueError

Raised when the model is not loaded or the selected specialization does not expose CPU offload support.

Source code in src/ollm/inference.py
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
def offload_layers_to_cpu(self, layers_num: int, policy: str = "prefix") -> None:
    """Apply CPU layer offload through the selected specialization.

    Args:
        layers_num (int): Number of layers to place on CPU.
        policy (str): Placement policy such as ``"prefix"``,
            ``"suffix"``, or ``"middle-band"``.

    Raises:
        ValueError: Raised when the model is not loaded or the selected
            specialization does not expose CPU offload support.
    """
    if self._apply_cpu_offload is None:
        raise ValueError(f"{self.model_id} does not support CPU layer offload")
    model = getattr(self, "model", None)
    if model is None:
        raise ValueError(f"{self.model_id} is not loaded")
    # Translate (count, policy) into concrete layer indices before delegating.
    placement = plan_cpu_offload_placement(
        requested_layers=layers_num,
        total_layers=require_hidden_layer_count(model),
        policy=policy,
    )
    self._apply_cpu_offload(placement.layer_indices)

offload_layers_to_gpu_cpu

offload_layers_to_gpu_cpu(
    gpu_layers_num: int = 0, cpu_layers_num: int = 0
) -> None

Apply mixed GPU/CPU layer placement when the specialization exposes it.

Parameters:

Name Type Description Default
gpu_layers_num int

Number of layers to keep on the accelerator.

0
cpu_layers_num int

Number of layers to move to CPU.

0

Raises:

Type Description
ValueError

Raised when the specialization does not expose mixed placement support.

Source code in src/ollm/inference.py
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
def offload_layers_to_gpu_cpu(
    self, gpu_layers_num: int = 0, cpu_layers_num: int = 0
) -> None:
    """Apply mixed GPU/CPU layer placement when the specialization exposes it.

    Args:
        gpu_layers_num (int): Number of layers to keep on the accelerator.
        cpu_layers_num (int): Number of layers to move to CPU.

    Raises:
        ValueError: Raised when the specialization does not expose mixed
            placement support.
    """
    # No-op when nothing is requested, so this never raises for defaults.
    if gpu_layers_num == 0 and cpu_layers_num == 0:
        return
    if self._apply_gpu_offload is None:
        raise ValueError(f"{self.model_id} does not support GPU layer offload")
    self._apply_gpu_offload(gpu_layers_num, cpu_layers_num)

DiskCache

DiskCache(
    cache_dir: str = "./kvcache",
    cache_strategy: str | None = None,
    cache_lifecycle: str | None = None,
    cache_window_tokens: int | None = None,
) -> object | None

Create the specialization-backed KV cache when supported.

Parameters:

Name Type Description Default
cache_dir str

Base cache directory for the cache instance.

'./kvcache'
cache_strategy str | None

Optional explicit KV strategy override.

None
cache_lifecycle str | None

Optional cache lifecycle override.

None
cache_window_tokens int | None

Optional sliding-window token budget override.

None

Returns:

Type Description
object | None

Specialization-backed cache object, or None when the loaded specialization does not expose a cache factory.

Source code in src/ollm/inference.py
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
def DiskCache(
    self,
    cache_dir: str = "./kvcache",
    cache_strategy: str | None = None,
    cache_lifecycle: str | None = None,
    cache_window_tokens: int | None = None,
) -> object | None:
    """Create the specialization-backed KV cache when supported.

    Args:
        cache_dir (str): Base cache directory for the cache instance.
        cache_strategy (str | None): Optional explicit KV strategy override.
        cache_lifecycle (str | None): Optional cache lifecycle override.
        cache_window_tokens (int | None): Optional sliding-window token
            budget override.

    Returns:
        object | None: Specialization-backed cache object, or ``None`` when
        the loaded specialization does not expose a cache factory.
    """
    # The factory is set during model load; without one there is no cache.
    if self._cache_factory is None:
        return None
    return self._cache_factory(
        Path(cache_dir).expanduser().resolve(),
        cache_strategy,
        cache_lifecycle,
        cache_window_tokens,
    )

Bases: Inference

Optimized-native helper for compatible local model directories.

AutoInference inspects a local model directory, infers the matching optimized-native family, and then loads the same optimized path that Inference uses for built-in aliases.

Parameters:

Name Type Description Default
model_dir str

Local model directory to inspect and load.

required
adapter_dir str | None

Optional LoRA adapter directory.

None
device str

Target device string such as "cuda:0" or "mps".

'cuda:0'
logging bool

Whether to collect runtime stats.

True
multimodality bool

Whether the runtime should plan for multimodal execution.

False
specialization_registry SpecializationRegistry | None

Optional specialization registry override.

None
resolver ModelResolver | None

Optional model resolver override.

None

Raises:

Type Description
ValueError

Raised when the local model directory is missing, uses an unsupported architecture, or references an invalid adapter directory.

Source code in src/ollm/inference.py
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
class AutoInference(Inference):
    """Optimized-native helper for compatible local model directories.

    ``AutoInference`` inspects a local model directory, infers the matching
    optimized-native family, and then loads the same optimized path that
    ``Inference`` uses for built-in aliases.

    Args:
        model_dir (str): Local model directory to inspect and load.
        adapter_dir (str | None): Optional LoRA adapter directory.
        device (str): Target device string such as ``"cuda:0"`` or ``"mps"``.
        logging (bool): Whether to collect runtime stats.
        multimodality (bool): Whether the runtime should plan for multimodal
            execution.
        specialization_registry (SpecializationRegistry | None): Optional
            specialization registry override.
        resolver (ModelResolver | None): Optional model resolver override.

    Raises:
        ValueError: Raised when the local model directory is missing, uses an
            unsupported architecture, or references an invalid adapter
            directory.
    """

    def __init__(
        self,
        model_dir: str,
        adapter_dir: str | None = None,
        device: str = "cuda:0",
        logging: bool = True,
        multimodality: bool = False,
        specialization_registry: SpecializationRegistry | None = None,
        resolver: ModelResolver | None = None,
    ):
        model_path = Path(model_dir).expanduser().resolve()
        if not model_path.exists() or not model_path.is_dir():
            raise ValueError(f"Local model directory does not exist: {model_path}")

        # Sniff the architecture from the directory's config to map it onto a
        # built-in optimized family; only Llama and Gemma3 are supported.
        config = AutoConfig.from_pretrained(model_path)
        architectures = getattr(config, "architectures", None) or ()
        architecture = architectures[0] if architectures else None
        if architecture == "LlamaForCausalLM":
            optimized_model_id = "llama3-1B-chat"
        elif architecture in {"Gemma3ForConditionalGeneration", "Gemma3ForCausalLM"}:
            optimized_model_id = "gemma3-12B"
        else:
            raise ValueError(
                f"The current optimized path cannot run architecture {architecture!r}. "
                "Use a built-in optimized alias or a compatible local Llama/Gemma3 model directory."
            )

        # Base init stores the local path as model_id; the optimized alias and
        # reference are overridden right after.
        super().__init__(
            str(model_path),
            device=device,
            logging=logging,
            multimodality=multimodality,
            specialization_registry=specialization_registry,
            resolver=resolver,
        )
        self.optimized_model_id = optimized_model_id
        self.model_reference = str(model_path)
        # Load immediately (unlike Inference, which waits for ini_model/load_model).
        self._load_optimized_model(
            model_path,
            ModelSourceKind.LOCAL_PATH,
            self.model_reference,
        )
        if adapter_dir:
            # Optional LoRA adapter: validate the directory, then wrap the
            # loaded model with PEFT and load adapter weights from safetensors.
            adapter_path = Path(adapter_dir).expanduser().resolve()
            if not adapter_path.exists() or not adapter_path.is_dir():
                raise ValueError(f"Adapter directory does not exist: {adapter_path}")
            validate_safe_adapter_artifacts(adapter_path)
            lora_config_cls, get_peft_model = _load_peft_symbols()
            peft_config = lora_config_cls.from_pretrained(str(adapter_path))
            peft_model = cast(
                _PeftModelProtocol, get_peft_model(self.model, peft_config)
            )
            peft_model.load_adapter(
                str(adapter_path), adapter_name="default", use_safetensors=True
            )
            self.model = peft_model