Use actual template from Apertus

2025-09-24 11:43:06 +02:00
parent 23691ba663
commit bc12eb1bc1
1 changed files with 40 additions and 13 deletions
--- a/app.py
+++ b/app.py
@@ -1,24 +1,51 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 from torchao.quantization import quantize_, int8_weight_only

-model_name = "swiss-ai/Apertus-8B-Instruct-2509"
+model_name = "swiss-ai/Apertus-8B-2509"
+device = "cuda"  # for GPU usage or "cpu" for CPU usage
+
+# load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+).to(device)

+# int8 weight-only quantization via torchao; quantize_ modifies the model in place
 quantize_(model, int8_weight_only())
-model.to("cuda")

-print("Enter your prompt:")
-input_text = input()
-inputs = tokenizer.encode(input_text, return_tensors='pt').to("cuda")
+# prepare the model input
+print("Please enter the prompt you want to ask the cool AI")
+prompt = input()
+messages_think = [
+    {"role": "user", "content": prompt}
+]

-import time
-start_time = time.time()
-with torch.no_grad():
-    outputs = model.generate(inputs, max_length=5000)
+# Chat template handed to apply_chat_template below. The trailing `if` block
+# emits the assistant header when add_generation_prompt=True.
+# NOTE(review): confirm <|start|>/<|sep|>/<|end|> match this tokenizer's special tokens.
+example_template = """
+{% for message in messages %}
+<|start|>{{ message.role }}<|sep|>
+{{ message.content }}
+<|end|>
+{% endfor %}
+{% if add_generation_prompt %}
+<|start|>assistant<|sep|>
+{% endif %}
+"""

-end_time = time.time()
+text = tokenizer.apply_chat_template(
+    messages_think,
+    chat_template=example_template,
+    tokenize=False,
+    add_generation_prompt=True,
+)
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

-print(f"Quantized inference time: {end_time - start_time:.2f} seconds")
-print(f"Generated text: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
+# Generate the output
+generated_ids = model.generate(**model_inputs, max_new_tokens=32768)
+
+# Get and decode the output
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :]
+print(tokenizer.decode(output_ids, skip_special_tokens=True))