from transformers import AutoTokenizer, AutoModelForCausalLM
import tokenizer
from torchao.quantization import quantize_, int8_weight_only

# Interactive single-turn demo: load a causal LM, quantize its weights to
# int8 with torchao, read one prompt from stdin, and print the completion.

model_name = "swiss-ai/Apertus-8B-2509"
device = "cuda"  # for GPU usage or "cpu" for CPU usage

# Load the tokenizer and the model, then move the model to the target device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
).to(device)

# Apply torchao int8 weight-only quantization. The return value is not used,
# so this relies on quantize_ modifying `model` in place.
quantize_(model, int8_weight_only())

# Read the user's prompt from stdin.
print("Please enter the prompt you want to ask the cool AI")
prompt = input()
if not prompt.strip():
    # Nothing meaningful to ask: stop here rather than spending up to
    # 32768 new tokens generating from an empty message.
    raise SystemExit("No prompt entered; nothing to generate.")

messages_think = [
    {"role": "user", "content": prompt}
]

# Custom Jinja chat template, passed explicitly to apply_chat_template below.
# Each message is rendered as a role header, its content, and an end marker.
# The trailing `add_generation_prompt` branch is required:
# apply_chat_template only forwards that flag to the template as a variable,
# so the template itself has to emit the assistant header that cues the
# model to start its reply.
example_template = """
{% for message in messages %}
<|start|>{{ message.role }}<|sep|>
{{ message.content }}
<|end|>
{% endfor %}
{% if add_generation_prompt %}
<|start|>assistant<|sep|>
{% endif %}
"""

# Render the conversation to a plain string (tokenize=False) so it can be
# tokenized as a batch of one in the next step.
text = tokenizer.apply_chat_template(
    messages_think,
    chat_template=example_template,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate the output (at most 32768 new tokens).
generated_ids = model.generate(**model_inputs, max_new_tokens=32768)

# The generated sequence starts with the prompt tokens; slice them off so
# only the newly generated tokens are decoded and printed.
prompt_length = len(model_inputs.input_ids[0])
output_ids = generated_ids[0][prompt_length:]
print(tokenizer.decode(output_ids, skip_special_tokens=True))