This commit is contained in:
Michelle Winkler
2025-09-24 10:02:43 +02:00
commit 23691ba663
2 changed files with 25 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
venv/

24
app.py Normal file
View File

@@ -0,0 +1,24 @@
import time

import torch
from torchao.quantization import quantize_, int8_weight_only
from transformers import AutoTokenizer, AutoModelForCausalLM
# Interactive benchmark script: loads a causal language model, applies torchao
# int8 weight-only quantization, reads one prompt from stdin, generates a
# completion on the GPU, and prints the elapsed generation time and the text.

# Hugging Face model id of the checkpoint to load and quantize.
model_name = "swiss-ai/Apertus-8B-Instruct-2509"

# Fail up front with a clear message instead of only at model.to("cuda"),
# i.e. after the whole checkpoint has already been loaded.
if not torch.cuda.is_available():
    raise RuntimeError("A CUDA-capable GPU is required to run this script.")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): no torch_dtype is passed, so the library's default precision is
# used for loading; consider an explicit dtype to bound host memory -- confirm.
model = AutoModelForCausalLM.from_pretrained(model_name)

# quantize_ modifies the model in place. It runs before the transfer to the
# GPU; presumably so that only the quantized weights have to fit in GPU memory.
quantize_(model, int8_weight_only())
model.to("cuda")

print("Enter your prompt:")
input_text = input()

# Calling the tokenizer (rather than tokenizer.encode) also produces the
# attention mask, so generate() does not have to infer it from the input ids.
encoded = tokenizer(input_text, return_tensors="pt").to("cuda")
inputs = encoded["input_ids"]

# CUDA kernels are launched asynchronously: synchronize on both sides of the
# timed region so the measurement covers exactly the generation work, and use
# perf_counter() because it is monotonic, unlike the wall clock.
torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.no_grad():
    outputs = model.generate(
        inputs,
        attention_mask=encoded.get("attention_mask"),
        # max_length bounds prompt tokens plus newly generated tokens.
        max_length=5000,
    )
torch.cuda.synchronize()
end_time = time.perf_counter()

print(f"Quantized inference time: {end_time - start_time:.2f} seconds")
print(f"Generated text: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")