"""Time one text generation with an int8 weight-only quantized causal LM.

The script loads the model named by ``MODEL_NAME``, quantizes its weights in
place with torchao's ``int8_weight_only`` config, moves it to the GPU, reads a
single prompt from stdin, and prints the wall-clock generation time followed
by the decoded output.
"""
import time

import torch
from torchao.quantization import int8_weight_only, quantize_
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "swiss-ai/Apertus-8B-Instruct-2509"
DEVICE = "cuda"
# Passed to ``generate`` as ``max_length``; per the transformers docs this
# bounds prompt tokens plus newly generated tokens, not new tokens alone.
MAX_LENGTH = 5000


def main() -> None:
    """Load, quantize, and time a single generation for a prompt from stdin.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    # Fail fast: without this check the script only errors at ``model.to``,
    # after the full model has already been loaded and quantized.
    if not torch.cuda.is_available():
        raise RuntimeError("A CUDA device is required to run this script.")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    # Quantization is applied in place, before the move to the GPU (same
    # order as the original script).
    quantize_(model, int8_weight_only())
    model.to(DEVICE)

    print("Enter your prompt:")
    input_text = input()

    # NOTE(review): the prompt is fed raw. The model name suggests an
    # instruction-tuned checkpoint, which may expect its chat template to be
    # applied first -- confirm against the model card.
    # Calling the tokenizer (rather than ``encode``) also yields the attention
    # mask, so ``generate`` does not have to guess it from the input ids.
    encoded = tokenizer(input_text, return_tensors="pt").to(DEVICE)

    # Synchronize around the timed region so pending GPU work from model
    # setup is excluded and all generation work is included.
    torch.cuda.synchronize()
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=MAX_LENGTH,
        )
    torch.cuda.synchronize()
    end_time = time.perf_counter()

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Quantized inference time: {end_time - start_time:.2f} seconds")
    print(f"Generated text: {generated_text}")


if __name__ == "__main__":
    main()