"""Compare tokenization of a preprompt alone vs. the preprompt plus text.

Loads the gated ``google/gemma-2b`` tokenizer (requires the ``HF_TOKEN``
environment variable and network access to the Hugging Face Hub on first
run), then prints the token count and the token strings for the preprompt
prefix and for the full prompt.
"""
import os

from transformers import AutoTokenizer


def main() -> None:
    """Tokenize the preprompt prefix and the full prompt; print both breakdowns."""
    tokenizer = AutoTokenizer.from_pretrained(
        "google/gemma-2b", token=os.environ.get("HF_TOKEN")
    )

    preprompt = "Focus on numbers"
    text = "The 86 billion neurons."
    full = f"{preprompt}\n\n{text}"

    # NOTE(review): both calls add special tokens (typically a BOS token),
    # so len(p_tok) plus the text's token count will not equal len(f_tok)
    # exactly — confirm with add_special_tokens=False if an exact prefix
    # split is needed.
    p_tok = tokenizer(preprompt + "\n\n")["input_ids"]
    f_tok = tokenizer(full)["input_ids"]

    # Distinct labels for counts vs. token lists (the original reused
    # "Preprompt tokens:" / "Full tokens:" for both, making output ambiguous).
    print(f"Preprompt token count: {len(p_tok)}")
    print(f"Full token count: {len(f_tok)}")
    print("Preprompt tokens:", tokenizer.convert_ids_to_tokens(p_tok))
    print("Full tokens:", tokenizer.convert_ids_to_tokens(f_tok))


if __name__ == "__main__":
    main()