from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
import torch

print("🚀 Starting training of Gopu Agent...")

# Load several open-source datasets
ds1 = load_dataset("OpenAssistant/oasst1", split="train[:2%]")
ds2 = load_dataset("databricks/databricks-dolly-15k", split="train[:10%]")
ds3 = load_dataset("lvwerra/stack-exchange-paired", split="train[:5%]")

# The three datasets have different schemas, and concatenate_datasets requires
# identical features, so map each one to a single "text" column first.
def to_text(example):
    prompt = example.get("text") or example.get("instruction") or example.get("question") or ""
    response = example.get("response") or example.get("response_j") or ""
    return {"text": prompt + "\n" + response}

ds1 = ds1.map(to_text, remove_columns=ds1.column_names)
ds2 = ds2.map(to_text, remove_columns=ds2.column_names)
ds3 = ds3.map(to_text, remove_columns=ds3.column_names)
dataset = concatenate_datasets([ds1, ds2, ds3])

# Base model
model_name = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Falcon's tokenizer has no pad token by default
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# Preprocessing: tokenize the unified "text" column
def preprocess(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

dataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)

# Collator that copies input_ids into labels for causal-LM training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_steps=1000,
    learning_rate=2e-5,
    logging_dir="./logs",
    push_to_hub=True,
    hub_model_id="tonusername/gopu-agent",  # placeholder Hub repository id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)
trainer.train()

print("✅ Training finished, pushing to the Hugging Face Hub...")
trainer.push_to_hub()
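
# Optional sanity check (a minimal sketch, not part of the original script):
# generate a short completion with the freshly fine-tuned model already in
# memory. The prompt below is an arbitrary example, not taken from the
# training data.
model.eval()
prompt = "How do I reverse a list in Python?\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=64, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(output[0], skip_special_tokens=True))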