from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
import torch
print("🚀 Starting Gopu Agent training...")
# Load several open-source datasets
ds1 = load_dataset("OpenAssistant/oasst1", split="train[:2%]")
ds2 = load_dataset("databricks/databricks-dolly-15k", split="train[:10%]")
ds3 = load_dataset("lvwerra/stack-exchange-paired", split="train[:5%]")

# concatenate_datasets requires identical features, but these three datasets
# use different schemas, so normalize each one to a single "text" column first.
ds1 = ds1.map(lambda ex: {"text": ex["text"]}, remove_columns=ds1.column_names)
ds2 = ds2.map(
    lambda ex: {"text": ex["instruction"] + "\n" + ex["response"]},
    remove_columns=ds2.column_names,
)
ds3 = ds3.map(
    # response_j is the preferred answer in stack-exchange-paired
    lambda ex: {"text": ex["question"] + "\n" + ex["response_j"]},
    remove_columns=ds3.column_names,
)
dataset = concatenate_datasets([ds1, ds2, ds3])
# Base model
model_name = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Falcon's tokenizer has no pad token by default
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
# Tokenize the normalized "text" column; with batched=True the tokenizer
# receives lists of strings, which it handles directly.
def preprocess(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

dataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)
# Training configuration
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_steps=1000,
    learning_rate=2e-5,
    logging_dir="./logs",
    push_to_hub=True,
    hub_model_id="yourusername/gopu-agent",  # placeholder: use your Hub username
)
# The causal-LM collator (mlm=False) builds labels from input_ids,
# which the Trainer needs in order to compute a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)
trainer.train()

print("✅ Training finished, pushing to the Hugging Face Hub...")
trainer.push_to_hub()
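
# --- Usage sketch (run separately after training) ---
# A minimal, assumption-laden check that the published model loads back from
# the Hub; "yourusername/gopu-agent" is the same placeholder repo id as in
# hub_model_id above, not a real repository.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="yourusername/gopu-agent",  # placeholder repo id
    torch_dtype=torch.bfloat16,
)
print(generator("What can Gopu Agent do?", max_new_tokens=64)[0]["generated_text"])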