import warnings
from typing import Any

from loguru import logger
from numpy import ndarray
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

from ..baseModel import BaseModel

warnings.filterwarnings("ignore")


class RandomForestTfIdf(BaseModel):
    """
    Sklearn implementation of BaseModel with integrated Grid Search.

    Builds a TF-IDF + RandomForest pipeline for multi-output text classification.
    """

    def __init__(self, language, path=None):
        """
        Initialize the RandomForestTfIdf model with configuration parameters.

        Args:
            language (str): Language for the model.
            path (str, optional): Path to load a pre-trained model. Defaults to None.
                If None, a new model is initialized.
        """
        self.params = {"stop_words": "english", "random_state": 42, "cv_folds": 5}
        # Grid-search space: `clf__estimator__*` reaches through the MultiOutputClassifier
        # wrapper to the underlying RandomForestClassifier.
        self.grid_params = {
            "clf__estimator__n_estimators": [50, 100, 200],
            "clf__estimator__max_depth": [None, 10, 20],
            "tfidf__max_features": [3000, 5000, 8000],
        }
        super().__init__(language, path)

    def setup_model(self):
        """
        Initialize the scikit-learn pipeline with TF-IDF vectorizer and RandomForest classifier.
        """
        base_estimator = RandomForestClassifier(
            random_state=self.params["random_state"], n_jobs=-1
        )
        # MultiOutputClassifier fits one RandomForest per target column.
        self.pipeline = Pipeline(
            [
                (
                    "tfidf",
                    TfidfVectorizer(
                        ngram_range=(1, 2), stop_words=self.params["stop_words"]
                    ),
                ),
                ("clf", MultiOutputClassifier(base_estimator, n_jobs=-1)),
            ]
        )
        self.model = self.pipeline
        logger.info("Scikit-learn pipeline initialized.")

    def train(self, X_train, y_train) -> dict[str, Any]:
        """
        Train the model using Grid Search to find the best hyperparameters.

        Args:
            X_train: Input training data.
            y_train: True labels for training data.

        Returns:
            dict[str, Any]: Best hyperparameters found by the grid search.
        """
        if self.model is None:
            raise ValueError(
                "Model pipeline is not initialized. Call setup_model() before training."
            )

        logger.info(f"Starting training for: {self.language.upper()}")
        logger.info("Performing Grid Search for best hyperparameters...")

        grid_search = GridSearchCV(
            self.pipeline,
            param_grid=self.grid_params,
            cv=self.params["cv_folds"],
            scoring="f1_weighted",
            n_jobs=-1,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)

        logger.success(f"Best params found: {grid_search.best_params_}")

        parameters_to_log = {
            "max_features": grid_search.best_params_["tfidf__max_features"],
            "n_estimators": grid_search.best_params_["clf__estimator__n_estimators"],
            "max_depth": grid_search.best_params_["clf__estimator__max_depth"],
        }

        self.model = grid_search.best_estimator_
        logger.success(f"Training for {self.language.upper()} completed.")
        return parameters_to_log

    def evaluate(self, X_test, y_test) -> dict[str, Any]:
        """
        Evaluate model on test data and return metrics.

        Args:
            X_test: Input test data.
            y_test: True labels for test data.

        Returns:
            dict[str, Any]: Accuracy, precision, recall, and F1 score on the test set.
        """
        y_pred = self.predict(X_test)

        report = classification_report(y_test, y_pred, zero_division=0)
        print("\n" + "=" * 50)
        print("CLASSIFICATION REPORT")
        print(report)
        print("=" * 50 + "\n")

        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
            "recall": recall_score(y_test, y_pred, average="macro", zero_division=0),
            "f1_score": f1_score(y_test, y_pred, average="weighted", zero_division=0),
        }

        logger.info(
            f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}"
        )
        return metrics

    def predict(self, X) -> ndarray:
        """
        Make predictions using the trained model.

        Args:
            X: Input data for prediction.

        Returns:
            Predictions made by the model.
        """
        if self.model is None:
            raise ValueError(
                "Model is not trained. Call train() or load() before prediction."
            )
        return self.model.predict(X)