import os
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
from torch.utils.tensorboard import SummaryWriter
from tabulate import tabulate

# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s", 
    level=logging.INFO
)

# Define paths
model_path = "model/logistic_model.joblib"
vectorizer_path = "model/vectorizer.joblib"
log_dir = "logs"
metrics_log_path = os.path.join(log_dir, "metrics_log.txt")

# Create directories if they don't exist
os.makedirs("model", exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

# Initialize TensorBoard writer
writer = SummaryWriter(log_dir=log_dir)
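# Scalars logged through this writer can be viewed with: tensorboard --logdir logs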

# Step 1: Load and preprocess dataset
file_path = "dataset/rwandan_names.csv"
logging.info("Loading dataset from file...")
data = pd.read_csv(file_path)
logging.info("Dataset loaded. Checking for missing values...")

# Drop rows with NaN values in 'name' or 'gender' columns
data = data.dropna(subset=['name', 'gender'])
logging.info(f"Dataset loaded with {len(data)} records after dropping NaNs.")

# Step 2: Split dataset into training, validation, and test sets
logging.info("Splitting dataset into training, validation, and test sets...")
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
logging.info(f"Training set size: {len(train_data)}")
logging.info(f"Validation set size: {len(val_data)}")
logging.info(f"Test set size: {len(test_data)}")

# Step 3: Initialize vectorizer and transform names
logging.info("Initializing vectorizer and transforming training data...")
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))
X_train = vectorizer.fit_transform(train_data['name'])
logging.info("Training data transformed.")

logging.info("Transforming validation and test data...")
X_val = vectorizer.transform(val_data['name'])
X_test = vectorizer.transform(test_data['name'])
logging.info("Validation and test data transformation complete.")

# Extract target labels (scikit-learn estimators accept string labels directly)
y_train = train_data['gender']
y_val = val_data['gender']
y_test = test_data['gender']

# Step 4: Initialize and train the model
logging.info("Initializing and training the Logistic Regression model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
logging.info("Model training complete.")

# Store metrics for final table display
metrics_summary = []

# Define a function to calculate and log metrics
def calculate_metrics(y_true, y_pred, dataset_type="Validation", step=0):
    accuracy = accuracy_score(y_true, y_pred)
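    # pos_label="female" assumes the gender column uses lowercase
    # "female"/"male" string labels; adjust if the dataset encodes them differently.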
    precision = precision_score(y_true, y_pred, pos_label="female", average="binary")
    recall = recall_score(y_true, y_pred, pos_label="female", average="binary")
    f1 = f1_score(y_true, y_pred, pos_label="female", average="binary")
    
    # Log to TensorBoard with a specified step
    writer.add_scalar(f"{dataset_type}/Accuracy", accuracy, step)
    writer.add_scalar(f"{dataset_type}/Precision", precision, step)
    writer.add_scalar(f"{dataset_type}/Recall", recall, step)
    writer.add_scalar(f"{dataset_type}/F1-score", f1, step)

    # Append metrics to summary list for table display
    metrics_summary.append([dataset_type, accuracy, precision, recall, f1])

    # Write metrics to the log file and echo them to the console
    with open(metrics_log_path, "a") as log_file:
        log_file.write(f"\n{dataset_type} Metrics:\n")
        log_file.write(f"Accuracy: {accuracy:.4f}\nPrecision: {precision:.4f}\nRecall: {recall:.4f}\nF1-score: {f1:.4f}\n")
    
    logging.info(f"{dataset_type} Metrics - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Step 5: Calculate and log metrics for validation and test sets
logging.info("Calculating metrics for validation set...")
y_val_pred = model.predict(X_val)
calculate_metrics(y_val, y_val_pred, dataset_type="Validation", step=1)

logging.info("Calculating metrics for test set...")
y_test_pred = model.predict(X_test)
calculate_metrics(y_test, y_test_pred, dataset_type="Test", step=2)

# Step 6: Save model and vectorizer
logging.info("Saving model and vectorizer to disk...")
joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)
logging.info(f"Model saved to {model_path}")
logging.info(f"Vectorizer saved to {vectorizer_path}")
logging.info(f"Metrics logged to {metrics_log_path}")

# Print summary table
print("\nFinal Metrics Summary:")
print(tabulate(metrics_summary, headers=["Dataset", "Accuracy", "Precision", "Recall", "F1-Score"], floatfmt=".4f"))

# Close TensorBoard writer
writer.close()
logging.info("Training and logging completed.")