import pandas as pd

url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)
df.head()
df["genre"].unique()
engineered_features = ['dating', 'violence', 'world/life', 'night/time', 'shake the audience',
                       'family/gospel', 'romantic', 'communication', 'obscene', 'music',
                       'movement/places', 'light/visual perceptions', 'family/spiritual',
                       'like/girls', 'sadness', 'feelings', 'danceability', 'loudness',
                       'acousticness', 'instrumentalness', 'valence', 'energy', 'genre']
df_ef = df[engineered_features]
df_lyrics = df[['lyrics', 'genre']]
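Before modeling, it's worth looking at how balanced the genres are; a quick check like the one sketched below (hypothetical, not part of the original run) is what motivates the SMOTE oversampling used further down.

# Hypothetical check: songs per genre; an imbalanced distribution here
# motivates the SMOTE oversampling applied later
print(df["genre"].value_counts())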
from sklearn.preprocessing import LabelEncoder, StandardScaler

le = LabelEncoder()

def prepare_df(df):
    # Integer-encode the genre labels
    y = le.fit_transform(df["genre"])
    X = df.drop(columns=["genre"])

    # Use StandardScaler for more stable training
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    return X, y
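A minimal usage sketch for prepare_df (hypothetical; assumes the cells above have run):

# Hypothetical usage: standardized features and integer-coded labels
X_ef, y_ef = prepare_df(df_ef)
print(X_ef.shape)      # (n_songs, 22) -- 22 engineered features after dropping genre
print(le.classes_)     # genre names, in the order of their integer codes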
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE

# Load the data
url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

# Select only the engineered features
engineered_features = ['dating', 'violence', 'world/life', 'night/time', 'shake the audience', 'family/gospel', 
                       'romantic', 'communication', 'obscene', 'music', 'movement/places', 
                       'light/visual perceptions', 'family/spiritual', 'like/girls', 'sadness', 
                       'feelings', 'danceability', 'loudness', 'acousticness', 'instrumentalness', 
                       'valence', 'energy', 'genre']
df_ef = df[engineered_features]

# Encode the target labels
le = LabelEncoder()
df_ef.loc[:, "genre"] = le.fit_transform(df_ef["genre"])
# Separate features and labels
X = df_ef.drop(columns=["genre"]).values
y = df_ef["genre"].values

# Scale features to [0, 1]; bounded inputs give more consistent gradients
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Oversample the minority classes
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test, dtype=torch.long)

# Create DataLoader for training and validation sets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)
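A quick sanity check on the loaders (hypothetical, assuming the cell runs to this point): each full training batch should yield 256 rows of 22 features and 256 integer labels.

# Hypothetical sanity check: inspect one batch from the training loader
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)  # expected: torch.Size([256, 22]) torch.Size([256])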

# Define the DNN model with more aggressive regularization
class DeepDNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(DeepDNN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
input_size = X_train.shape[1]
num_classes = len(le.classes_)
model = DeepDNN(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.002, weight_decay=1e-4)
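As a rough capacity check (hypothetical, not in the original), the trainable parameter count of the network above can be computed directly:

# Hypothetical check: total trainable parameters in DeepDNN
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{n_params:,} trainable parameters")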

# Cosine-anneal the learning rate; note that T_max=30 with up to 100 epochs
# means the schedule reaches its minimum at epoch 30 and then climbs back up
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)

# Training loop with early stopping
num_epochs = 100
best_val_acc = 0
patience = 10
early_stop_counter = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    val_acc = 100 * correct / total
    scheduler.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, "
          f"Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {val_acc:.2f}%")

    # Early stopping on validation accuracy (note: only the accuracy is tracked;
    # saving model.state_dict() here would let us restore the best weights)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        early_stop_counter = 0
        print(f"New best validation accuracy: {val_acc:.2f}%")
    else:
        early_stop_counter += 1

    if early_stop_counter >= patience:
        print("Early stopping triggered.")
        break

print(f"Best Validation Accuracy: {best_val_acc:.2f}%")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 35
     33 # Oversample the minority classes
     34 smote = SMOTE(random_state=42)
---> 35 X, y = smote.fit_resample(X, y)
     37 # Train-test split
     38 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

File ~/miniforge3/envs/pytorch/lib/python3.12/site-packages/imblearn/base.py:202, in BaseSampler.fit_resample(self, X, y, **params)
    181 def fit_resample(self, X, y, **params):
    182     """Resample the dataset.
    183 
    184     Parameters
   (...)
    200         The corresponding label of `X_resampled`.
    201     """
--> 202     return super().fit_resample(X, y, **params)

File ~/miniforge3/envs/pytorch/lib/python3.12/site-packages/sklearn/base.py:1389, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1382     estimator._validate_params()
   1384 with config_context(
   1385     skip_parameter_validation=(
   1386         prefer_skip_nested_validation or global_skip_validation
   1387     )
   1388 ):
-> 1389     return fit_method(estimator, *args, **kwargs)

File ~/miniforge3/envs/pytorch/lib/python3.12/site-packages/imblearn/base.py:97, in SamplerMixin.fit_resample(self, X, y, **params)
     72 @_fit_context(prefer_skip_nested_validation=True)
     73 def fit_resample(self, X, y, **params):
     74     """Resample the dataset.
     75 
     76     Parameters
   (...)
     95         The corresponding label of `X_resampled`.
     96     """
---> 97     check_classification_targets(y)
     98     arrays_transformer = ArraysTransformer(X, y)
     99     X, y, binarize_y = self._check_X_y(X, y)

File ~/miniforge3/envs/pytorch/lib/python3.12/site-packages/sklearn/utils/multiclass.py:222, in check_classification_targets(y)
    214 y_type = type_of_target(y, input_name="y")
    215 if y_type not in [
    216     "binary",
    217     "multiclass",
   (...)
    220     "multilabel-sequences",
    221 ]:
--> 222     raise ValueError(
    223         f"Unknown label type: {y_type}. Maybe you are trying to fit a "
    224         "classifier, which expects discrete classes on a "
    225         "regression target with continuous values."
    226     )

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.
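The likely culprit (my diagnosis, not stated in the original): assigning the encoded labels back into the DataFrame with df_ef.loc[:, "genre"] = le.fit_transform(...) keeps the column's object dtype, so y = df_ef["genre"].values is an object array and scikit-learn's type_of_target reports "unknown". A minimal fix sketch, keeping the labels as an integer array from the start:

# Sketch of a fix (assumes df, engineered_features, le, MinMaxScaler, and SMOTE
# from the failing cell): encode the labels straight to an int array instead of
# round-tripping them through the object-dtype "genre" column.
y = le.fit_transform(df["genre"])                         # int64 multiclass target
X = df[engineered_features].drop(columns=["genre"]).values

X = MinMaxScaler().fit_transform(X)
X, y = SMOTE(random_state=42).fit_resample(X, y)          # now resamples cleanly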
# Debugging: inspect the label encoding and the class balance
num_classes = len(le.classes_)
print("Number of classes:", num_classes)
print(pd.Series(y_train).value_counts())
!pip install imblearn
Requirement already satisfied: imblearn in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (0.0)
Requirement already satisfied: imbalanced-learn in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imblearn) (0.13.0)
Requirement already satisfied: numpy<3,>=1.24.3 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (2.2.4)
Requirement already satisfied: scipy<2,>=1.10.1 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (1.15.2)
Requirement already satisfied: scikit-learn<2,>=1.3.2 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (1.6.1)
Requirement already satisfied: sklearn-compat<1,>=0.1 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (0.1.3)
Requirement already satisfied: joblib<2,>=1.1.1 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (1.4.2)
Requirement already satisfied: threadpoolctl<4,>=2.0.0 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (3.6.0)
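Since imblearn is already installed, the traceback above is not a dependency problem; it points back to the object-dtype label array, and the integer-encoding fix sketched after the traceback should let SMOTE run.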