import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

le = LabelEncoder()

def prepare_df(df):
    y = df["genre"]
    y = le.fit_transform(y)
    X = df.drop(columns=["genre"])
    # Use StandardScaler for more stable training
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    return X, y
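A quick usage sketch, assuming prepare_df receives a frame whose columns are all numeric features plus a genre column (the three-column subset below is illustrative, not the full feature list used later):

url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
demo = pd.read_csv(url)[["danceability", "energy", "valence", "genre"]]
X, y = prepare_df(demo)
print(X.shape, y[:5])  # standardized features, integer-encoded labels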
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE

# Load the data
url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

# Select only the engineered features
engineered_features = ['dating', 'violence', 'world/life', 'night/time',
                       'shake the audience', 'family/gospel', 'romantic',
                       'communication', 'obscene', 'music', 'movement/places',
                       'light/visual perceptions', 'family/spiritual',
                       'like/girls', 'sadness', 'feelings', 'danceability',
                       'loudness', 'acousticness', 'instrumentalness',
                       'valence', 'energy', 'genre']
df_ef = df[engineered_features]

# Encode the target labels
le = LabelEncoder()
df_ef.loc[:, "genre"] = le.fit_transform(df_ef["genre"])

# Separate features and labels
X = df_ef.drop(columns=["genre"]).values
y = df_ef["genre"].values

# Use MinMax scaling for more consistent gradients
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Oversample the minority classes
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test, dtype=torch.long)

# Create DataLoader for training and validation sets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

# Define the DNN model with more aggressive regularization
class DeepDNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(DeepDNN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
input_size = X_train.shape[1]
num_classes = len(le.classes_)
model = DeepDNN(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.002, weight_decay=1e-4)

# Use a more aggressive learning rate scheduler
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)

# Training loop with early stopping
num_epochs = 100
best_val_acc = 0
patience = 10
early_stop_counter = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()
    val_acc = 100 * correct / total
    scheduler.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, "
          f"Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {val_acc:.2f}%")

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        early_stop_counter = 0
        print(f"New best validation accuracy: {val_acc:.2f}%")
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break

print(f"Best Validation Accuracy: {best_val_acc:.2f}%")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 35
     33 # Oversample the minority classes
     34 smote = SMOTE(random_state=42)
---> 35 X, y = smote.fit_resample(X, y)
     37 # Train-test split
     38 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

File ~/miniforge3/envs/pytorch/lib/python3.12/site-packages/imblearn/base.py:202, in BaseSampler.fit_resample(self, X, y, **params)
    181 def fit_resample(self, X, y, **params):
    182     """Resample the dataset.
    183
    184     Parameters
    (...)
    200         The corresponding label of `X_resampled`.
    201     """
--> 202     return super().fit_resample(X, y, **params)

File ~/miniforge3/envs/pytorch/lib/python3.12/site-packages/sklearn/base.py:1389, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1382 estimator._validate_params()
   1384 with config_context(
   1385     skip_parameter_validation=(
   1386         prefer_skip_nested_validation or global_skip_validation
   1387     )
   1388 ):
-> 1389     return fit_method(estimator, *args, **kwargs)

File ~/miniforge3/envs/pytorch/lib/python3.12/site-packages/imblearn/base.py:97, in SamplerMixin.fit_resample(self, X, y, **params)
     72 @_fit_context(prefer_skip_nested_validation=True)
     73 def fit_resample(self, X, y, **params):
     74     """Resample the dataset.
     75
     76     Parameters
     (...)
     95         The corresponding label of `X_resampled`.
     96     """
---> 97 check_classification_targets(y)
     98 arrays_transformer = ArraysTransformer(X, y)
     99 X, y, binarize_y = self._check_X_y(X, y)

File ~/miniforge3/envs/pytorch/lib/python3.12/site-packages/sklearn/utils/multiclass.py:222, in check_classification_targets(y)
    214 y_type = type_of_target(y, input_name="y")
    215 if y_type not in [
    216     "binary",
    217     "multiclass",
    (...)
    220     "multilabel-sequences",
    221 ]:
--> 222     raise ValueError(
    223         f"Unknown label type: {y_type}. Maybe you are trying to fit a "
    224         "classifier, which expects discrete classes on a "
    225         "regression target with continuous values."
    226     )

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.
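The message points at type_of_target(y) returning "unknown". A plausible root cause, given the cell above: assigning le.fit_transform(...) back into the genre column with df_ef.loc[:, "genre"] = ... leaves that column's dtype as object, so y = df_ef["genre"].values is an object array of integers, which sklearn classifies as "unknown" rather than "multiclass". A quick check to confirm, run on the same y that was passed to SMOTE:

from sklearn.utils.multiclass import type_of_target

print(pd.Series(y).dtype)  # object here would mean the labels round-tripped through an object column
print(type_of_target(y))   # SMOTE needs "binary" or "multiclass", not "unknown"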
num_classes = len(le.classes_)
print("Number of classes:", num_classes)
print(pd.Series(y_train).value_counts())
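If the check above does show an object dtype, a minimal fix sketch is to keep the encoded labels as a plain integer array instead of writing them back into the DataFrame (same names as the failing cell; only the encoding step changes):

from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Encode straight from the raw column; skip the df_ef.loc[:, "genre"] = ... write-back
le = LabelEncoder()
y = le.fit_transform(df_ef["genre"])   # int64 ndarray -> type_of_target is "multiclass"
X = df_ef.drop(columns=["genre"]).values

smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)        # should now resample without the ValueError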
!pip install imblearn
Requirement already satisfied: imblearn in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (0.0)
Requirement already satisfied: imbalanced-learn in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imblearn) (0.13.0)
Requirement already satisfied: numpy<3,>=1.24.3 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (2.2.4)
Requirement already satisfied: scipy<2,>=1.10.1 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (1.15.2)
Requirement already satisfied: scikit-learn<2,>=1.3.2 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (1.6.1)
Requirement already satisfied: sklearn-compat<1,>=0.1 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (0.1.3)
Requirement already satisfied: joblib<2,>=1.1.1 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (1.4.2)
Requirement already satisfied: threadpoolctl<4,>=2.0.0 in /Users/yahyarahhawi/miniforge3/envs/pytorch/lib/python3.12/site-packages (from imbalanced-learn->imblearn) (3.6.0)
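As a sanity check that the interpreter actually sees these installs (the imblearn PyPI package is just a shim that pulls in imbalanced-learn), the versions can be printed directly:

import imblearn, sklearn
print(imblearn.__version__, sklearn.__version__)  # expect 0.13.0 and 1.6.1 per the output above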