```
# Column groups. NOTE: these names must be adjusted to match the exact
# CICIDS 2017 headers of the CSV that df_sample was loaded from.
metadata_cols = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
port_cols = ['Src Port', 'Dst Port']

# Features: every column except the identifying metadata and the target.
# The two lists are concatenated into one list of names to drop;
# errors='ignore' lets the drop succeed when a listed column is absent.
X_raw = df_sample.drop(columns=metadata_cols + ['Label'], errors='ignore')

# Target: the raw class label column (integer-encoded further below).
y_raw = df_sample['Label']
# Turn +/-inf into NaN so that a single imputation step can later handle
# both kinds of unusable value. replace() returns a new frame, which is
# rebound to X_raw.
X_raw = X_raw.replace([np.inf, -np.inf], np.nan)

# Collapse each port column into a binary flag: 1 for a well-known port
# (< 1024), 0 otherwise. A NaN port compares False and therefore maps to 0.
# Columns listed in port_cols but absent from the frame are skipped.
for col in port_cols:
    if col in X_raw.columns:
        X_raw[col] = (X_raw[col] < 1024).astype(int)
# Port columns that are really present in the feature frame. The drop and
# the port-flag loop above both tolerate missing columns; selecting a
# non-existent column in a ColumnTransformer would instead fail at fit time,
# so only the columns that exist are handed to it.
available_port_cols = [col for col in port_cols if col in X_raw.columns]

# Continuous features: every numeric column that is not a port flag.
continuous_numeric_cols = [
    col for col in X_raw.columns
    if col not in port_cols and pd.api.types.is_numeric_dtype(X_raw[col])
]

# Continuous features get median imputation (fills the NaNs, including the
# ones that replaced +/-inf) followed by RobustScaler.
numeric_pipeline = Pipeline([
    ('fill_missing', SimpleImputer(strategy='median')),
    ('smart_scaling', RobustScaler()),
])

# Build the simplified preprocessor:
#   'num'   -> impute + scale the continuous columns
#   'ports' -> pass the already-binary port flags through unchanged
# remainder='drop' discards every other column (e.g. non-numeric ones).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, continuous_numeric_cols),
        ('ports', 'passthrough', available_port_cols),
    ],
    remainder='drop',
)
# Hold out the validation fold BEFORE oversampling. SMOTE synthesises new
# rows by interpolating between neighbouring real rows; running it on the
# full data set first would let rows that end up in the CV fold shape the
# synthetic training rows (leakage) and would fill the CV fold itself with
# synthetic samples. stratify keeps every class represented in both folds,
# which matters because the classes are not balanced before SMOTE runs.
X_train, X_cv, y_train_raw, y_cv_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=42, stratify=y_raw
)

# SMOTE cannot handle NaN, and X_raw contains NaN wherever +/-inf was
# replaced. Fill the training fold with medians computed from the training
# fold only, so no statistic of the CV fold is used.
# NOTE(review): X_cv is left untouched and may still contain NaN; this
# assumes it is imputed later by `preprocessor` - confirm downstream.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)

# Oversample the minority classes of the training fold only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train_raw)

# The balanced training fold is what the model is trained on; the CV fold
# keeps its real, imbalanced class distribution.
X_train, y_train_raw = X_resampled, y_resampled
# Encode the target labels (XGBoost needs integers).
label_encoder = LabelEncoder()

# Learn the label -> integer mapping from the training labels only.
y_train = label_encoder.fit_transform(y_train_raw)

# Reuse the training mapping for the validation labels. Calling
# fit_transform here would refit the encoder on the CV labels: the integer
# codes could then differ from the training codes whenever the two folds do
# not contain exactly the same classes, and label_encoder would be left
# fitted on CV. transform() raises ValueError on a label that was not seen
# during fit, which surfaces such a mismatch instead of hiding it.
y_cv = label_encoder.transform(y_cv_raw)
```