Data Preprocessing (1)

Please follow the instructions below:

1) Import the required libraries
2) Load the dataset
3) Take care of the missing data (‘nan’ values): fill each missing entry with the mean of its column
4) Encode the categorical variables: ① encode the independent variable; ② encode the dependent variable
5) Split the data into a training set and a test set with parameters: test_size = 0.2, random_state = 0
6) Apply feature scaling

# Import the required libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset
# pd.read_csv already returns a DataFrame, so no extra wrapping is needed.
data = pd.read_csv('Data.csv')
print(f'{data}\n')
# Feature matrix: the first three columns; target vector: the last column.
X = data.iloc[:, 0:3].values
y = data.iloc[:, -1].values

# Split train set and test set with parameters: test_size = 0.2, random_state = 0
# The split is done first so that the imputer and the scaler further down
# can be fitted on the training rows alone.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Take care of the missing data ('nan' value): fill them with average value
# The imputer learns the column means from the training rows only
# (columns 1 and 2) and reuses those means for the test rows.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train[:, 1:3])
X_train[:, 1:3] = imputer.transform(X_train[:, 1:3])
X_test[:, 1:3] = imputer.transform(X_test[:, 1:3])
print(f'train_X:\n{X_train}\ntest_X:\n{X_test}\n')

# Encode the independent variable
# Column 0 is one-hot encoded; every other column is passed through unchanged.
# The transformer is fitted on the full feature matrix X and then applied to
# each split separately.
ohe = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
).fit(X)
X_train_encoded, X_test_encoded = ohe.transform(X_train), ohe.transform(X_test)
print(f'train_X_encoded:\n{X_train_encoded}\ntest_X_encoded:\n{X_test_encoded}\n')

# Encode the dependent variable
# The label encoder is fitted on the full target vector y, then each split
# is mapped to integer class labels.
le = LabelEncoder().fit(y)
y_train_encoded, y_test_encoded = le.transform(y_train), le.transform(y_test)
print(f'train_y_encoded:\n{y_train_encoded}\ntest_y_encoded:\n{y_test_encoded}\n')

# Apply feature scaling
# Work on copies: plain assignment would only alias the encoded matrices,
# and the in-place scaling below would then overwrite them as well.
X_train_final = X_train_encoded.copy()
X_test_final = X_test_encoded.copy()
stdscaler = StandardScaler()
# Only the two numeric columns are scaled. They are the 'passthrough'
# remainder of the ColumnTransformer and therefore sit at the right-hand end
# of the encoded matrix, after the one-hot columns; indexing from the end
# avoids hard-coding how many one-hot columns there are.
# The scaler is fitted on the training rows and reused for the test rows.
X_train_final[:, -2:] = stdscaler.fit_transform(X_train_encoded[:, -2:])
X_test_final[:, -2:] = stdscaler.transform(X_test_encoded[:, -2:])
print(X_train_final)
print(X_test_final)

Posted

in

Tags: