Python scripts¶
The following script was used to lighten the work done in the notebooks:
"""
# Project AutoOne_regression
Auxiliary functions for cleanning data and building regression models
- Erick Medeiros Anastácio
- 2020-06-28
- Python 3.7
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
import time
def clean_data(df):
"""Cleans the incoming data.
Args:
df (pands df): Incoming dat
Returns:
df: Clean data.
"""
df = df.copy()
df = df.replace(to_replace='?', value=np.NaN)
df.columns = df.columns.str.replace("-", "_")
df.drop('normalized_losses', axis=1, inplace=True)
df.dropna(inplace=True)
# fix numeric
for col in ['bore', 'stroke', 'horsepower', 'peak_rpm', 'price']:
df[col] = df[col].astype('float')
# lets replace drive_wheels 4wd with fwd
df['drive_wheels'].replace('4wd', 'fwd', inplace=True)
return df
def make_regressor(model_name, model, grid_params, data):
"""Trains and scores models using sklearn GridSearchCV.
Args:
model_name (str): the name of the model.
model (object): the sklearn instance of the model.
grid_params (dict): the grid parameters for the model.
Returns:
tuple: Contains the fitted model, the predictions on the test set and
a dict containing:
- 'r2': r2 score
- 'mse': mse score
- 'parameters': grid parameters for the best model
"""
start_time = time.time()
X_train_proc, y_train, X_test_proc, y_test = data
# the GridSearchCV
grid = GridSearchCV(
model, grid_params,
scoring='neg_mean_squared_error',
n_jobs=-1, cv=3
)
grid.fit(X_train_proc, y_train)
# get the best model & params
model = grid.best_estimator_
parameters = model.get_params()
# lets score the model on the test set
y_predictions = np.exp(model.predict(X_test_proc))
mse = metrics.mean_squared_error(np.exp(y_test), y_predictions)
r2 = model.score(X_test_proc, y_test)
end_time = time.time()
elapsed = end_time - start_time
message = f'Score r2: {r2:.4} \nScore MSE: {mse:.4} \nTime: {elapsed:.2}s'
print(model_name)
print(message)
print(parameters)
stats = {
'model name': model_name,
'r2': r2,
'mse': mse,
'parameters': parameters,
'time': elapsed
}
return model, y_predictions, stats