import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error , r2_score , mean_absolute_error
from math import cos, asin, sqrt, pi
def distance(lat_1, lat_2, lon_1, lon_2, radius_km=6371.0):
    """Return the great-circle (haversine) distance between two points.

    Works element-wise, so scalars, NumPy arrays and pandas Series are all
    accepted as long as the four coordinate arguments broadcast together.

    Args:
        lat_1: Latitude of the first point(s), in degrees.
        lat_2: Latitude of the second point(s), in degrees.
        lon_1: Longitude of the first point(s), in degrees.
        lon_2: Longitude of the second point(s), in degrees.
        radius_km: Sphere radius; the default is the mean Earth radius in
            kilometres, so the result is in kilometres.

    Returns:
        The distance(s) in the same unit as ``radius_km``.
    """
    lat_1, lat_2, lon_1, lon_2 = (
        np.radians(angle) for angle in (lat_1, lat_2, lon_1, lon_2)
    )
    diff_lat = lat_2 - lat_1
    diff_lon = lon_2 - lon_1
    haversine = (
        np.sin(diff_lat / 2.0) ** 2
        + np.cos(lat_1) * np.cos(lat_2) * np.sin(diff_lon / 2.0) ** 2
    )
    # Rounding can push the term marginally outside [0, 1] for (near-)
    # antipodal points, which would make sqrt/arcsin return NaN.
    haversine = np.clip(haversine, 0.0, 1.0)
    return 2 * radius_km * np.arcsin(np.sqrt(haversine))
# NOTE(review): `df` is not created anywhere in this section; it is assumed
# to have been loaded earlier with pickup/dropoff coordinate columns,
# `fare_amount` and `passenger_count` - confirm against the loading code.

# Trip length in kilometres, computed row-wise from the coordinate columns.
df['distance'] = distance(
    df['pickup_latitude'],
    df['dropoff_latitude'],
    df['pickup_longitude'],
    df['dropoff_longitude'],
)
# A bare `df.head()` only renders as the last expression of a notebook cell;
# print it so the preview is visible when run as a script too.
print(df.head())

# Keep rows with 0 < fare < 100, 0 < passengers < 6 and distance < 5 km.
# Rows with NaN in any of these columns fail the comparisons and are dropped.
df = df[
    (df.fare_amount > 0) & (df.fare_amount < 100)
    & (df.passenger_count > 0) & (df.passenger_count < 6)
    & (df.distance < 5)
]
# Visualise the value range of every non-object column, one boxplot each.
for col in df.select_dtypes(exclude=['object']).columns:
    sns.boxplot(data=df, x=col)
    plt.title(f"Boxplot of {col}")
    # show() stays inside the loop so each column gets its own figure
    # instead of all boxplots being drawn onto the same axes.
    plt.show()

# NOTE(review): 'year', 'month', 'day_of_week' and 'hour' are not derived in
# this section; presumably they are extracted from a pickup timestamp
# earlier - confirm they exist before this point.
corr = df[
    ['fare_amount', 'passenger_count', 'distance',
     'year', 'month', 'day_of_week', 'hour']
].corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()
# Feature matrix and regression target.
# NOTE(review): 'hour' and 'day_of_week' are not created in this section -
# confirm they are derived before this point.
x = df[['distance', 'passenger_count', 'hour', 'day_of_week']]
y = df['fare_amount']

# 80/20 split; the fixed seed keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Baseline: ordinary least-squares linear regression.
model_lr = LinearRegression()
model_lr.fit(x_train, y_train)
y_pred_lr = model_lr.predict(x_test)

# Random forest with 100 trees, seeded for reproducibility.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(x_train, y_train)
y_pred_rf = model_rf.predict(x_test)
def evaluate(y_true, y_pred, model):
    """Print R², MAE and RMSE for a set of regression predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model: Label printed in the report header (e.g. the model's name).

    Returns:
        None. The metrics are written to standard output only.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the MSE, so it is in the target's units.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{model} Results : ")
    print(f" R² Score : {r2:.4f}")
    print(f" MAE : {mae:.4f}")
    print(f" RMSE : {rmse:.4f}\n")
# Report both models against the same held-out test targets.
evaluate(y_test, y_pred_lr, "Linear Regression")
evaluate(y_test, y_pred_rf, "Random Forest Regression")
