# Uber fare prediction

import pandas as pd

import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error , r2_score , mean_absolute_error

from math import cos, asin, sqrt, pi

def distance(lat_1, lat_2, lon_1, lon_2):
    """Return the great-circle (haversine) distance in kilometres.

    Note the argument order: both latitudes come first, then both
    longitudes. All four inputs are in degrees and may be scalars or
    array-likes (e.g. pandas Series); the computation is vectorised with
    NumPy, so the result has the same shape as the inputs.

    Args:
        lat_1: Latitude of the first point, in degrees.
        lat_2: Latitude of the second point, in degrees.
        lon_1: Longitude of the first point, in degrees.
        lon_2: Longitude of the second point, in degrees.

    Returns:
        Distance between the two points in kilometres.
    """
    earth_radius_km = 6371  # mean Earth radius used by the haversine formula

    # The trigonometric functions below expect radians, not degrees.
    lat_1, lat_2, lon_1, lon_2 = map(np.radians, [lat_1, lat_2, lon_1, lon_2])

    diff_lat = lat_2 - lat_1
    diff_lon = lon_2 - lon_1

    # Haversine term: squared half-chord length between the two points.
    haversine = (
        np.sin(diff_lat / 2.0) ** 2
        + np.cos(lat_1) * np.cos(lat_2) * np.sin(diff_lon / 2.0) ** 2
    )

    km = 2 * earth_radius_km * np.arcsin(np.sqrt(haversine))
    return km

# Add the pickup-to-dropoff trip distance (km) as a new feature column.
# NOTE(review): `df` is not defined in the visible part of this file --
# presumably it is loaded into a pandas DataFrame earlier; confirm.
df['distance'] = distance(
    df['pickup_latitude'],
    df['dropoff_latitude'],
    df['pickup_longitude'],
    df['dropoff_longitude'],
)

df.head()

# Row filter: keep rides whose fare, passenger count and trip distance all
# fall strictly inside the accepted ranges. Rows with NaN in any of these
# columns compare False and are therefore dropped as well.
keep = df.fare_amount.gt(0) & df.fare_amount.lt(100)
keep &= df.passenger_count.gt(0) & df.passenger_count.lt(6)
keep &= df.distance.lt(5)

df = df.loc[keep]

# Draw one boxplot per non-object column to inspect value ranges and
# spot outliers. Iterating the selected frame yields its column names.
for col in df.select_dtypes(exclude=['object']):
    sns.boxplot(data=df, x=col)
    plt.title(f"Boxplot of {col}")
    plt.show()


# Pairwise correlation between the target and the candidate features.
# NOTE(review): 'year', 'month', 'day_of_week' and 'hour' are not created
# anywhere in the visible part of this file -- presumably they are derived
# from a pickup timestamp earlier; confirm they exist before this runs.
corr = df[
    ['fare_amount', 'passenger_count', 'distance',
     'year', 'month', 'day_of_week', 'hour']
].corr()

sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# Feature matrix and regression target.
x = df[['distance', 'passenger_count', 'hour', 'day_of_week']]
y = df['fare_amount']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Baseline: ordinary least-squares linear regression.
model_lr = LinearRegression()
model_lr.fit(x_train, y_train)
y_pred_lr = model_lr.predict(x_test)

# Comparison model: random forest with 100 trees (seeded for repeatability).
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(x_train, y_train)
y_pred_rf = model_rf.predict(x_test)

def evaluate(y_true, y_pred, model):
    """Print R², MAE and RMSE for a set of regression predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model: Label printed as the heading of the report (e.g. the
            model's display name).

    Returns:
        None. The metrics are written to standard output only.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    print(f"{model} Results : ")
    print(f"    R² Score : {r2:.4f}")
    print(f"    MAE : {mae:.4f}")
    print(f"    RMSE : {rmse:.4f}\n")

# Report hold-out metrics for both fitted models.
evaluate(y_test, y_pred_lr, "Linear Regression")
evaluate(y_test, y_pred_rf, "Random Forest Regression")