import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
# Load the Uber fares data set.
# Source: https://www.kaggle.com/datasets/yasserh/uber-fares-dataset
df = pd.read_csv('uber.csv')
df.info()

# Notebook-style inspection expressions: they display a value when evaluated
# as the last line of a notebook cell and have no effect in a plain script.
df.shape
df.head()
df.isnull()

# Impute the dropoff coordinates BEFORE the blanket zero fill below, otherwise
# the zero fill leaves nothing for the mean/median to replace and missing
# coordinates silently become 0.0.  Plain assignment is used instead of a
# chained ``inplace=True`` call, which pandas deprecates and which does not
# write back to the frame under Copy-on-Write.
df['dropoff_latitude'] = df['dropoff_latitude'].fillna(
    df['dropoff_latitude'].mean()
)
df['dropoff_longitude'] = df['dropoff_longitude'].fillna(
    df['dropoff_longitude'].median()
)

# Any value still missing in the remaining columns is replaced by 0.
df.fillna(0, inplace=True)
df.head()

# Safety net: drop every column that still contains a missing value.  After
# the fills above this is expected to remove nothing.
df.dropna(axis=1, inplace=True)
df.head()
df.isnull().sum()
df.dtypes

# Parse the pickup timestamp so its calendar components can be extracted.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df.dtypes

# Expand the timestamp into separate features, then drop the raw column.
pickup = df['pickup_datetime'].dt
df = df.assign(
    hour=pickup.hour,
    day=pickup.day,
    month=pickup.month,
    year=pickup.year,
    dayofweek=pickup.dayofweek,
)
df

df = df.drop(columns=['pickup_datetime'])
df
from math import *
def distance_formula(longitude1, latitude1, longitude2, latitude2,
                     radius_km=6371.0):
    """Return the haversine (great-circle) distance for each pair of points.

    Args:
        longitude1: Sequence of start-point longitudes, in degrees.
        latitude1: Sequence of start-point latitudes, in degrees.
        longitude2: Sequence of end-point longitudes, in degrees.
        latitude2: Sequence of end-point latitudes, in degrees.
        radius_km: Radius of the sphere the distance is measured on.  The
            default of 6371 is the radius of the Earth in kilometres, which
            is the constant the function has always used.

    Returns:
        A list of floats, one distance per input position, expressed in the
        same unit as ``radius_km``.  All four sequences are expected to have
        the same length; empty inputs yield an empty list.
    """
    # Convert every input to a float array in radians in one go so the whole
    # computation runs vectorised instead of one Python iteration per row.
    lon1, lat1, lon2, lat2 = (
        np.radians(np.asarray(values, dtype=float))
        for values in (longitude1, latitude1, longitude2, latitude2)
    )

    haversine = (
        np.sin((lat2 - lat1) / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    )
    # Rounding can push the term marginally outside [0, 1] (e.g. for nearly
    # antipodal points), which is outside the domain of arcsin; clamp it.
    central_angle = 2 * np.arcsin(np.sqrt(np.clip(haversine, 0.0, 1.0)))

    # Return a plain list, as the loop-based implementation did.
    return (central_angle * radius_km).tolist()
# Add the trip length in kilometres, computed from the pickup and dropoff
# coordinates of every row.
df['dist_travel_km'] = distance_formula(
    df.pickup_longitude.to_numpy(),
    df.pickup_latitude.to_numpy(),
    df.dropoff_longitude.to_numpy(),
    df.dropoff_latitude.to_numpy(),
)

# One box plot per column, arranged on a 7x2 grid, to eyeball outliers.
df.plot(kind="box", subplots=True, layout=(7, 2), figsize=(15, 20))
plt.show()
def remove_outlier(df1, col):
    """Clip the values of one column to its 1.5 * IQR whisker range.

    Despite the name, no rows are removed: values below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR`` are replaced by the respective bound.

    Args:
        df1: DataFrame holding the column.  It is modified in place.
        col: Label of the column to clip.

    Returns:
        The same ``df1`` object, with ``col`` clipped.
    """
    q1 = df1[col].quantile(0.25)
    q3 = df1[col].quantile(0.75)
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    # Write back to the frame that was passed in, not to a module-level one.
    df1[col] = np.clip(df1[col], lower_whisker, upper_whisker)
    return df1
def treat_outliers_all(df1, col_list):
    """Apply ``remove_outlier`` to every column listed in ``col_list``.

    Args:
        df1: DataFrame to process.
        col_list: Iterable of column labels to clip, processed in order.

    Returns:
        The frame returned by the last ``remove_outlier`` call, or ``df1``
        itself when ``col_list`` is empty.
    """
    for c in col_list:
        # Thread the argument through, rather than a module-level frame, so
        # the function works on whatever DataFrame the caller passes in.
        df1 = remove_outlier(df1, c)
    return df1
# NOTE(review): treat_outliers_all() is defined above but never called in the
# visible code, so this box plot shows the same data as the previous one.
# Confirm whether an outlier-treatment step is missing here.
df.plot(kind="box", subplots=True, layout=(7, 2), figsize=(15, 20))
plt.show()

# Correlate numeric columns only: recent pandas versions raise on non-numeric
# columns instead of silently skipping them.
corr = df.select_dtypes(include=["number", "bool"]).corr()
corr

# Reuse the matrix computed above rather than recomputing it for the plot.
fig, axis = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, ax=axis)
plt.show()  # without this the heatmap is never displayed when run as a script

# Features and target for the fare models.
df_x = df[[
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'hour',
    'day',
    'month',
    'year',
    'dayofweek',
    'dist_travel_km',
]]
df_y = df['fare_amount']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.2, random_state=1
)
df
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split and predict the
# fares of the held-out rows.
reg = LinearRegression()
reg.fit(x_train, y_train)

y_pred_lin = reg.predict(x_test)
print(y_pred_lin)
from sklearn.ensemble import RandomForestRegressor

# n_estimators is the number of trees to build before making the prediction.
rf = RandomForestRegressor(n_estimators=100)
rf.fit(x_train, y_train)

y_pred_rf = rf.predict(x_test)
print(y_pred_rf)
from sklearn import metrics
from sklearn.metrics import r2_score

# Comparison table: one row per model with its RMSE and R-squared on the
# held-out split.
cols = ['Model', 'RMSE', 'R-Squared']
result_tabulation = pd.DataFrame(columns=cols)

# Linear regression scores.
reg_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lin))
reg_squared = r2_score(y_test, y_pred_lin)
full_metrics = pd.Series(
    {'Model': "Linear Regression", 'RMSE': reg_RMSE, 'R-Squared': reg_squared}
)
# DataFrame.append was removed in pandas 2.0; concatenating the Series as a
# one-row frame produces the same table on old and new pandas versions.
result_tabulation = pd.concat(
    [result_tabulation, full_metrics.to_frame().T], ignore_index=True
)
result_tabulation

# Random forest scores.
rf_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rf))
rf_squared = r2_score(y_test, y_pred_rf)
full_metrics = pd.Series(
    {'Model': "Random Forest ", 'RMSE': rf_RMSE, 'R-Squared': rf_squared}
)
result_tabulation = pd.concat(
    [result_tabulation, full_metrics.to_frame().T], ignore_index=True
)
result_tabulation