import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Uber fares dataset:
# https://www.kaggle.com/datasets/yasserh/uber-fares-dataset
df = pd.read_csv('uber.csv')
df.info()

# Bare expressions such as ``df.head()`` only render output in a
# notebook/REPL; they are kept so the exploratory flow is unchanged.
df.shape
df.head()
df.isnull()

# Replace every missing value in the frame with 0.
df.fillna(0, inplace=True)
df.head()

# Drop any column that still contains a missing value.
# NOTE(review): after the zero-fill above no NaN remains, so this step and
# the mean/median imputation below cannot change anything -- confirm which
# missing-value strategy is actually intended.
df.dropna(axis=1, inplace=True)
df.head()

df.isnull().sum()

# Assign the filled column back rather than calling ``fillna(inplace=True)``
# on ``df[col]``: the chained in-place form operates on a temporary object
# and is not guaranteed to update ``df``.
df['dropoff_latitude'] = df['dropoff_latitude'].fillna(
    df['dropoff_latitude'].mean()
)
df['dropoff_longitude'] = df['dropoff_longitude'].fillna(
    df['dropoff_longitude'].median()
)
df.dtypes

# Parse the pickup timestamp so its calendar components can be extracted.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df.dtypes

# Expand the timestamp into separate numeric feature columns.
pickup = df['pickup_datetime'].dt
df = df.assign(
    hour=pickup.hour,
    day=pickup.day,
    month=pickup.month,
    year=pickup.year,
    dayofweek=pickup.dayofweek,
)

# The raw timestamp column is dropped once its parts exist as columns.
df = df.drop(["pickup_datetime"], axis=1)
df

from math import *

def distance_formula(longitude1, latitude1, longitude2, latitude2):
    """Return haversine (great-circle) distances in kilometres.

    Each argument is a sequence of coordinates in degrees; position ``i``
    of the four sequences describes one trip from
    ``(longitude1[i], latitude1[i])`` to ``(longitude2[i], latitude2[i])``.
    The sequences are expected to have equal length.

    Args:
        longitude1: Start longitudes, in degrees.
        latitude1: Start latitudes, in degrees.
        longitude2: End longitudes, in degrees.
        latitude2: End latitudes, in degrees.

    Returns:
        A list with one distance (km) per trip, computed with an Earth
        radius of 6371 km. Empty input yields an empty list. Coordinates
        for which the haversine term is negative produce ``nan``.
    """
    earth_radius_km = 6371

    lon1 = np.radians(np.asarray(longitude1, dtype=float))
    lat1 = np.radians(np.asarray(latitude1, dtype=float))
    lon2 = np.radians(np.asarray(longitude2, dtype=float))
    lat2 = np.radians(np.asarray(latitude2, dtype=float))

    dist_lon = lon2 - lon1
    dist_lat = lat2 - lat1

    a = (
        np.sin(dist_lat / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin(dist_lon / 2) ** 2
    )
    # Floating-point rounding can push ``a`` marginally above 1 for
    # near-antipodal points, which would put sqrt(a) outside arcsin's domain.
    a = np.minimum(a, 1.0)

    travel_dist = 2 * np.arcsin(np.sqrt(a)) * earth_radius_km
    # Callers receive a plain list, one entry per trip.
    return travel_dist.tolist()

# Add the trip distance (km) computed from pickup/dropoff coordinates.
df['dist_travel_km'] = distance_formula(
    df.pickup_longitude.to_numpy(),
    df.pickup_latitude.to_numpy(),
    df.dropoff_longitude.to_numpy(),
    df.dropoff_latitude.to_numpy(),
)

# One box plot per column to inspect the value distributions.
df.plot(kind="box", subplots=True, layout=(7, 2), figsize=(15, 20))
plt.show()

def remove_outlier(df1, col):
    """Clip ``df1[col]`` to its 1.5 x IQR whiskers.

    Values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are replaced
    by the respective whisker; no rows are removed.

    Args:
        df1: DataFrame holding the column. It is modified in place.
        col: Label of the column to clip.

    Returns:
        The same ``df1`` object, with ``col`` clipped.
    """
    q1 = df1[col].quantile(0.25)
    q3 = df1[col].quantile(0.75)
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    # Write back to the frame that was passed in, not to a module-level one.
    df1[col] = df1[col].clip(lower=lower_whisker, upper=upper_whisker)
    return df1

def treat_outliers_all(df1, col_list):
    """Apply ``remove_outlier`` to every column listed in ``col_list``.

    Args:
        df1: DataFrame to process.
        col_list: Iterable of column labels to treat.

    Returns:
        The frame returned by the last ``remove_outlier`` call, or ``df1``
        unchanged when ``col_list`` is empty.
    """
    for c in col_list:
        # Operate on the frame passed in rather than a module-level one.
        df1 = remove_outlier(df1, c)
    return df1

# NOTE(review): treat_outliers_all is defined above but is not called
# anywhere in the visible code, so this plot shows the same data as the
# earlier box plot -- confirm whether a call was meant to precede it.
df.plot(kind="box", subplots=True, layout=(7, 2), figsize=(15, 20))
plt.show()

# Pairwise correlation of the columns; computed once and reused below.
corr = df.corr()
corr

fig, axis = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True)

# Predictor columns used by both models.
feature_columns = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'hour',
    'day',
    'month',
    'year',
    'dayofweek',
    'dist_travel_km',
]
df_x = df[feature_columns]
# Target: the fare to predict.
df_y = df['fare_amount']

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.2, random_state=1
)

from sklearn.linear_model import LinearRegression

# Initialize the linear regression model.
reg = LinearRegression()

# Train the model with our training data.
reg.fit(x_train, y_train)

# Predict fares for the held-out rows.
y_pred_lin = reg.predict(x_test)
print(y_pred_lin)

from sklearn.ensemble import RandomForestRegressor

# n_estimators is the number of trees built before making a prediction.
rf = RandomForestRegressor(n_estimators=100)
rf.fit(x_train, y_train)

# Predict fares for the held-out rows.
y_pred_rf = rf.predict(x_test)
print(y_pred_rf)

cols = ['Model', 'RMSE', 'R-Squared']

# Create an empty dataframe with these columns; one row is added per model.
result_tabulation = pd.DataFrame(columns=cols)

from sklearn import metrics
from sklearn.metrics import r2_score

# Root-mean-squared error and R-squared of the linear model on the test set.
reg_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lin))
reg_squared = r2_score(y_test, y_pred_lin)

full_metrics = pd.Series({
    'Model': "Linear Regression",
    'RMSE': reg_RMSE,
    'R-Squared': reg_squared,
})

# Add the metrics as a new row at the next integer index.
# DataFrame.append was removed in pandas 2.0, so the row is assigned via
# .loc; the Series labels are matched to the column names.
result_tabulation.loc[len(result_tabulation)] = full_metrics

# Show the result table (renders in a notebook/REPL).
result_tabulation

# Root-mean-squared error and R-squared of the random forest on the test set.
rf_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rf))
rf_squared = r2_score(y_test, y_pred_rf)

full_metrics = pd.Series({
    'Model': "Random Forest ",
    'RMSE': rf_RMSE,
    'R-Squared': rf_squared,
})

# Add the metrics as a new row at the next integer index.
# DataFrame.append was removed in pandas 2.0, so the row is assigned via
# .loc; the Series labels are matched to the column names.
result_tabulation.loc[len(result_tabulation)] = full_metrics

# Show the result table (renders in a notebook/REPL).
result_tabulation

vishalabhange / lp3 Goto Github PK

lp3's Introduction

initialize the linear regression model

Train the model with our training data

create a empty dataframe of the colums

columns: specifies the columns to be selected

append our result table using append()

ignore_index=True: does not use the index labels

python can only append a Series if ignore_index=True or if the Series has a name

print the result table

append our result table using append()

ignore_index=True: does not use the index labels

python can only append a Series if ignore_index=True or if the Series has a name

print the result table

lp3's People

Contributors

Watchers

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent