Coder Social home page Coder Social logo

lp3's Introduction

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


# Dataset: https://www.kaggle.com/datasets/yasserh/uber-fares-dataset
df = pd.read_csv('uber.csv')
df.info()


# Quick look at the size, the first rows and the missing-value mask.
df.shape
df.head()
df.isnull()


# Impute the dropoff coordinates first, while their NaN values are still
# present; a blanket fillna(0) before this step would leave nothing to impute
# and would store 0 as a coordinate. Assigning the result back avoids relying
# on inplace=True through a column selection.
df['dropoff_latitude'] = df['dropoff_latitude'].fillna(df['dropoff_latitude'].mean())
df['dropoff_longitude'] = df['dropoff_longitude'].fillna(df['dropoff_longitude'].median())
df.head()


# Any missing value left in the other columns is replaced by 0.
df.fillna(0, inplace=True)
df.head()


# Every count should now be zero.
df.isnull().sum()


df.dtypes


# Parse the pickup timestamps so their calendar parts can be read off.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df.dtypes


# Add one column per calendar component of the pickup timestamp.
df = df.assign(**{
    part: getattr(df['pickup_datetime'].dt, part)
    for part in ('hour', 'day', 'month', 'year', 'dayofweek')
})


df


# Drop the raw timestamp column now that its parts are separate columns.
df = df.drop(columns=['pickup_datetime'])
df


from math import *

def distance_formula(longitude1, latitude1, longitude2, latitude2):
    """Return the haversine distance in kilometres for each coordinate pair.

    The four arguments are parallel iterables of coordinates in degrees:
    element ``i`` of each describes the start point
    (``longitude1[i]``, ``latitude1[i]``) and the end point
    (``longitude2[i]``, ``latitude2[i]``).

    Args:
        longitude1: Start longitudes, in degrees.
        latitude1: Start latitudes, in degrees.
        longitude2: End longitudes, in degrees.
        latitude2: End latitudes, in degrees.

    Returns:
        A list of floats, one distance per coordinate pair, computed on a
        sphere of radius 6371 km. Iteration stops at the shortest input, so
        empty inputs give an empty list.
    """
    # Imported locally so the function does not depend on a star import
    # elsewhere in the file.
    from math import asin, cos, radians, sin, sqrt

    earth_radius_km = 6371

    travel_dist = []
    for lon1, lat1, lon2, lat2 in zip(longitude1, latitude1, longitude2, latitude2):
        lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))
        dist_lon = lon2 - lon1
        dist_lat = lat2 - lat1

        a = sin(dist_lat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dist_lon / 2) ** 2
        # Mathematically a lies in [0, 1]; clamp it so floating-point rounding
        # cannot push sqrt/asin outside their domain.
        a = min(1.0, max(0.0, a))

        travel_dist.append(2 * asin(sqrt(a)) * earth_radius_km)

    return travel_dist

# Haversine distance between the pickup and dropoff point of every row.
df['dist_travel_km'] = distance_formula(
    *(
        df[name].to_numpy()
        for name in (
            'pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude',
        )
    )
)


# One box plot per column, laid out on a 7 x 2 grid.
df.plot.box(subplots=True, layout=(7, 2), figsize=(15, 20))
plt.show()


def remove_outlier(df1, col):
    """Clip column *col* of *df1* to its 1.5 * IQR whiskers.

    Values below ``Q1 - 1.5 * IQR`` are raised to that bound and values above
    ``Q3 + 1.5 * IQR`` are lowered to it, where Q1 and Q3 are the 25th and
    75th percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df1: DataFrame holding the column; it is modified in place.
        col: Label of the column to clip.

    Returns:
        The same DataFrame object that was passed in, with *col* clipped.
    """
    q1 = df1[col].quantile(0.25)
    q3 = df1[col].quantile(0.75)
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    # Write to the frame that was passed in, not to a module-level one.
    df1[col] = np.clip(df1[col], lower_whisker, upper_whisker)
    return df1

def treat_outliers_all(df1, col_list):
    """Apply ``remove_outlier`` to every column named in *col_list*.

    Args:
        df1: DataFrame to process.
        col_list: Iterable of column labels to clip, processed in order.

    Returns:
        The DataFrame returned by the last ``remove_outlier`` call, or *df1*
        unchanged when *col_list* is empty.
    """
    for c in col_list:
        # Feed the running result back in rather than a module-level frame.
        df1 = remove_outlier(df1, c)
    return df1


# NOTE(review): remove_outlier / treat_outliers_all are defined above, but no
# call to them is visible in this file, so this plot repeats the earlier box
# plot. Confirm whether a call was intended before this point.
df.plot(kind="box", subplots=True, layout=(7, 2), figsize=(15, 20))
plt.show()


# Correlate numeric columns only: DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, and select_dtypes works on older releases too.
corr = df.select_dtypes(include=['number', 'bool']).corr()
corr


# Reuse the matrix computed above and draw on the axes created here.
fig, axis = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, ax=axis)


# Target: the fare. Predictors: coordinates, passenger count, the calendar
# parts of the pickup time and the trip distance.
df_y = df['fare_amount']
df_x = df[
    [
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'passenger_count',
        'hour',
        'day',
        'month',
        'year',
        'dayofweek',
        'dist_travel_km',
    ]
]


# Hold out 20% of the rows for testing; the seed is fixed at 1.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    random_state=1,
    test_size=0.2,
)


df


from sklearn.linear_model import LinearRegression

# Create the linear regression model and train it on the training data.
reg = LinearRegression().fit(x_train, y_train)


# Predict the fares of the held-out rows.
y_pred_lin = reg.predict(x_test)
print(y_pred_lin)


from sklearn.ensemble import RandomForestRegressor

# n_estimators is the number of trees to build before making the prediction.
rf = RandomForestRegressor(n_estimators=100)
rf.fit(x_train, y_train)


# Predict the fares of the held-out rows.
y_pred_rf = rf.predict(x_test)
print(y_pred_rf)


cols = ['Model', 'RMSE', 'R-Squared']

# Create an empty DataFrame with these columns.
# columns: specifies the column labels of the new frame
result_tabulation = pd.DataFrame(columns=cols)


from sklearn import metrics
from sklearn.metrics import r2_score

# RMSE and R-squared of the linear regression predictions on the test rows.
reg_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lin))
reg_squared = r2_score(y_test, y_pred_lin)

full_metrics = pd.Series({'Model': "Linear Regression", 'RMSE': reg_RMSE, 'R-Squared': reg_squared})

# Add the metrics as a new row at the next integer label.
# DataFrame.append() was removed in pandas 2.0; assigning through .loc works
# on old and new releases, and the Series is aligned on the column labels.
result_tabulation.loc[len(result_tabulation)] = full_metrics

# Show the result table.
result_tabulation


# RMSE and R-squared of the random forest predictions on the test rows.
rf_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rf))
rf_squared = r2_score(y_test, y_pred_rf)

full_metrics = pd.Series({'Model': "Random Forest ", 'RMSE': rf_RMSE, 'R-Squared': rf_squared})

# Add the metrics as a new row at the next integer label.
# DataFrame.append() was removed in pandas 2.0; assigning through .loc works
# on old and new releases, and the Series is aligned on the column labels.
result_tabulation.loc[len(result_tabulation)] = full_metrics

# Show the result table.
result_tabulation

lp3's People

Contributors

vishalabhange avatar

Watchers

 avatar

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games that makes everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.