machine-learning-complete

This repository consists of all the data science tools required to make predictions

Answer to another doubt

machine-learning-complete's People

Contributors

Watchers

machine-learning-complete's Issues

Why do we take separate X and y values in classification and regression, but only X in clustering?

Below is an example for classification, in which we take both X and y:

X = dataset.iloc[:,[2,3]].values  # inputs: the columns at positions 2 and 3
y = dataset.iloc[:,4].values  # target: the column at position 4, which the classifier is trained to predict

Whereas in clustering we only take X:
X = dataset.iloc[:,[3,4]].values  # inputs only: the columns at positions 3 and 4; no y is extracted here

Full Code for reference purpose

# Decision Tree Classification

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset and separate the model inputs from the target.
dataset = pd.read_csv('Social_Network_Ads.csv')
# The columns at positions 2 and 3 are the inputs; the column at
# position 4 is the label the classifier has to predict.
feature_frame = dataset.iloc[:, [2, 3]]
target_series = dataset.iloc[:, 4]
X = feature_frame.values
y = target_series.values

# Splitting dataset into training and test set
# 25% of the rows are held out for testing; random_state pins the shuffle so
# the split is reproducible between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training data only, then scale the training data.
X_train = sc.fit_transform(X_train)
# Apply the statistics learned from the training set to the test set.
# Calling fit_transform here would re-fit the scaler on the test data, so the
# test features would be scaled differently from the features the model was
# trained on, and information from the test set would leak into preprocessing.
X_test = sc.transform(X_test)

# Train a decision tree (entropy split criterion) on the training set.
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state=0, criterion='entropy')
classifier = classifier.fit(X_train, y_train)  # fit() returns the estimator itself

# Predict the labels of the held-out test set.
y_pred = classifier.predict(X_test)

# Compare the true test labels with the predictions in a confusion matrix.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Visualising the training set
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train

# Build a dense grid that covers both features with a margin of 1 on each side.
axis0_values = np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01)
axis1_values = np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01)
X1, X2 = np.meshgrid(axis0_values, axis1_values)

# Classify every grid point and colour the plane by the predicted class.
grid_points = np.array([X1.ravel(), X2.ravel()]).T
grid_labels = classifier.predict(grid_points).reshape(X1.shape)
plt.contourf(X1, X2, grid_labels,
             alpha=0.75, cmap=ListedColormap(('red', 'green')))

# Overlay the actual observations, one scatter series per class.
point_colors = ['blue', 'black']
for color_index, class_value in enumerate(np.unique(y_set)):
    in_class = y_set == class_value
    plt.scatter(X_set[in_class, 0], X_set[in_class, 1],
                c=point_colors[color_index], label=class_value)

plt.legend()
plt.title('decision_tree(Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated_salary')
plt.show()

# Visualising the test set
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test

# Build a dense grid that covers both features with a margin of 1 on each side.
axis0_values = np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01)
axis1_values = np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01)
X1, X2 = np.meshgrid(axis0_values, axis1_values)

# Classify every grid point and colour the plane by the predicted class.
grid_points = np.array([X1.ravel(), X2.ravel()]).T
grid_labels = classifier.predict(grid_points).reshape(X1.shape)
plt.contourf(X1, X2, grid_labels,
             alpha=0.75, cmap=ListedColormap(('blue', 'black')))

# Overlay the actual test observations, one scatter series per class.
point_colors = ['red', 'green']
for color_index, class_value in enumerate(np.unique(y_set)):
    in_class = y_set == class_value
    plt.scatter(X_set[in_class, 0], X_set[in_class, 1],
                c=point_colors[color_index], label=class_value)

plt.legend()
plt.title('decision_tree(Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated_salary')
plt.show()

# K-Means Clustering

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset. Clustering is unsupervised, so only the input
# columns (positions 3 and 4) are extracted and there is no y.
dataset = pd.read_csv("mall_customers.csv")
feature_frame = dataset.iloc[:, [3, 4]]
X = feature_frame.values

# Elbow method: fit K-means for 1 to 10 clusters and record the
# within-cluster sum of squares (WCSS) of each fit.
from sklearn.cluster import KMeans
cluster_counts = range(1, 11)
wcss = []  # one WCSS value per candidate number of clusters

for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300,
                    n_init=10, random_state=0).fit(X)
    # inertia_ is the WCSS of the fitted model
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters to look for the "elbow".
plt.plot(cluster_counts, wcss)
plt.title("The elbow method")
plt.xlabel("Number of cluster")
plt.ylabel('Wcss')
plt.show()

# Fit K-means with 5 clusters on the mall dataset.
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
# fit_predict fits the model and returns, for every row of X, the index of
# the cluster that row was assigned to; that vector is stored as y_kmeans.
y_kmeans = kmeans.fit_predict(X)

# Visualising the clusters
# Each entry is (cluster index in y_kmeans, marker colour, legend label).
cluster_styles = [
    (0, 'red', 'Cluster 1'),
    (1, 'blue', 'Cluster 2'),
    (2, 'green', 'Cluster 3'),
    (3, 'cyan', 'Cluster 4'),
    (4, 'magenta', 'Cluster 5'),
]
for cluster_index, color, legend_label in cluster_styles:
    # Boolean mask selecting the observations assigned to this cluster.
    members = y_kmeans == cluster_index
    # Column 0 of X gives the x coordinates and column 1 the y coordinates.
    plt.scatter(X[members, 0], X[members, 1], s=100, c=color, label=legend_label)

# Plot the cluster centroids the same way, with larger markers.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroids')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

Recommend Projects