1. Data Preprocessing

Importing the libraries
import numpy as np
import pandas as pd

Importing the dataset
dataset = pd.read_csv('/Data - Data (1).csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

Indexing with iloc
dataset.iloc[0]         # single row extraction (as a Series)
dataset.iloc[[0]]       # single row extraction (as a DataFrame)
dataset.iloc[0, 1]      # single value extraction
dataset.iloc[[0, 1]]    # multiple row extraction
dataset.iloc[0:3, 0:4]  # row and column slicing
dataset.iloc[0:3, 0:2]
dataset.iloc[0:3, :-2]
print(X)  # independent attributes
print(y)  # dependent attribute

Taking care of missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])                    # calculate the column means
X[:, 1:3] = imputer.transform(X[:, 1:3])  # fill missing values with those means
print(X)

Encoding the independent categorical column (i.e., the Country column) with one-hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)

Encoding the dependent variable (i.e., the output) with label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

Splitting the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train)

Feature scaling - standardization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])  # fit the scaler on the training set only
X_test[:, 3:] = sc.transform(X_test[:, 3:])        # reuse the training statistics on the test set
print(X_train)

<------------------------------------------------------------------------------------------------------>

2. Binary Classifier: 5-Detector

Loading and inspecting MNIST
from sklearn.datasets import fetch_openml
import numpy as np
mnist = fetch_openml('mnist_784', cache=True, version=1)
mnist.keys()
np.sqrt(784)  # each image is 28x28 = 784 pixels
mnist['target']
mnist.target = mnist.target.astype(np.int8)
print(mnist['DESCR'])
np.unique(mnist['target'])
X, y = mnist["data"], mnist["target"]
np.unique(mnist["data"])
y.value_counts()
X, y = X.values, y.values

Visualizing a digit
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
some_digit = X[2]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image)
plt.axis("off")
X.shape
y = y.astype(np.uint8)
digit = X[0]
y[5]
digit = digit.reshape(28, 28)
plt.imshow(digit, cmap=mpl.cm.binary)
plt.axis("off")
plt.show()

Splitting and shuffling the training set
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
shuffling_index = np.random.permutation(60000)
shuffling_index
X_train, y_train = X_train[shuffling_index], y_train[shuffling_index]  # shuffle features and labels together
X_train

Training the 5-detector
y_train_5 = (y_train == 5)  # True for every 5, False for all other digits
y_test_5 = (y_test == 5)
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=20, tol=np.inf)
sgd_clf.fit(X_train, y_train_5)
sgd_clf.predict([X[0]])
y_train_pred = sgd_clf.predict(X_train)
y[130]

Evaluating the classifier
from sklearn.metrics import classification_report, confusion_matrix
confusion_matrix(y_train_5, y_train_pred)
from sklearn import metrics
print(
    f"Classification report for classifier {sgd_clf}:\n"
    f"{metrics.classification_report(y_train_5, y_train_pred)}\n"
)
from sklearn.metrics import precision_score, recall_score
precision_score(y_train_5, y_train_pred)
recall_score(y_train_5, y_train_pred)
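The precision and recall above are computed on the same data the classifier was fitted on, which is optimistic. A minimal sketch of a fairer check with cross-validated predictions, assuming the sgd_clf, X_train, and y_train_5 objects defined in this section:

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score

# each prediction is made by a model that never saw that sample during fitting
y_train_pred_cv = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
print(precision_score(y_train_5, y_train_pred_cv))
print(recall_score(y_train_5, y_train_pred_cv))

cross_val_predict clones the classifier for each fold, so sgd_clf itself is left untouched.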
<------------------------------------------------------------------------------------------------------>

3. Linear Regression

Predicting petrol consumption from tax, income, and driver-licence figures using multiple linear regression.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('petrolconsumption.csv')
dataset.shape
dataset.head()
dataset.describe()
X = dataset[['Petrol_tax', 'Average_income', 'Population_Driverlicence']]
y = dataset['Petrol_Consumption']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)
coefficients = pd.DataFrame(reg.coef_, X.columns, columns=['Coefficient'])
print(coefficients)

y_pred = reg.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)

from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

************************************************************************************************

4. Polynomial Regression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('/content/poly_dataset - Sheet1.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

Fitting a plain linear regression as a baseline
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
plt.scatter(X, y, color='red')
plt.plot(X, lin_reg.predict(X), color='blue')
plt.title('Truth or Bluff (Linear Regression)')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()

Fitting a degree-3 polynomial regression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=3, include_bias=False)
X_train_trans = poly_reg.fit_transform(X_train)
X_test_trans = poly_reg.transform(X_test)  # transform only; the feature expansion was fitted on the training set
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_train_trans, y_train)
print(lin_reg_2.coef_)
print(lin_reg_2.intercept_)

plt.scatter(X, y, color='red')
plt.plot(X, lin_reg_2.predict(poly_reg.transform(X)), color='blue')
plt.title('Polynomial Regression')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()

lin_reg_2.predict(poly_reg.transform([[6.5]]))

Evaluating on the test set
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_test_pred = lin_reg_2.predict(X_test_trans)
print("MAE", mean_absolute_error(y_test, y_test_pred))
print("MSE", mean_squared_error(y_test, y_test_pred))
print("RMSE", np.sqrt(mean_squared_error(y_test, y_test_pred)))
r2_score(y_test, y_test_pred)
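Keeping the feature-expansion step and the linear model in sync by hand is easy to get wrong (the test set must only be transformed, never refitted). A minimal sketch of the same degree-3 model wrapped in a scikit-learn Pipeline, assuming the X_train and y_train split from this section:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# the pipeline fits PolynomialFeatures on the training data and then
# applies the same expansion automatically whenever predict is called
poly_model = Pipeline([
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ('lin_reg', LinearRegression()),
])
poly_model.fit(X_train, y_train)
poly_model.predict([[6.5]])

Because the transform happens inside the pipeline, the transform-versus-fit_transform mistake cannot occur at prediction time.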
*******************************************************************************************

5. Logistic Regression

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train)
print(y_train)
print(X_test)
print(y_test)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(X_train)
print(X_test)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)  # create the classifier instance
classifier.fit(X_train, y_train)
print(classifier.predict(sc.transform([[30, 87000]])))  # predict for one new observation (age 30, salary 87000)
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

**********************************************************************************************************

6. Decision Tree

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()
X = iris.data[:, 2:]  # petal length and petal width only
y = iris.target
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

from sklearn.tree import export_graphviz
export_graphviz(
    tree_clf,
    out_file="iris_tree.dot",
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    # rounded=True,
    filled=True
)
!dot -Tpng iris_tree.dot -o iris_tree.png
from IPython import display
display.Image(filename="iris_tree.png")

******************************************************************************************

7. PCA on MNIST

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')
X, y = mnist["data"], mnist["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y)
X = X_train

Applying PCA
from sklearn.decomposition import PCA
import numpy as np
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1  # smallest number of components explaining at least 95% of the variance
d

Projecting onto the principal components
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)
pca.n_components_

Checking the variance explained - did we hit the minimum 95%?
np.sum(pca.explained_variance_ratio_)

Implementing an SVM classifier on the reduced data
some_digit = X_reduced[0]
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(X_reduced, y_train)
svm_clf.predict([some_digit])

**************************************************************************************

8. K-Means

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('/content/Mall_Customers.csv')
df.head()
x = df.iloc[:, [3, 4]].values  # annual income (in thousands of dollars) and spending score
plt.scatter(df['Annual Income (k$)'], df['Spending Score (1-100)'])

Choosing the number of clusters with the elbow method
from sklearn.cluster import KMeans
wcss = []  # within-cluster sum of squares (SSE)
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('No. of clusters')
plt.ylabel('WCSS')
plt.show()

Clustering with k = 5
kmeans = KMeans(n_clusters=5, random_state=42)
y_pred = kmeans.fit_predict(x)
print(y_pred)
df['cluster'] = y_pred
df.head()
uniqueVal = df['cluster'].unique()
uniqueVal

****************************************************************************
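As a follow-up to the k-means section, a minimal sketch that colours the customers by their assigned cluster and overlays the fitted centroids, assuming the x, y_pred, and kmeans objects defined above:

import matplotlib.pyplot as plt

# plot each of the five clusters in its own colour
for cluster in range(5):
    plt.scatter(x[y_pred == cluster, 0], x[y_pred == cluster, 1], label=f'Cluster {cluster}')
# cluster_centers_ holds one (income, score) centroid per cluster
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=200, c='black', marker='X', label='Centroids')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

****************************************************************************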