"""Classify zoo animals of classes 1, 2 and 7 from a few binary features.

The script fits a decision tree on four features, refits it on the two most
important ones, then fits a random forest and a logistic regression on the
four-feature data and finally cross-validates the tree classifier.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Fixed label order so every confusion matrix has the same rows/columns.
CLASS_LABELS = [1, 2, 7]


def _one_vs_rest_metrics(y_true, y_pred, label):
    """Return (TP, FP, FN, precision, recall, F1, support) for one class.

    ``label`` is the positive class; every other label counts as negative.
    ``y_true`` and ``y_pred`` must be NumPy arrays of equal length.
    A ratio whose denominator is zero is reported as 0.0 instead of NaN.
    """
    is_true = y_true == label
    is_pred = y_pred == label
    tp = int(np.sum(is_true & is_pred))
    fp = int(np.sum(~is_true & is_pred))
    fn = int(np.sum(is_true & ~is_pred))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    # Harmonic mean of precision and recall; undefined when either is zero.
    f1 = 2 / (1 / precision + 1 / recall) if precision and recall else 0.0
    support = int(np.sum(is_true))
    return tp, fp, fn, precision, recall, f1, support


df = pd.read_csv('/mnt/3/apd/L7/zoo.csv')

# Select three classes (1, 2, 7) and the features
# 'hair', 'milk', 'aquatic', 'tail'.
df_127 = df[df['class_type'].isin(CLASS_LABELS)]

f = ['hair', 'milk', 'aquatic', 'tail']
X = np.array(df_127[f])
y = np.array(df_127['class_type'])
print(y)

plt.figure(figsize=(3, 3))
plt.hist(y)

# In[33]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(y_test, pred)

# https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
print(accuracy_score(y_test, pred))
CM = confusion_matrix(y_test, pred, labels=CLASS_LABELS)
print(CM)
print(classification_report(y_test, pred))

# Hand-computed metrics for class 7, to compare with the report above.
TP7, FP7, FN7, P7, R7, F1, S7 = _one_vs_rest_metrics(y_test, pred, 7)
print(f'TP7={TP7}, FP7={FP7}, FN7={FN7}')
print(f'P7={P7}, R7={R7}, F1={F1}, S7={S7}')

# Same for class 2 (F1 is deliberately reused, as in the class-7 step).
TP2, FP2, FN2, P2, R2, F1, S2 = _one_vs_rest_metrics(y_test, pred, 2)
print(f'TP2={TP2}, FP2={FP2}, FN2={FN2}')
print(f'P2={P2}, R2={R2}, F1={F1}, S2={S2}')

# %%
imp = clf.feature_importances_
ft = pd.DataFrame(f)
ft_imp = pd.concat([ft, pd.DataFrame(imp)], axis=1)
ft_imp.columns = ['Feature', 'Importance']
print(ft_imp)

# Re-classify using only the two most important features.
i = imp.argsort()
i2 = i[-2:]
f2 = [f[idx] for idx in i2]
X = np.array(df_127[f2])
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(y_test, pred)

print(accuracy_score(y_test, pred))
CM2 = confusion_matrix(y_test, pred, labels=CLASS_LABELS)
print(CM2)
print(classification_report(y_test, pred))

# Random forest on the original X (4 features).
X = np.array(df_127[f])
# The split must be redone here: otherwise X_train/X_test would still hold
# the two-feature data from the previous step. The same random_state keeps
# the same rows in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4)

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print(y_test, rfc_pred)

print(accuracy_score(y_test, rfc_pred))
print(confusion_matrix(y_test, rfc_pred, labels=CLASS_LABELS))
print()
print(classification_report(y_test, rfc_pred))

# One importance per entry of `f`, so the rows line up with `ft`.
imp = pd.DataFrame(rfc.feature_importances_)
ft_imp = pd.concat([ft, imp], axis=1)
ft_imp.columns = ['Feature', 'Importance']
print(ft_imp)

# In[46]:

# Logistic regression
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print(lr_pred)

print(accuracy_score(y_test, lr_pred))
print(confusion_matrix(y_test, lr_pred, labels=CLASS_LABELS))
print()
print(classification_report(y_test, lr_pred))

# 5-fold cross-validation of the tree classifier on the four-feature data.
scores = cross_val_score(clf, X, y, cv=5)