본문 바로가기

Python/MachineLearning

Machine Learning(ML)_iris_data예제

1.iris_data활용한ML기초

1. 붓꽃 데이터 Load

In [5]:
from sklearn.datasets import load_iris

iris_datasets = load_iris()

print(type(iris_datasets), iris_datasets.keys())
<class 'sklearn.utils.Bunch'> dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
In [17]:
print(iris_datasets['data'].shape)
iris_datasets['data']
(150, 4)
Out[17]:
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5. , 3.2, 1.2, 0.2],
       [5.5, 3.5, 1.3, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [4.4, 3. , 1.3, 0.2],
       [5.1, 3.4, 1.5, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [7. , 3.2, 4.7, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6.9, 3.1, 4.9, 1.5],
       [5.5, 2.3, 4. , 1.3],
       [6.5, 2.8, 4.6, 1.5],
       [5.7, 2.8, 4.5, 1.3],
       [6.3, 3.3, 4.7, 1.6],
       [4.9, 2.4, 3.3, 1. ],
       [6.6, 2.9, 4.6, 1.3],
       [5.2, 2.7, 3.9, 1.4],
       [5. , 2. , 3.5, 1. ],
       [5.9, 3. , 4.2, 1.5],
       [6. , 2.2, 4. , 1. ],
       [6.1, 2.9, 4.7, 1.4],
       [5.6, 2.9, 3.6, 1.3],
       [6.7, 3.1, 4.4, 1.4],
       [5.6, 3. , 4.5, 1.5],
       [5.8, 2.7, 4.1, 1. ],
       [6.2, 2.2, 4.5, 1.5],
       [5.6, 2.5, 3.9, 1.1],
       [5.9, 3.2, 4.8, 1.8],
       [6.1, 2.8, 4. , 1.3],
       [6.3, 2.5, 4.9, 1.5],
       [6.1, 2.8, 4.7, 1.2],
       [6.4, 2.9, 4.3, 1.3],
       [6.6, 3. , 4.4, 1.4],
       [6.8, 2.8, 4.8, 1.4],
       [6.7, 3. , 5. , 1.7],
       [6. , 2.9, 4.5, 1.5],
       [5.7, 2.6, 3.5, 1. ],
       [5.5, 2.4, 3.8, 1.1],
       [5.5, 2.4, 3.7, 1. ],
       [5.8, 2.7, 3.9, 1.2],
       [6. , 2.7, 5.1, 1.6],
       [5.4, 3. , 4.5, 1.5],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [6.3, 2.3, 4.4, 1.3],
       [5.6, 3. , 4.1, 1.3],
       [5.5, 2.5, 4. , 1.3],
       [5.5, 2.6, 4.4, 1.2],
       [6.1, 3. , 4.6, 1.4],
       [5.8, 2.6, 4. , 1.2],
       [5. , 2.3, 3.3, 1. ],
       [5.6, 2.7, 4.2, 1.3],
       [5.7, 3. , 4.2, 1.2],
       [5.7, 2.9, 4.2, 1.3],
       [6.2, 2.9, 4.3, 1.3],
       [5.1, 2.5, 3. , 1.1],
       [5.7, 2.8, 4.1, 1.3],
       [6.3, 3.3, 6. , 2.5],
       [5.8, 2.7, 5.1, 1.9],
       [7.1, 3. , 5.9, 2.1],
       [6.3, 2.9, 5.6, 1.8],
       [6.5, 3. , 5.8, 2.2],
       [7.6, 3. , 6.6, 2.1],
       [4.9, 2.5, 4.5, 1.7],
       [7.3, 2.9, 6.3, 1.8],
       [6.7, 2.5, 5.8, 1.8],
       [7.2, 3.6, 6.1, 2.5],
       [6.5, 3.2, 5.1, 2. ],
       [6.4, 2.7, 5.3, 1.9],
       [6.8, 3. , 5.5, 2.1],
       [5.7, 2.5, 5. , 2. ],
       [5.8, 2.8, 5.1, 2.4],
       [6.4, 3.2, 5.3, 2.3],
       [6.5, 3. , 5.5, 1.8],
       [7.7, 3.8, 6.7, 2.2],
       [7.7, 2.6, 6.9, 2.3],
       [6. , 2.2, 5. , 1.5],
       [6.9, 3.2, 5.7, 2.3],
       [5.6, 2.8, 4.9, 2. ],
       [7.7, 2.8, 6.7, 2. ],
       [6.3, 2.7, 4.9, 1.8],
       [6.7, 3.3, 5.7, 2.1],
       [7.2, 3.2, 6. , 1.8],
       [6.2, 2.8, 4.8, 1.8],
       [6.1, 3. , 4.9, 1.8],
       [6.4, 2.8, 5.6, 2.1],
       [7.2, 3. , 5.8, 1.6],
       [7.4, 2.8, 6.1, 1.9],
       [7.9, 3.8, 6.4, 2. ],
       [6.4, 2.8, 5.6, 2.2],
       [6.3, 2.8, 5.1, 1.5],
       [6.1, 2.6, 5.6, 1.4],
       [7.7, 3. , 6.1, 2.3],
       [6.3, 3.4, 5.6, 2.4],
       [6.4, 3.1, 5.5, 1.8],
       [6. , 3. , 4.8, 1.8],
       [6.9, 3.1, 5.4, 2.1],
       [6.7, 3.1, 5.6, 2.4],
       [6.9, 3.1, 5.1, 2.3],
       [5.8, 2.7, 5.1, 1.9],
       [6.8, 3.2, 5.9, 2.3],
       [6.7, 3.3, 5.7, 2.5],
       [6.7, 3. , 5.2, 2.3],
       [6.3, 2.5, 5. , 1.9],
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])
In [13]:
iris_datasets['feature_names']
Out[13]:
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
In [18]:
# label(답)
print(iris_datasets['target'].shape)
iris_datasets['target']
(150,)
Out[18]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [19]:
# 0: setosa, 1:versicolor, 2:virginica
iris_datasets['target_names']
Out[19]:
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

1.1 iris_data(Bunch) 변환

  • Pandas의 DataFrame 객체로 변환
In [20]:
import pandas as pd

iris_df = pd.DataFrame(iris_datasets['data'], columns=iris_datasets['feature_names'])
iris_df.head()
Out[20]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
In [21]:
iris_df.tail()
Out[21]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8
In [22]:
# iris_df 에 target 컬럼을 추가
iris_df['target'] = iris_datasets['target']
iris_df.head()
Out[22]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
In [23]:
iris_df.tail()
Out[23]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
145 6.7 3.0 5.2 2.3 2
146 6.3 2.5 5.0 1.9 2
147 6.5 3.0 5.2 2.0 2
148 6.2 3.4 5.4 2.3 2
149 5.9 3.0 5.1 1.8 2
In [26]:
print(type(iris_datasets['target_names']))
print(iris_datasets['target_names'])
dict(enumerate(iris_datasets['target_names']))
<class 'numpy.ndarray'>
['setosa' 'versicolor' 'virginica']
Out[26]:
{0: 'setosa', 1: 'versicolor', 2: 'virginica'}
In [28]:
# iris_df['target']와 같은 표기법
iris_df.target
Out[28]:
0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int32
In [29]:
# iris_df 애 'label' 컬럼을 추가
iris_df['label'] = iris_df.target.replace(dict(enumerate(iris_datasets['target_names'])))
In [31]:
iris_df.head()
Out[31]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target label
0 5.1 3.5 1.4 0.2 0 setosa
1 4.9 3.0 1.4 0.2 0 setosa
2 4.7 3.2 1.3 0.2 0 setosa
3 4.6 3.1 1.5 0.2 0 setosa
4 5.0 3.6 1.4 0.2 0 setosa
In [32]:
iris_df.tail()
Out[32]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target label
145 6.7 3.0 5.2 2.3 2 virginica
146 6.3 2.5 5.0 1.9 2 virginica
147 6.5 3.0 5.2 2.0 2 virginica
148 6.2 3.4 5.4 2.3 2 virginica
149 5.9 3.0 5.1 1.8 2 virginica
In [34]:
iris_df.iloc[:,0:4].head()
Out[34]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2

2. Train data 와 Test data로 분류하기

  • train data는 머신러닝 Model을 만들 때(즉, 학습을 할 때) 사용하는 data
  • test data 해당 Model이 얼마나 잘 동작하는 지(즉, 예측할 때) 평가할 때 사용하는 data
  • 수학의 함수에서 입력을 x, 출력을 y f(x) = y
  • 입력데이터는 대문자 X로 표기, 레이블은 y로 표기한다.
  • X_train, y_train, X_test, y_test
  • X_train(훈련데이터)는 75%, X_test(테스트데이터)는 25%
In [63]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris_df.iloc[:,0:4],iris_df['target'], random_state=10)
In [64]:
# 훈련데이터의 입력데이터와 레이블(답) 건수
print(X_train.shape)
print(y_train.shape)
(112, 4)
(112,)
In [65]:
# 테스트데이터의 입력데이터와 레이블(답) 건수
print(X_test.shape)
print(y_test.shape)
(38, 4)
(38,)
In [58]:
# X_train, X_test, y_train, y_test = train_test_split(iris_df.iloc[:,0:4],iris_df['target'], test_size=0.33)
# print(X_train.shape)
# print(y_train.shape)
# print(X_test.shape)
# print(y_test.shape)
(100, 4)
(100,)
(50, 4)
(50,)

3. 학습방법(Model 선택)

  • K-NN(nearest neighbors) 최근접 이웃 분류기 사용
  • k지수 - 이웃의 갯수는 3으로 설정
  • 학습하기 model.fit(X_train, y_train)
  • 예측하기 model.predict(X_test)
In [66]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3)
model
Out[66]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
In [67]:
# 학습하기
model.fit(X_train, y_train)
Out[67]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
In [68]:
# 모델을 평가하기(예측하기)
y_predict = model.predict(X_test)
y_predict
Out[68]:
array([1, 2, 0, 1, 0, 1, 2, 1, 0, 1, 1, 2, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2,
       2, 0, 1, 0, 1, 1, 1, 2, 1, 1, 2, 2, 2, 0, 2, 2])
In [69]:
# 예측한 답(y_predict)과 실제 답(y_test)을 비교
import numpy as np

print('테스트 세트의 정확도 {:.2f}'.format(np.mean(y_predict == y_test)))
테스트 세트의 정확도 0.97
In [70]:
print('테스트 세트의 정확도 score함수 사용 {:.2f}'.format(model.score(X_test, y_test)))
테스트 세트의 정확도 score함수 사용 0.97
In [75]:
# n_neighbors 갯수를 다르게 설정해서 model을 여러번 생성 할때 
train_accuracy = []
test_accuracy = []
neighbors_settions = range(1, 20, 2)
for neighbor in neighbors_settions:
    print(neighbor)
    model = KNeighborsClassifier(n_neighbors=neighbor)
    model.fit(X_train, y_train)
    train_accuracy.append(model.score(X_train, y_train))
    test_accuracy.append(model.score(X_test, y_test))
print(train_accuracy)
print(test_accuracy)
1
3
5
7
9
11
13
15
17
19
[1.0, 0.9642857142857143, 0.9732142857142857, 0.9642857142857143, 0.9464285714285714, 0.9732142857142857, 0.9732142857142857, 0.9642857142857143, 0.9642857142857143, 0.9642857142857143]
[0.9736842105263158, 0.9736842105263158, 0.9736842105263158, 1.0, 1.0, 1.0, 1.0, 0.9736842105263158, 0.9736842105263158, 1.0]
In [81]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.plot(neighbors_settions, train_accuracy, label='train accuracy')
plt.plot(neighbors_settions, test_accuracy, label='test accuracy')
plt.ylabel('accuracy')
plt.xlabel('n_neighbors')
plt.legend()
Out[81]:
<matplotlib.legend.Legend at 0x19728e2d6a0>
In [ ]:
 
In [ ]:
 

'Python > MachineLearning' 카테고리의 다른 글

Machine Learning(ML)_Taitanic예제  (0) 2020.08.18
Machine Learning(ML)_iris_data예제  (0) 2020.08.18
Machine Learning(ML)  (0) 2020.08.17