3.4 Choose two datasets from the UCI repository and compare the error rates of logistic regression as estimated by 10-fold cross-validation and leave-one-out. The reference code below uses the Transfusion dataset (UCI's Blood Transfusion Service Center data); since scikit-learn's `score` reports accuracy, the error rate is simply 1 minus each printed value.
Reference answer code:
```python
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

# Change data_path to wherever you saved the dataset.
data_path = r'C:\Users\hanmi\Documents\xiguabook\Transfusion.txt'
data = np.loadtxt(data_path, delimiter=',').astype(int)
X = data[:, :4]  # four attributes
y = data[:, 4]   # binary label: donated blood or not
m, n = X.shape

# Standardize each feature to zero mean and unit variance.
X = (X - X.mean(0)) / X.std(0)

# Shuffle, so the consecutive-chunk k-fold split below is unbiased.
index = np.arange(m)
np.random.shuffle(index)
X = X[index]
y = y[index]

# --- Using the sklearn library ---
# 10-fold cross-validation
lr = linear_model.LogisticRegression(C=2)
score = cross_val_score(lr, X, y, cv=10)
print(score.mean())  # mean accuracy; error rate = 1 - this

# Leave-one-out
loo = LeaveOneOut()
accuracy = 0
for train, test in loo.split(X, y):
    lr_ = linear_model.LogisticRegression(C=2)
    X_train = X[train]
    X_test = X[test]
    y_train = y[train]
    y_test = y[test]
    lr_.fit(X_train, y_train)
    accuracy += lr_.score(X_test, y_test)
print(accuracy / m)
# The two results are similar.

# --- Implemented by hand ---
# 10-fold: split the shuffled data into ten consecutive chunks
# (the last m % 10 samples are simply never used as test data).
num_split = int(m / 10)
score_my = []
for i in range(10):
    lr_ = linear_model.LogisticRegression(C=2)
    test_index = range(i * num_split, (i + 1) * num_split)
    X_test_ = X[test_index]
    y_test_ = y[test_index]
    X_train_ = np.delete(X, test_index, axis=0)
    y_train_ = np.delete(y, test_index, axis=0)
    lr_.fit(X_train_, y_train_)
    score_my.append(lr_.score(X_test_, y_test_))
print(np.mean(score_my))

# Leave-one-out: hold out one sample at a time.
score_my_loo = []
for i in range(m):
    lr_ = linear_model.LogisticRegression(C=2)
    X_test_ = X[i, :]
    y_test_ = y[i]
    X_train_ = np.delete(X, i, axis=0)
    y_train_ = np.delete(y, i, axis=0)
    lr_.fit(X_train_, y_train_)
    score_my_loo.append(int(lr_.predict(X_test_.reshape(1, -1)) == y_test_))
print(np.mean(score_my_loo))
# All four results are similar.
```
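The exercise asks for two UCI datasets, and the reference code only covers one. Below is a minimal sketch for a second dataset, assuming scikit-learn's bundled copy of the Breast Cancer Wisconsin (Diagnostic) data (also from UCI) stands in for a manual download. It reports error rates directly as 1 minus the mean accuracy, and passes a `LeaveOneOut` splitter straight to `cross_val_score` instead of looping by hand.
```python
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score

# Second UCI dataset: Breast Cancer Wisconsin (Diagnostic),
# shipped with scikit-learn, so no file path is needed.
X, y = load_breast_cancer(return_X_y=True)
X = (X - X.mean(0)) / X.std(0)  # same standardization as above

lr = LogisticRegression(C=2, max_iter=1000)

# 10-fold cross-validation: error rate = 1 - mean accuracy.
cv10 = cross_val_score(lr, X, y, cv=10)
print('10-fold error rate:', 1 - cv10.mean())

# Leave-one-out: a splitter object can be passed as cv directly.
loo_scores = cross_val_score(lr, X, y, cv=LeaveOneOut())
print('LOO error rate:', 1 - loo_scores.mean())
```
As with the Transfusion data, the two estimates should come out close to each other; leave-one-out just costs far more fits (one per sample) for a usually negligible difference.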
source: https://blog.csdn.net/weixin_43518584/article/details/105588310