
If the data are not linearly separable in the current dimension, a suitable projection has to be found. One option is to lift the data into a higher-dimensional space and separate them there, which is what KLDA (kernel linear discriminant analysis) does; in other words, use a kernel function.

See Section 6.6, Kernel Functions, which covers kernel linear discriminant analysis.
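
As a rough, illustrative sketch (not from the book): lift the data into a richer feature space with an approximate RBF kernel map and run ordinary LDA there. The toy dataset (make_circles) and the RBFSampler feature map below are choices made only for this example; a proper KLDA works with the kernel matrix directly.

# A minimal sketch of the "kernelize, then run LDA" idea using scikit-learn.
# The concentric-circles data and the approximate RBF feature map are illustrative
# choices for this post, not the book's KLDA derivation.
from sklearn.datasets import make_circles
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline

X, y = make_circles(n_samples=200, factor=0.3, noise=0.05, random_state=0)

# Plain LDA cannot separate the two rings in the original 2-D space.
lda = LinearDiscriminantAnalysis().fit(X, y)
print('plain LDA accuracy     :', lda.score(X, y))

# Map into an approximate RBF kernel feature space, then run LDA there.
kernel_lda = make_pipeline(RBFSampler(gamma=2.0, n_components=100, random_state=0),
                           LinearDiscriminantAnalysis())
kernel_lda.fit(X, y)
print('RBF map + LDA accuracy :', kernel_lda.score(X, y))

A full KLDA would solve the generalized eigenvalue problem on the kernel (Gram) matrix itself, but even this cheap approximation shows how a kernel map can make the classes linearly separable.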

 

 

 

 


 

Attachment: The Data Engineering Cookbook.pdf (3.27 MB)

It is 91 pages in total, so not a huge amount of material, but it is a well-organized summary of data engineering knowledge.

It also covers plenty of topics a data scientist should know, such as Docker and REST APIs. It is a data engineering book, but I personally think data scientists should have at least this much background as well.

 

You can also find the related code on the author's GitHub (github.com/andkret/Cookbook).

 

The contents are as follows.

Rather than covering everything in depth, the book mostly takes the form of attaching links to related resources.

For example, clicking the data science @ Uber entry takes you to the related tech blogs and links.

 

I'm sharing it because it seems like a resource many people could use. :)

If I get the chance, I'll organize it bit by bit and share it here.

 

Full Table of Contents:

Introduction

Basic Engineering Skills

Advanced Engineering Skills

Hands On Course

Case Studies

Best Practices Cloud Platforms

130+ Free Data Sources For Data Science

1001 Interview Questions

Recommended Books and Courses


3.5 Implement linear discriminant analysis on watermelon dataset 3.0α and describe the results.

 

Reference answer code (1):


import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

class LDA(object):

    def fit(self, X_, y_, plot_=False):
        pos = y_ == 1
        neg = y_ == 0
        X0 = X_[neg]
        X1 = X_[pos]

        u0 = X0.mean(0, keepdims=True)  # (1, n)
        u1 = X1.mean(0, keepdims=True)

        # within-class scatter matrix S_w, and LDA direction w ∝ S_w^{-1}(u0 - u1)
        sw = np.dot((X0 - u0).T, X0 - u0) + np.dot((X1 - u1).T, X1 - u1)
        w = np.dot(np.linalg.inv(sw), (u0 - u1).T).reshape(1, -1)  # (1, n)

        if plot_:
            fig, ax = plt.subplots()
            ax.spines['right'].set_color('none')
            ax.spines['top'].set_color('none')
            ax.spines['left'].set_position(('data', 0))
            ax.spines['bottom'].set_position(('data', 0))

            plt.scatter(X1[:, 0], X1[:, 1], c='k', marker='o', label='good')
            plt.scatter(X0[:, 0], X0[:, 1], c='r', marker='x', label='bad')

            plt.xlabel('density', labelpad=1)
            plt.ylabel('sugar content')
            plt.legend(loc='upper right')

            x_tmp = np.linspace(-0.05, 0.15)
            y_tmp = x_tmp * w[0, 1] / w[0, 0]
            plt.plot(x_tmp, y_tmp, '#808080', linewidth=1)

            wu = w / np.linalg.norm(w)

            
            X0_project = np.dot(X0, np.dot(wu.T, wu))
            plt.scatter(X0_project[:, 0], X0_project[:, 1], c='r', s=15)
            for i in range(X0.shape[0]):
                plt.plot([X0[i, 0], X0_project[i, 0]], [X0[i, 1], X0_project[i, 1]], '--r', linewidth=1)

            X1_project = np.dot(X1, np.dot(wu.T, wu))
            plt.scatter(X1_project[:, 0], X1_project[:, 1], c='k', s=15)
            for i in range(X1.shape[0]):
                plt.plot([X1[i, 0], X1_project[i, 0]], [X1[i, 1], X1_project[i, 1]], '--k', linewidth=1)

            
            u0_project = np.dot(u0, np.dot(wu.T, wu))
            plt.scatter(u0_project[:, 0], u0_project[:, 1], c='#FF4500', s=60)
            u1_project = np.dot(u1, np.dot(wu.T, wu))
            plt.scatter(u1_project[:, 0], u1_project[:, 1], c='#696969', s=60)

            ax.annotate('projection of u0',
                        xy=(u0_project[:, 0], u0_project[:, 1]),
                        xytext=(u0_project[:, 0] - 0.2, u0_project[:, 1] - 0.1),
                        size=13,
                        va="center", ha="left",
                        arrowprops=dict(arrowstyle="->",
                                        color="k",
                                        )
                        )

            ax.annotate('projection of u1',
                        xy=(u1_project[:, 0], u1_project[:, 1]),
                        xytext=(u1_project[:, 0] - 0.1, u1_project[:, 1] + 0.1),
                        size=13,
                        va="center", ha="left",
                        arrowprops=dict(arrowstyle="->",
                                        color="k",
                                        )
                        )
            plt.axis("equal")  
            plt.show()

        self.w = w
        self.u0 = u0
        self.u1 = u1
        return self

    def predict(self, X):
        project = np.dot(X, self.w.T)

        wu0 = np.dot(self.w, self.u0.T)
        wu1 = np.dot(self.w, self.u1.T)
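        # wu0 and wu1 are the projections of the two class means; a sample is
        # predicted positive (1) when its projection w·x is closer to w·u1 than to w·u0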

        return (np.abs(project - wu1) < np.abs(project - wu0)).astype(int)

if __name__ == '__main__':
    # change the data path to match your environment
    data_path = r'C:\Users\hanmi\Documents\xiguabook\watermelon3_0_Ch.csv'

    data = pd.read_csv(data_path).values

    X = data[:, 7:9].astype(float)
    y = data[:, 9]

    y[y == 'yes'] = 1
    y[y == 'no'] = 0
    y = y.astype(int)

    lda = LDA()
    lda.fit(X, y, plot_=True)
    print(lda.predict(X))  
    print(y)

 

Reference answer code (2):

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def LDA(X0, X1):
    """
    Get the optimal params of LDA model given training data.
    Input:
        X0: np.array with shape [N1, d]
        X1: np.array with shape [N2, d]
    Return:
        omega: np.array with shape [1, d]. Optimal params of LDA.
    """
    #shape [1, d]
    mean0 = np.mean(X0, axis=0, keepdims=True)
    mean1 = np.mean(X1, axis=0, keepdims=True)
    Sw = (X0-mean0).T.dot(X0-mean0) + (X1-mean1).T.dot(X1-mean1)
    omega = np.linalg.inv(Sw).dot((mean0-mean1).T)
    return omega

if __name__=="__main__":
    # read the data from csv
    work_book = pd.read_csv("watermelon_3a.csv", header=None)
    positive_data = work_book.values[work_book.values[:, -1] == 1.0, :]
    negative_data = work_book.values[work_book.values[:, -1] == 0.0, :]
    print (positive_data)

    #LDA
    omega = LDA(negative_data[:, 1:-1], positive_data[:, 1:-1])

    #plot
    plt.plot(positive_data[:, 1], positive_data[:, 2], "bo")
    plt.plot(negative_data[:, 1], negative_data[:, 2], "r+")
    # draw the LDA projection direction y = (omega_1 / omega_0) * x through the origin
    lda_left = 0
    lda_right = (omega[1] * 0.9) / omega[0]
    plt.plot([0, 0.9], [lda_left, lda_right], 'g-')

    plt.xlabel('density')
    plt.ylabel('sugar rate')
    plt.title("LDA")
    plt.show()
Source: https://blog.csdn.net/weixin_43518584/article/details/105588310

3.4 Pick two datasets from the UCI repository and compare the error rates of logistic regression as estimated by 10-fold cross-validation and leave-one-out.

 

Reference answer code (1):

import numpy as np
from sklearn import linear_model
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

# change the data path to match your environment
data_path = r'C:\Users\hanmi\Documents\xiguabook\Transfusion.txt'

data = np.loadtxt(data_path, delimiter=',').astype(int)

X = data[:, :4]
y = data[:, 4]

m, n = X.shape

# normalization
X = (X - X.mean(0)) / X.std(0)

# shuffle
index = np.arange(m)
np.random.shuffle(index)

X = X[index]
y = y[index]

# using the sklearn library
# 10-fold cross-validation
lr = linear_model.LogisticRegression(C=2)

score = cross_val_score(lr, X, y, cv=10)

print(score.mean())

# LOO
loo = LeaveOneOut()

accuracy = 0
for train, test in loo.split(X, y):
    lr_ = linear_model.LogisticRegression(C=2)
    X_train = X[train]
    X_test = X[test]
    y_train = y[train]
    y_test = y[test]
    lr_.fit(X_train, y_train)

    accuracy += lr_.score(X_test, y_test)

print(accuracy / m)

# the two results are similar.

# manual 10-fold cross-validation (without sklearn helpers)

num_split = int(m / 10)
score_my = []
for i in range(10):
    lr_ = linear_model.LogisticRegression(C=2)
    test_index = range(i * num_split, (i + 1) * num_split)
    X_test_ = X[test_index]
    y_test_ = y[test_index]

    X_train_ = np.delete(X, test_index, axis=0)
    y_train_ = np.delete(y, test_index, axis=0)

    lr_.fit(X_train_, y_train_)

    score_my.append(lr_.score(X_test_, y_test_))

print(np.mean(score_my))

# manual leave-one-out
score_my_loo = []
for i in range(m):
    lr_ = linear_model.LogisticRegression(C=2)
    X_test_ = X[i, :]
    y_test_ = y[i]

    X_train_ = np.delete(X, i, axis=0)
    y_train_ = np.delete(y, i, axis=0)

    lr_.fit(X_train_, y_train_)

    score_my_loo.append(int(lr_.predict(X_test_.reshape(1, -1)) == y_test_))

print(np.mean(score_my_loo))

# all of the estimates come out similar.

 

Source: https://blog.csdn.net/weixin_43518584/article/details/105588310

3.3 Implement logistic regression on watermelon dataset 3.0α and describe the results.

 

Reference answer (1):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# load the watermelon 3.0α data (density, sugar content, label)
data = np.array([[0.697, 0.460, 1],
        [0.774, 0.376, 1],
        [0.634, 0.264, 1],
        [0.608, 0.318, 1],
        [0.556, 0.215, 1],
        [0.403, 0.237, 1],
        [0.481, 0.149, 1],
        [0.437, 0.211, 1],
        [0.666, 0.091, 0],
        [0.243, 0.267, 0],
        [0.245, 0.057, 0],
        [0.343, 0.099, 0],
        [0.639, 0.161, 0],
        [0.657, 0.198, 0],
        [0.360, 0.370, 0],
        [0.593, 0.042, 0],
        [0.719, 0.103, 0]])
X = data[:,0:2]
y = data[:,2]
# split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=33)
def sigmoid(z): 
    s = 1 / (1 + np.exp(-z))
    return s
def initialize_with_zeros(dim):
    """
    This function creates a vector of zeros of shape (dim, 1) for w and initializes b to 0.
    """
    w = np.zeros((dim, 1))
    b = 0
    assert (w.shape == (dim, 1))
    assert (isinstance(b, float) or isinstance(b, int))
    return w, b
def propagate(w, b, X, Y):
    """
    Implement the cost function and its gradient for the propagation explained above
    """
    m = X.shape[1]
    A = sigmoid(np.dot(w.T, X) + b)
    # compute the cross-entropy cost
    cost = -np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A)) / m
  
    dw = np.dot(X, (A - Y).T) / m
    db = np.sum(A - Y) / m
    assert (dw.shape == w.shape)
    assert (db.dtype == float)
    cost = np.squeeze(cost)
    assert (cost.shape == ())
    grads = {"dw": dw,
             "db": db}
    return grads, cost
def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost=False):
    """
    This function optimizes w and b by running a gradient descent algorithm
    """
    costs = []
    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        # update rule 
        w = w - learning_rate * dw
        b = b - learning_rate * db
        # Record the costs
        if i % 100 == 0:
            costs.append(cost)
        # Print the cost every 100 training iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
    params = {"w": w,
              "b": b}
    grads = {"dw": dw,
             "db": db}
    return params, grads, costs
def predict(w, b, X):
    '''
    Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)
    '''
    m = X.shape[1]
    Y_prediction = np.zeros((1, m))
    w = w.reshape(X.shape[0], 1)
    # Compute vector "A" with the predicted probability that each sample is positive
    A = sigmoid(np.dot(w.T, X) + b)
    for i in range(A.shape[1]):
        # Convert probabilities A[0,i] to actual predictions p[0,i]
        if A[0, i] >= 0.5:
            Y_prediction[0, i] = 1
        else:
            Y_prediction[0, i] = 0
        pass
    assert (Y_prediction.shape == (1, m))
    return Y_prediction
def model(X_train, Y_train, X_test, Y_test, num_iterations, learning_rate, print_cost=False):
    # initialize parameters with zeros
    w, b = initialize_with_zeros(X_train.shape[0])
    # run gradient descent
    parameters, grads, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost)
    # Retrieve parameters w and b from dictionary "parameters"
    w = parameters["w"]
    b = parameters["b"]
    # predict on the train and test sets
    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)
    # Print train/test Errors
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))
    d = {"costs": costs,
         "Y_prediction_test": Y_prediction_test,
         "Y_prediction_train": Y_prediction_train,
         "w": w,
         "b": b,
         "learning_rate": learning_rate,
         "num_iterations": num_iterations}
    return d
X_train = X_train.T
Y_train = Y_train.T.reshape(1,X_train.shape[1])
X_test = X_test.T
Y_test = Y_test.T.reshape(1,X_test.shape[1])
d = model(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = True)
# Plot learning curve (with costs)
costs = np.squeeze(d['costs'])
plt.plot(costs)
plt.ylabel('cost')
plt.xlabel('iterations (per hundreds)')
plt.title("Learning rate =" + str(d["learning_rate"]))
plt.show()

 

Reference answer (2):

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import linear_model


def sigmoid(x):
    s = 1 / (1 + np.exp(-x))
    return s


def J_cost(X, y, beta):
    '''
    :param X:  sample array, shape(n_samples, n_features)
    :param y: array-like, shape (n_samples,)
    :param beta: the beta in formula 3.27 , shape(n_features + 1, ) or (n_features + 1, 1)
    :return: the result of formula 3.27
    '''
    X_hat = np.c_[X, np.ones((X.shape[0], 1))]
    beta = beta.reshape(-1, 1)
    y = y.reshape(-1, 1)

    Lbeta = -y * np.dot(X_hat, beta) + np.log(1 + np.exp(np.dot(X_hat, beta)))

    return Lbeta.sum()


def gradient(X, y, beta):
    '''
    compute the first derivative of J(i.e. formula 3.27) with respect to beta      i.e. formula 3.30
    ----------------------------------
    :param X: sample array, shape(n_samples, n_features)
    :param y: array-like, shape (n_samples,)
    :param beta: the beta in formula 3.27 , shape(n_features + 1, ) or (n_features + 1, 1)
    :return:
    '''
    X_hat = np.c_[X, np.ones((X.shape[0], 1))]
    beta = beta.reshape(-1, 1)
    y = y.reshape(-1, 1)
    p1 = sigmoid(np.dot(X_hat, beta))

    gra = (-X_hat * (y - p1)).sum(0)

    return gra.reshape(-1, 1)


def hessian(X, y, beta):
    '''
    compute the second derivative of J(i.e. formula 3.27) with respect to beta      i.e. formula 3.31
    ----------------------------------
    :param X: sample array, shape(n_samples, n_features)
    :param y: array-like, shape (n_samples,)
    :param beta: the beta in formula 3.27 , shape(n_features + 1, ) or (n_features + 1, 1)
    :return:
    '''
    X_hat = np.c_[X, np.ones((X.shape[0], 1))]
    beta = beta.reshape(-1, 1)
    y = y.reshape(-1, 1)

    p1 = sigmoid(np.dot(X_hat, beta))

    m, n = X.shape
    P = np.eye(m) * p1 * (1 - p1)

    assert P.shape[0] == P.shape[1]
    return np.dot(np.dot(X_hat.T, P), X_hat)


def update_parameters_gradDesc(X, y, beta, learning_rate, num_iterations, print_cost):
    '''
    update parameters with gradient descent method
    --------------------------------------------
    :param beta:
    :param grad:
    :param learning_rate:
    :return:
    '''
    for i in range(num_iterations):

        grad = gradient(X, y, beta)
        beta = beta - learning_rate * grad

        if (i % 10 == 0) & print_cost:
            print('{}th iteration, cost is {}'.format(i, J_cost(X, y, beta)))

    return beta


def update_parameters_newton(X, y, beta, num_iterations, print_cost):
    '''
    update parameters with Newton method
    :param beta:
    :param grad:
    :param hess:
    :return:
    '''

    for i in range(num_iterations):

        grad = gradient(X, y, beta)
        hess = hessian(X, y, beta)
        beta = beta - np.dot(np.linalg.inv(hess), grad)

        if (i % 10 == 0) & print_cost:
            print('{}th iteration, cost is {}'.format(i, J_cost(X, y, beta)))
    return beta


def initialize_beta(n):
    beta = np.random.randn(n + 1, 1) * 0.5 + 1
    return beta


def logistic_model(X, y, num_iterations=100, learning_rate=1.2, print_cost=False, method='gradDesc'):
    '''
    :param X:
    :param y:
    :param num_iterations:
    :param learning_rate:
    :param print_cost:
    :param method: str 'gradDesc' or 'Newton'
    :return:
    '''
    m, n = X.shape
    beta = initialize_beta(n)

    if method == 'gradDesc':
        return update_parameters_gradDesc(X, y, beta, learning_rate, num_iterations, print_cost)
    elif method == 'Newton':
        return update_parameters_newton(X, y, beta, num_iterations, print_cost)
    else:
        raise ValueError('Unknown solver %s' % method)


def predict(X, beta):
    X_hat = np.c_[X, np.ones((X.shape[0], 1))]
    p1 = sigmoid(np.dot(X_hat, beta))

    p1[p1 >= 0.5] = 1
    p1[p1 < 0.5] = 0

    return p1


if __name__ == '__main__':
    # change the data path to match your environment
    data_path = r'C:\Users\hanmi\Documents\xiguabook\watermelon3_0_Ch.csv'

    data = pd.read_csv(data_path).values

    is_good = data[:, 9] == 'yes'
    is_bad = data[:, 9] == 'no'

    X = data[:, 7:9].astype(float)
    y = data[:, 9]

    y[y == 'yes'] = 1
    y[y == 'no'] = 0
    y = y.astype(int)

    plt.scatter(data[:, 7][is_good], data[:, 8][is_good], c='k', marker='o')
    plt.scatter(data[:, 7][is_bad], data[:, 8][is_bad], c='r', marker='x')

    plt.xlabel('density')
    plt.ylabel('sugar content')

    # fit the custom model and visualize its result
    beta = logistic_model(X, y, print_cost=True, method='gradDesc', learning_rate=0.3, num_iterations=1000)
    w1, w2, intercept = beta
    x1 = np.linspace(0, 1)
    y1 = -(w1 * x1 + intercept) / w2

    ax1, = plt.plot(x1, y1, label=r'my_logistic_gradDesc')

    lr = linear_model.LogisticRegression(solver='lbfgs', C=1000)  # note: in sklearn's LogisticRegression, a larger C means weaker regularization
    lr.fit(X, y)

    lr_beta = np.c_[lr.coef_, lr.intercept_]
    print(J_cost(X, y, lr_beta))

    # plot the sklearn decision boundary
    w1_sk, w2_sk = lr.coef_[0, :]

    x2 = np.linspace(0, 1)
    y2 = -(w1_sk * x2 + lr.intercept_) / w2_sk

    ax2, = plt.plot(x2, y2, label=r'sklearn_logistic')

    plt.legend(loc='upper right')
    plt.show()

Reference answer (2) is from github.com/han1057578619/MachineLearning_Zhouzhihua_ProblemSets/blob/master/ch3--%E7%BA%BF%E6%80%A7%E6%A8%A1%E5%9E%8B/3.3/3.3-LogisticRegression.py.

 

 
