import pandas as pd
import numpy as np
import random
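

# Gaussian naive Bayes classifier for the UCI Iris dataset.
# 'iris.data' is expected to hold four numeric feature columns followed by a
# class label column, with no header row.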
def load_dataSet():
    """Load the Iris dataset from 'iris.data' (no header row)."""
    csv = pd.read_csv('iris.data', header=None)
    return csv


def randSplit(dataSet, rate):
    """
    Randomly shuffle the dataset and split it into training and test sets.
    :param dataSet: the full dataset
    :param rate: fraction of rows assigned to the training set
    :return: train, test
    """
    l_index = list(dataSet.index)
    random.shuffle(l_index)
    n = dataSet.shape[0]
    m = int(n * rate)
    # Reorder the rows by the shuffled index, then split positionally, leaving
    # the caller's DataFrame untouched.
    shuffled = dataSet.loc[l_index, :].reset_index(drop=True)
    train = shuffled.iloc[:m, :].reset_index(drop=True)
    test = shuffled.iloc[m:, :].reset_index(drop=True)
    return train, test


def gnb_classify(train, test):
    """
    Gaussian naive Bayes classification.
    :param train: training set
    :param test: test set
    :return: None; prints the accuracy and the test set with predictions
    """
    labels = train.iloc[:, -1].value_counts().index
    mean = []
    var = []
    result = []
    # Estimate the per-class mean and (biased) variance of every feature.
    for i in labels:
        item = train.loc[train.iloc[:, -1] == i, :]
        m = item.iloc[:, :-1].mean()
        s = np.sum((item.iloc[:, :-1] - m) ** 2) / item.shape[0]
        mean.append(m)
        var.append(s)
    means = pd.DataFrame(mean, index=labels)
    variances = pd.DataFrame(var, index=labels)
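    # For each test sample, score every class with the Gaussian density of each
    # feature, N(x; mu, sigma^2) = exp(-(x - mu)^2 / (2 * sigma^2)) / sqrt(2 * pi * sigma^2),
    # evaluated with that class's per-feature mean and variance.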
    for j in range(test.shape[0]):
        iset = test.iloc[j, :-1].tolist()
        # Per-class Gaussian likelihood of every feature value.
        iprob = np.exp(-1 * (iset - means) ** 2 / (variances * 2)) / np.sqrt(2 * np.pi * variances)
        prob = 1
        # Naive independence assumption: multiply the feature likelihoods.
        # Class priors are omitted; the Iris classes are roughly balanced.
        for k in range(test.shape[1] - 1):
            prob *= iprob[k]
        cla = prob.index[np.argmax(prob.values)]
        result.append(cla)
    test['predict'] = result
    # Last column is the prediction, second-to-last the true label.
    acc = (test.iloc[:, -1] == test.iloc[:, -2]).mean()
    print("Prediction accuracy: {}".format(acc))
    print(test)
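

# Optional cross-check (a minimal sketch, assuming scikit-learn is installed):
# scikit-learn's GaussianNB implements the same Gaussian naive Bayes model and
# can be used to sanity-check the accuracy printed by gnb_classify. Call this
# hypothetical helper on a fresh split, before gnb_classify appends the
# 'predict' column to test.
def sklearn_check(train, test):
    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    clf.fit(train.iloc[:, :-1], train.iloc[:, -1])
    return clf.score(test.iloc[:, :-1], test.iloc[:, -1])
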
if __name__ == '__main__':
    data_set = load_dataSet()
    _train, _test = randSplit(data_set, 0.8)
    gnb_classify(_train, _test)