这学期选修了一门Python在大数据中的应用这门课,是方老师教的,了解了一些数据分析常用的库,Numpy,plt,sklearn等
印象比较深的库有
1.plt 可以对数据进行可视化,利于直观地进行数据分析
2.sklearn 有许多机器学习算法,可以直接用,十分方便
老师留了几道python题,我这次直接拿来做总结
题目描述:
用scikit-learn加载iris数据集,采用KNN、SVM和朴素贝叶斯算法进行分类,最后比较这三种方法的优缺点。
代码:
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 1 18:24:09 2019
@author: Administrator
"""
# =============================================================================
# 作业要求
# 用scikit-learn加载iris数据集,
# 采用KNN、SVM和朴素贝叶斯算法进行分类,最后比较这三种方法的优缺点。
# =============================================================================
# =============================================================================
# #Iris也称鸢尾花卉数据集,是一类多重变量分析的数据集。
# #可通过花萼长度,花萼宽度,花瓣长度,花瓣宽度4个属性
# #预测鸢尾花卉属于(Setosa,Versicolour,Virginica)三个种类中的哪一类。
# =============================================================================
#导入必要的包
import
numpy
as
np
import
pylab
as
plt
from
sklearn
.
datasets
import
load_iris
from
sklearn
.
model_selection
import
train_test_split
#引入train_test_split函数
from
sklearn
.
neighbors
import
KNeighborsClassifier
#引入KNN分类器
from
sklearn
.
svm
import
SVC
#引入SVM分类器
from
sklearn
.
naive_bayes
import
GaussianNB
#使用高斯贝叶斯模型
iris
=
load_iris
(
)
#加载iris信息
data
=
iris
.
data
#iris的数据集
target
=
iris
.
target
#iris的种类
#使用train_test_split()函数将数据集分成用于训练的data和用于测试的data
data_train
,
data_test
,
target_train
,
target_test
=
train_test_split
(
data
,
target
,
test_size
=
0.3
,
random_state
=
0
)
#1.kNN算法分类
knn
=
KNeighborsClassifier
(
)
#调用knn分类器
knn
.
fit
(
data_train
,
target_train
)
#训练knn分类器
accurate_Knn
=
knn
.
score
(
data_test
,
target_test
,
sample_weight
=
None
)
#调用该对象的打分方法,计算出准确率
print
'KNN预测值:'
,
(
knn
.
predict
(
data_test
)
)
#预测值
print
'真实值:'
,
(
target_test
)
#真实值
print
'KNN输出训练集的准确率为:'
,
accurate_Knn
for
i
in
range
(
len
(
target_test
)
)
:
#因为数据是二维数组,所以要用for循环,也可以用reshape对二位数组进行变形
if
target_test
[
i
]
==
0
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'r'
)
#画散点图
elif
target_test
[
i
]
==
1
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'g'
)
else
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'b'
)
plt
.
title
(
"iris"
)
plt
.
xlabel
(
"ewidth"
)
plt
.
ylabel
(
"elength"
)
plt
.
show
(
)
pr
=
knn
.
predict
(
data_test
)
for
i
in
range
(
len
(
pr
)
)
:
if
pr
[
i
]
==
0
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'r'
)
elif
pr
[
i
]
==
1
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'g'
)
else
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'b'
)
plt
.
title
(
"iris-KNN"
)
plt
.
xlabel
(
"ewidth"
)
plt
.
ylabel
(
"elength"
)
plt
.
show
(
)
print
(
"\n\n"
)
#2.SVM算法分类
svm
=
SVC
(
kernel
=
'rbf'
,
gamma
=
0.1
,
decision_function_shape
=
'ovo'
,
C
=
0.8
)
#搭建模型,训练SVM分类器
svm
.
fit
(
data_train
,
target_train
)
#训练SVC
accurate_Svm
=
svm
.
score
(
data_train
,
target_train
)
print
'SVM预测值:'
,
(
svm
.
predict
(
data_test
)
)
#预测值
print
'真实值:'
,
(
target_test
)
#真实值
print
'SVM-输出训练集的准确率为:'
,
accurate_Svm
for
i
in
range
(
len
(
target_test
)
)
:
if
target_test
[
i
]
==
0
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'r'
)
elif
target_test
[
i
]
==
1
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'g'
)
else
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'b'
)
plt
.
title
(
"iris"
)
plt
.
xlabel
(
"ewidth"
)
plt
.
ylabel
(
"elength"
)
plt
.
show
(
)
pr
=
svm
.
predict
(
data_test
)
for
i
in
range
(
len
(
pr
)
)
:
if
pr
[
i
]
==
0
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'r'
)
elif
pr
[
i
]
==
1
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'g'
)
else
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'b'
)
plt
.
title
(
"iris-SVM"
)
plt
.
xlabel
(
"ewidth"
)
plt
.
ylabel
(
"elength"
)
plt
.
show
(
)
print
(
"\n\n"
)
#3.朴素贝叶斯算法分类
nb
=
GaussianNB
(
)
#设置分类器
nb
.
fit
(
data_train
,
target_train
)
accurate_Nb
=
nb
.
score
(
data_train
,
target_train
)
print
'NB预测值:'
,
(
nb
.
predict
(
data_test
)
)
#预测值
print
'真实值:'
,
(
target_test
)
#真实值
print
'NB-输出训练集的准确率为:'
,
accurate_Nb
for
i
in
range
(
len
(
target_test
)
)
:
if
target_test
[
i
]
==
0
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'r'
)
elif
target_test
[
i
]
==
1
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'g'
)
else
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'b'
)
plt
.
title
(
"iris"
)
plt
.
xlabel
(
"ewidth"
)
plt
.
ylabel
(
"elength"
)
plt
.
show
(
)
pr
=
nb
.
predict
(
data_test
)
for
i
in
range
(
len
(
pr
)
)
:
if
pr
[
i
]
==
0
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'r'
)
elif
pr
[
i
]
==
1
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'g'
)
else
:
plt
.
scatter
(
data_test
[
i
,
0
]
,
data_test
[
i
,
1
]
,
c
=
'b'
)
plt
.
title
(
"iris-NB"
)
plt
.
xlabel
(
"ewidth"
)
plt
.
ylabel
(
"elength"
)
plt
.
show
(
)
偷个懒,其他的运行结果就不贴了,都差不多。
结果分析:
对比三种算法的准确率我发现knn=0.97,svm=0.96,朴素贝叶斯=0.94
即在这次实验中KNN>SVM>朴素贝叶斯。可以初步认为在这种小数据集上KNN和SVM的分类准确率较高,朴素贝叶斯的准确率略低;不过单次划分的结果有偶然性,要得出一般结论还需要交叉验证等更多实验。
本人水平有限,如有问题欢迎大家不吝指正。