随机森林算法python实现
- 瞎BB
- 代码
- 导入数据
- 切分训练集测试集
- 找到最有用的几个属性
- 根据上面的代码更改属性
- 参数组合遍历找最优
- 随机森林
- 样本数据
瞎BB
1.实现根据样本数据(用眼距离distance、最长持续用眼时长duration、总用眼时长total_time、户外运动时长outdoor、用眼角度angle、健康环境光照用眼比例proportion)判别是否需要近视预警
2.样本实在太少,结果还行,原理都是一样的
代码
导入数据
import pandas

# Load the raw samples from the CSV file sitting next to this script.
patients = pandas.read_csv("data.csv")

# Peek at the first five rows (the result is only displayed in an
# interactive session / notebook; in a plain script it is discarded).
patients.head(n=5)
切分训练集测试集
from sklearn.model_selection import train_test_split

# Feature matrix: label-based column slice, inclusive on both ends,
# from 'distance' through 'proportion'.
patients_data = patients.loc[:, 'distance':'proportion']

# Target vector: the 'warning' column.
patients_target = patients.loc[:, 'warning']

# Hold out 10% of the rows as a test set; the fixed seed keeps the
# split reproducible between runs.
data_train, data_test, target_train, target_test = train_test_split(
    patients_data,
    patients_target,
    test_size=0.1,
    random_state=42,
)
找到最有用的几个属性
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

# Feature names, used only as x-axis labels for the plot below.
predictors = [
    "distance",
    "duration",
    "total_time",
    "outdoor",
    "angle",
    "proportion",
]

# Score every feature against the target with the f_classif test.
selector = SelectKBest(f_classif, k=5)
selector.fit(data_train, target_train)

# Turn p-values into scores: the smaller the p-value, the taller the bar.
scores = -np.log10(selector.pvalues_)

# One bar per feature, labelled with the feature name.
bar_positions = range(len(predictors))
plt.bar(bar_positions, scores)
plt.xticks(bar_positions, predictors, rotation='vertical')
plt.show()
根据上面的代码更改属性
# Features kept for modelling (hand-picked; presumably from inspecting
# the feature-score plot -- confirm against your own run).
predictors_best = ["distance", "total_time", "angle", "proportion"]

# Restrict both the training and the test features to the kept columns.
data_train, data_test = data_train[predictors_best], data_test[predictors_best]
参数组合遍历找最优
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold

# Candidate hyper-parameter values; every combination is evaluated.
tree_param_grid = {
    'min_samples_split': [2, 3, 4],
    'n_estimators': [3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
}

# The classifier and the cross-validation splitter are imported/defined
# here because this section runs before the section that originally
# introduced them; without this the search fails with NameError.
kf = KFold(n_splits=3)

# Arguments: (estimator, parameter grid as a dict, cross-validation splitter).
grid = GridSearchCV(
    RandomForestClassifier(),
    param_grid=tree_param_grid,
    cv=kf,
)

# Fit on the training set only.
grid.fit(data_train, target_train)

# Per-combination results, best parameter combination, best mean CV score
# (displayed only in an interactive session / notebook).
grid.cv_results_, grid.best_params_, grid.best_score_
随机森林
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

# Final model with fixed hyper-parameters (presumably taken from the
# parameter search -- confirm against your own run). The seed makes the
# forest reproducible.
rf = RandomForestClassifier(
    n_estimators=35,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=1,
)

# Cross-validation: 3 folds over the training set.
kf = model_selection.KFold(n_splits=3)
scores = model_selection.cross_val_score(
    rf,
    data_train,
    target_train,
    cv=kf,
)

# Report the mean score across the folds.
print(scores.mean())
样本数据
sample | distance | duration | total_time | outdoor | angle | proportion | warning(1 yes 0 no) |
---|---|---|---|---|---|---|---|
1 | 20 | 72 | 344 | 148 | 11 | 81 | 1 |
2 | 34 | 68 | 263 | 135 | 7 | 50 | 1 |
3 | 25 | 98 | 357 | 32 | 12 | 64 | 1 |
4 | 37 | 65 | 291 | 157 | 8 | 89 | 0 |
5 | 34 | 151 | 162 | 169 | 18 | 63 | 1 |
6 | 30 | 178 | 259 | 146 | 32 | 50 | 1 |
7 | 20 | 35 | 134 | 37 | 23 | 68 | 0 |
8 | 39 | 111 | 169 | 87 | 4 | 52 | 0 |
9 | 22 | 44 | 265 | 136 | 14 | 76 | 1 |
10 | 39 | 151 | 219 | 140 | 2 | 55 | 0 |
11 | 21 | 179 | 184 | 64 | 18 | 60 | 1 |
12 | 25 | 41 | 241 | 71 | 16 | 72 | 1 |
13 | 18 | 171 | 286 | 131 | 35 | 89 | 1 |
14 | 32 | 33 | 236 | 102 | 29 | 50 | 1 |
15 | 20 | 133 | 226 | 124 | 17 | 81 | 1 |
16 | 17 | 148 | 236 | 66 | 32 | 75 | 1 |
17 | 34 | 111 | 214 | 57 | 5 | 88 | 0 |
18 | 24 | 85 | 163 | 155 | 14 | 32 | 1 |
19 | 32 | 165 | 276 | 146 | 33 | 52 | 1 |
20 | 25 | 124 | 359 | 171 | 33 | 70 | 0 |
21 | 31 | 51 | 167 | 47 | 25 | 47 | 0 |
22 | 31 | 63 | 352 | 58 | 22 | 44 | 1 |
23 | 16 | 58 | 164 | 45 | 13 | 73 | 0 |
24 | 29 | 37 | 326 | 104 | 33 | 68 | 1 |
25 | 34 | 47 | 197 | 59 | 5 | 66 | 0 |
26 | 36 | 123 | 185 | 165 | 26 | 70 | 0 |
27 | 25 | 126 | 171 | 45 | 23 | 33 | 1 |
28 | 31 | 84 | 98 | 37 | 30 | 51 | 1 |
29 | 30 | 92 | 153 | 114 | 14 | 48 | 0 |
30 | 29 | 178 | 278 | 146 | 27 | 45 | 1 |