- 逻辑回归的推导过程:https://blog.csdn.net/ACM_hades/article/details/90448785
- 代码主要实现了下面公式:$W^{k+1}=W^k+\lambda X\,(Y-f_{W^k}(X^T))$
- 数据集 :我们选择MNIST数据集进行实验,它包含各种手写数字(0-9)图片,图片大小28*28。MNIST数据集本身有10个类别,为了将其变成二分类问题我们进行如下处理:label等于0的继续等于0,label大于0改为1。这样就将十分类的数据改为二分类的数据。
-
特征选择:可选择的特征有很多,包括:
- 自己提取特征
- 将整个图片作为特征向量
- HOG特征
- 我们将整个图片作为特征(784=28×28)。
import
time
import
numpy
as
np
import
pandas
as
pd
from
sklearn
.
model_selection
import
train_test_split
from
sklearn
.
metrics
import
accuracy_score
class Logistic:
    """Binary logistic regression trained with full-batch gradient ascent.

    Implements the update W_{k+1} = W_k + alpha * X^T (Y - sigmoid(X W_k)).
    Callers are expected to append a constant 1-column to the data so the
    last weight acts as the bias term.
    """

    def __init__(self, feature_len):
        # One weight per input feature, initialised to 1, shape (feature_len, 1).
        self.weights = np.ones((feature_len, 1))

    def model_function(self, X):
        """Return sigmoid(X @ weights) as an (n, 1) column vector.

        Numerically stable split: for non-negative logits compute
        1 / (1 + exp(-z)); for negative logits compute exp(z) / (1 + exp(z)).
        Either way exp() never receives a large positive argument, so it
        cannot overflow.
        """
        logits = np.matmul(X, self.weights)
        # Masks (as 0.0/1.0 floats) selecting non-negative / negative logits.
        # Bug fix: the removed alias np.float (gone since NumPy 1.24) is
        # replaced with the builtin float.
        pos_mask = (logits >= 0).astype(float)
        neg_mask = (logits < 0).astype(float)
        # Each branch evaluates its formula only on its masked entries;
        # the other entries contribute exactly 0, so the sum recombines them.
        result_pos = 1.0 / (1 + np.exp(-pos_mask * logits)) * pos_mask
        result_neg = np.exp(neg_mask * logits) / (1 + np.exp(neg_mask * logits)) * neg_mask
        return result_pos + result_neg

    def train(self, Data, label, alpha=0.01, max_iter=500):
        """Fit the weights by batch gradient ascent on the log-likelihood.

        Args:
            Data: (n, feature_len) design matrix.
            label: length-n array of 0/1 labels (reshaped to a column).
            alpha: learning rate; default 0.01 matches the original code.
            max_iter: number of full-batch iterations; default 500 matches
                the original code.
        """
        label = label.reshape((-1, 1))
        for _ in range(max_iter):
            preds = self.model_function(Data)
            # Residual between labels and current predictions drives the update.
            error = label - preds
            self.weights = self.weights + alpha * np.matmul(Data.T, error)

    def predict(self, Data):
        """Return hard 0/1 class predictions as a flat float array.

        Bug fix: the original returned raw probabilities; sklearn's
        accuracy_score rejects such "continuous" targets whenever the
        sigmoid does not saturate to exactly 0.0/1.0. Threshold at 0.5
        (equivalent where the original did saturate).
        """
        probs = self.model_function(Data).reshape(-1)
        return (probs >= 0.5).astype(float)
def main():
    """Load the binary-MNIST CSV, train the logistic model, report accuracy."""
    print('Start read data')
    start = time.time()
    # Column 0 is the 0/1 label; the remaining 784 columns are pixel values.
    raw_data = pd.read_csv('./lihang_book_algorithm-master/data/train_binary.csv')
    data = raw_data.values
    print("data shape:", data.shape)
    imgs = data[:, 1:]
    labels = data[:, 0]
    print("imgs shape:", imgs.shape)
    # Append a constant 1-column so the model's last weight acts as the bias.
    imgs = np.concatenate((imgs, np.ones((imgs.shape[0], 1))), axis=1)
    print("imgs shape:", imgs.shape)
    print("labels shape:", labels.shape)
    print("label:", list(set(labels)))

    model = Logistic(imgs.shape[-1])

    # Hold out 1/3 of the data for testing (fixed seed for reproducibility).
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)
    print("train data count :%d" % len(train_labels))
    print("test data count :%d" % len(test_labels))
    print('read data cost ', time.time() - start, ' second')

    print('Start training')
    start = time.time()
    model.train(train_features, train_labels)
    print('training cost ', time.time() - start, ' second')

    print('Start predicting')
    start = time.time()
    test_predict = model.predict(test_features)
    print('predicting cost ', time.time() - start, ' second')

    score = accuracy_score(test_labels, test_predict)
    # Typo fix in the user-facing message: "accruacy socre" -> "accuracy score".
    print("The accuracy score is ", score)


if __name__ == '__main__':
    main()
结果:
Start read data
data shape: (42000, 785)
imgs shape: (42000, 784)
imgs shape: (42000, 785)
labels shape: (42000,)
label: [0, 1]
train data count :28140
test data count :13860
read data cost 4.148890018463135 second
Start training
training cost 15.161401748657227 second
Start predicting
predicting cost 0.007978200912475586 second
The accruacy socre is 0.9892496392496393