python常用导入函数及其他操作备忘录
- python常用导入函数
- 解压缩zip并读取csv文件
- 查看缺失值
- 权重系数取绝对值后排序(查看特征权重重要度)
python常用导入函数
from
IPython
.
display
import
display
import
numpy
as
np
import
pandas
as
pd
from
pandas
import
Series
,
DataFrame
from
PIL
import
Image
import
matplotlib
.
pyplot
as
plt
%
matplotlib inline
plt
.
rcParams
[
'font.sans-serif'
]
=
[
'SimHei'
]
#用来正常显示中文标签
plt
.
rcParams
[
'axes.unicode_minus'
]
=
False
#用来正常显示负号
%
config ZMQInteractiveShell
.
ast_node_interactivity
=
'all'
# nootbook使用
# Linear interpolation.
# `scipy.interp` was only a deprecated alias of `numpy.interp`, and newer SciPy
# releases no longer provide it, so the function is imported from NumPy
# directly. The name `interp` stays available exactly as before.
from numpy import interp
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# chromedriver is not on PATH in this environment, so its location has to be
# passed explicitly on every launch.
path = "D:/box/chromedriver_win32/chromedriver"

# Selenium 4 takes the driver location through a Service object; the old
# `executable_path=` keyword of webdriver.Chrome was deprecated and later
# removed. NOTE(review): this form requires Selenium 4 or newer.
browser = webdriver.Chrome(
    service=Service(executable_path=path),
    options=webdriver.ChromeOptions(),
)
browser.get('http://www.baidu.com')
from sklearn.model_selection import train_test_split

# Split the data set into a training part and a test part; test_size=0.3
# holds out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)
from sklearn.preprocessing import StandardScaler

# Standardise the features so that every dimension has mean 0 and variance 1.
# The scaler is fitted on the training data only; after that both splits just
# need transform().
s = StandardScaler().fit(X_train)
x_train = s.transform(X_train)
x_test = s.transform(X_test)
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression model and fit it on the standardised training
# data (fit() returns the estimator itself, so `lr` is the fitted model).
lr = LogisticRegression().fit(x_train, y_train)
解压缩zip并读取csv文件
import zipfile

import pandas as pd

# Display at most 500 columns; anything beyond that is shown as an ellipsis.
pd.set_option('display.max_columns', 500)

# Read the CSV member straight out of the zip archive without extracting it
# to disk. Both the archive and the member stream are context-managed so the
# member handle is closed as well (it was previously left open).
with zipfile.ZipFile('KaggleCredit2.zip', 'r') as z:
    with z.open('KaggleCredit2.csv') as f:
        # index_col=0 uses the FIRST column of the CSV as the row index; it
        # does not mean "no index column / default 0, 1, 2, 3... index".
        data = pd.read_csv(f, index_col=0)

data.head()
查看缺失值
# Missing-value test: True where a value is missing, False otherwise.
data.isnull()

# Missing-value count: number of missing values in each column.
data.isnull().sum(axis=0)

# Drop rows: remove every row that contains a missing value.
data.dropna()

# Same as above, but in place: `data` itself is replaced by the result.
data.dropna(inplace=True)

# Drop columns: remove every column that contains a missing value.
data.dropna(axis=1)

# Drop rows only when ALL of their values are missing.
data.dropna(how='all')

# Keep only the rows that have at least n NON-missing values.
data.dropna(thresh=n)

# Drop the rows whose value in column 'C' is missing (rows, not columns).
data.dropna(subset=['C'])
权重系数取绝对值后排序(查看特征权重重要度)
# Weight coefficient of each feature, labelled with the feature names.
# lr.coef_[0] is passed because a one-dimensional array is needed here
# (per the original note, anything else raises an error).
pd.Series(lr.coef_[0], index=X.columns)

# Feature importance: absolute value of each weight, sorted in descending order.
pd.Series(lr.coef_[0], index=X.columns).abs().sort_values(ascending=False)