Python数据分析:数据特征分析
文章目录
- 1. 定量数据分布分析
- 极差、组距、频率分布表、频率分布直方图
- 不同样式的直方图
- 2. 定性数据分布分析
- 饼图、条形图
- 时间序列图:概述及时间格式转换
- 时间序列图:横坐标为时间的折线图、周期性分析
- 时间序列图:绘制不同样式的折线图
- 散点图:不同组别数据绘制在一幅图中
1. 定量数据分布分析
极差、组距、频率分布表、频率分布直方图
# Distribution analysis of quantitative data: range, equal-width bins,
# frequency table and an annotated frequency bar chart.
import pandas as pd
import numpy as np
# pyplot (not the bare `matplotlib` package) is what provides text()/show().
import matplotlib.pyplot as plt

# Configure a CJK-capable font once, before anything is drawn.
plt.rcParams['font.sans-serif'] = ['SimHei']

data = pd.read_csv('myexcel2.csv')
sale_data = data['总额']
# print(sale_data)

# Range = max - min; np.ptp() computes the same value and is printed next to it.
jicha = sale_data.max() - sale_data.min()
print(jicha, np.ptp(sale_data))

# Build the interval table: 5 equal-width bins. right=False makes every
# interval closed on the left and open on the right, i.e. [a, b).
gcut = pd.cut(sale_data, 5, right=False)
data['参考总价分组'] = gcut.values
gcut_data = gcut.value_counts(sort=False)  # keep the bins in interval order
print(gcut_data)

# Frequency distribution table.
p_l = pd.DataFrame(gcut_data)
p_l.columns = ['频数']
# Divide the count column (not the whole frame) by the total count.
p_l['频率'] = p_l['频数'] / p_l['频数'].sum()
p_l['累计频率'] = p_l['频率'].cumsum()
# Percent-formatted copies of the frequency and cumulative frequency.
p_l['频率%'] = p_l['频率'].map(lambda x: "%.2f%%" % (x * 100))
p_l['累计频率%'] = p_l['累计频率'].map(lambda x: "%.2f%%" % (x * 100))
print(p_l)

# Frequency bar chart, one bar per interval.
p_l['频率'].plot(kind='bar', width=0.8, figsize=(6, 3), rot=25,
               color='b', grid=False, alpha=0.7)

# Label each bar with its raw count, slightly above the top of the bar.
for i, (j, k) in enumerate(zip(p_l['频率'], p_l['频数'])):
    plt.text(i - 0.1, j + 0.01, '%i' % k, color='r')

plt.show()
------------------------------------------------------
[-25.0, 20.0)       3
[20.0, 65.0)        5
[65.0, 110.0)       4
[110.0, 155.0)      3
[155.0, 200.225)    5
Name: 总额, dtype: int64
                  频数    频率  累计频率     频率%    累计频率%
[-25.0, 20.0)      3  0.15  0.15  15.00%   15.00%
[20.0, 65.0)       5  0.25  0.40  25.00%   40.00%
[65.0, 110.0)      4  0.20  0.60  20.00%   60.00%
[110.0, 155.0)     3  0.15  0.75  15.00%   75.00%
[155.0, 200.225)   5  0.25  1.00  25.00%  100.00%
不同样式的直方图
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1000 rows x 4 columns of standard-normal samples, indexed by 1000
# consecutive dates starting at 2000-01-01.
samples = np.random.randn(1000, 4)
date_index = pd.date_range('1/1/2000', periods=1000)
data = pd.DataFrame(samples, index=date_index, columns=list('ABCD'))
print(data.head())

# Alternative histogram styles, kept for reference:
# data['A'].hist(alpha=0.7)            # histogram of a single column
# data.hist(bins=50, alpha=0.7)        # one separate histogram per column
# data.plot.hist(bins=50, alpha=0.7)   # all columns drawn on one axes
hist_options = dict(stacked=True, bins=50, alpha=0.7)
data.plot.hist(**hist_options)  # stacked=True stacks the columns on each other
plt.show()
2. 定性数据分布分析
饼图、条形图、时间序列图
饼图、条形图
# Qualitative analysis: pie chart.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# The title is Chinese, so a CJK-capable font has to be configured.
plt.rcParams['font.sans-serif'] = ['SimHei']

# The CSV is only loaded and printed; the chart uses the hard-coded values below.
data = pd.read_csv('myexcel2.csv', header=0)
print(data)

labels = ['A', 'B', 'C', 'D']
share = [422, 262, 365, 824]
# One colour per wedge. With only three colours the cycle wraps around and
# wedge D would be drawn red, right next to the red wedge A.
colors = ['red', 'yellowgreen', 'lightskyblue', 'gold']
explode = (0.05, 0, 0, 0)  # pull wedge A slightly out of the pie

# radius: pie radius; autopct: percentage format; pctdistance: distance of
# the percentage text from the centre; labeldistance: distance of the labels
# from the centre; startangle: angle at which the first wedge starts.
plt.pie(
    share,
    labels=labels,
    radius=0.8,
    autopct='%3.1f%%',
    pctdistance=0.6,
    colors=colors,
    startangle=180,
    labeldistance=1.1,
    explode=explode,
    shadow=True,
)
plt.title('菜品销售量分布')  # chart title
plt.show()
# Qualitative analysis: horizontal bar chart.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv('myexcel1.csv', header=0)

# Number of rows per distinct value of the '名称' column.
name_column = data['名称']
category_counts = name_column.value_counts()
print(len(category_counts))  # how many distinct categories there are
print(category_counts)       # the count for each category

category_counts.plot(kind='barh', color='blue', alpha=0.7)
plt.title('房屋朝向分析', size=15)
plt.xlabel('房屋数量', size=15)
plt.ylabel('房屋朝向', size=15)

plt.rcParams.update({
    'font.sans-serif': [u'SimHei'],   # font able to render the Chinese labels
    'axes.unicode_minus': False,      # render minus signs correctly
})
plt.show()
------------------------------------------------------
东     7
南     4
东南    4
西南    3
西     2
时间序列图:概述及时间格式转换
时间序列图:实现时间序列的可视化,及周期性的可视化。
导入的时间数据默认是字符串类型。因此,在可视化时,可能出现没有按时间先后顺序绘图的情况,需要先将字符串解析为时间类型。
将字符串解析为时间类型的 3 种方法:
1 在读取数据的时候,采用parse_dates=True,自动解析其中的时间数据
2 使用dateutil包中的parser.parse解析时间字符串
3 利用pandas的to_datetime处理时间list
# Parse a date string with dateutil's parser.parse.
from dateutil.parser import parse

date_text = '2018-09-02'
parsed_date = parse(date_text)
print("解析后的时间格式为:", parsed_date)
------------------------------------------------------
# Convert a list of date strings with pd.to_datetime.
import pandas as pd

datestrs = [
    '2018/09/02',
    '2018/09/03',
    '2018/09/04',
]
parsed_dates = pd.to_datetime(datestrs)
print(parsed_dates)
时间序列图:横坐标为时间的折线图、周期性分析
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the file with the first column as the index; parse_dates=True asks
# pandas to parse that index as dates.
# Data source: https://www.cnblogs.com/yimengtianya1/p/9576431.html
data = pd.read_csv('1213.csv', index_col=0, parse_dates=True)
print(data.head())

# Line chart of sales over time.
plt.plot(data['销售额'], label='销售额')
plt.xticks(rotation=50)  # rotate the x tick labels
plt.show()

# Periodicity chart: use the '星期' column as the row index instead.
data = data.set_index('星期')
print(data.head())

sales = data['销售额']
count = sales.count()   # number of non-null rows (29 in the sample output)
circle = count // 7     # number of complete 7-row cycles (4 in the sample output)
print(count, circle)

# Overlay one line per complete 7-row cycle; leftover rows are not drawn.
for start in range(0, 7 * circle, 7):
    plt.plot(sales.iloc[start:start + 7])
plt.show()
------------------------------------------------------
             星期      销售额
日期
2018-08-01  Wed  4702986
2018-08-02  Thu  5034151
2018-08-03  Fri  5636981
2018-08-04  Sat  6377764
2018-08-05  Sun  6138548
         销售额
星期
Wed  4702986
Thu  5034151
Fri  5636981
Sat  6377764
Sun  6138548
29 4
时间序列图:绘制不同样式的折线图
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Single line chart: 1000 standard-normal samples on 1000 consecutive dates,
# plotted as a running total.
noise = np.random.randn(1000)
dates = pd.date_range('1/1/2000', periods=1000)
data = pd.Series(noise, index=dates)
# print(data.head())
data = data.cumsum()
data.plot()
plt.show()

# Four columns sharing the same date index, all drawn on one axes.
df = pd.DataFrame(
    np.random.randn(1000, 4),
    index=data.index,
    columns=list('ABCD'),
)
print(df.head())
df = df.cumsum()
df.plot()
plt.legend(loc='best')
plt.show()

# The same four columns, each drawn in its own subplot.
df.plot(subplots=True)
plt.legend(loc='best')
plt.show()
------------------------------------------------------
                   A         B         C         D
2000-01-01 -0.410500  0.747189  0.448807  0.405169
2000-01-02  1.251914 -0.079718 -0.688459  1.887231
2000-01-03  0.885157 -0.428284  0.494870  1.175288
2000-01-04  0.316397 -1.934446  0.424298 -1.753221
2000-01-05 -0.079853 -1.188323 -1.034872 -0.696540
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Two running totals of standard-normal samples; column B is used as the
# x coordinate and column C as the y coordinate.
steps = pd.DataFrame(np.random.randn(1000, 2), columns=['B', 'C'])
data = steps.cumsum()
print(data.head())

data.plot(x='B', y='C')
plt.show()
------------------------------------------------------
          B         C
0 -0.868427  0.269645
1 -0.783871 -2.097551
2 -1.961644 -1.483593
3 -1.973470 -1.421133
4 -2.626177 -1.129055
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1000 rows x 4 columns of standard-normal samples on 1000 consecutive dates,
# turned into running totals.
dates = pd.date_range('1/1/2000', periods=1000)
raw = pd.DataFrame(np.random.randn(1000, 4), index=dates, columns=list('ABCD'))
data = raw.cumsum()
print(data.head())

# x_compat is a keyword argument of plot() that affects how the date axis
# ticks are handled (see the pandas plotting documentation).
data['A'].plot(x_compat=True)               # a single column
data.plot(x_compat=True)                    # all columns on one axes
data.plot(x_compat=True, subplots=True)     # every column in its own subplot
plt.show()
------------------------------------------------------
                   A         B         C         D
2000-01-01  0.636463  1.950795 -0.661356  1.858671
2000-01-02  1.044353  2.750769 -0.495825  0.211315
2000-01-03  1.280628  2.947151 -0.139967  0.999238
2000-01-04 -0.934980  3.316594  1.021859 -0.119952
2000-01-05 -2.030070  2.410148  1.590350 -1.081503
散点图:不同组别数据绘制在一幅图中
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(111)

# 50 rows x 4 columns of uniform random values in [0, 1).
data = pd.DataFrame(np.random.rand(50, 4), columns=list('abcd'))

# Each group is (x column, y column, colour, legend label); both groups are
# drawn on the same axes with identical marker settings.
groups = (
    ('a', 'b', 'DarkBlue', 'Group1'),
    ('c', 'd', 'Red', 'Group2'),
)
for x_col, y_col, group_color, group_label in groups:
    ax.scatter(
        x=data[x_col],
        y=data[y_col],
        color=group_color,
        s=120,
        alpha=0.7,
        linewidths=None,
        label=group_label,
    )

plt.legend()
plt.show()
print
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Load the CO2 dataset bundled with statsmodels as a pandas DataFrame.
data = sm.datasets.co2.load_pandas()
co2 = data.data
print(co2.head(5))

# The inspection steps below were bare expressions, which only display
# anything inside a notebook cell; print them so a plain script shows them too.
print(co2.index)

# Resample the 'co2' column to month-start ('MS') frequency, averaging each month.
y = co2['co2'].resample('MS').mean()
print(y.head(5))
print(y['1990':])          # label-based slice from 1990 onwards
print(y.isnull().sum())    # missing months before filling

# Fill each gap with the next valid observation (same result as the
# original fillna(y.bfill()), without building the intermediate series).
y = y.bfill()
print(y.isnull().sum())    # missing months after filling

y.plot(figsize=(12, 6))
plt.show()
import os

import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Load the CO2 dataset bundled with statsmodels as a pandas DataFrame.
data = sm.datasets.co2.load_pandas()
co2 = data.data

# Export to CSV. to_csv() needs a path (or file object); the original passed
# the dataset object `data` itself. The file name here is an arbitrary choice
# and is written relative to the current working directory.
co2.to_csv('co2.csv')

# Show the current working directory, i.e. where the CSV was written.
print(os.getcwd())
# In IPython/Jupyter the `%pwd` magic shows the same directory; it is not
# valid Python syntax in a plain script, so it is left as a comment.
时间序列
https://blog.csdn.net/oh5w6hinug43jvrhhb/article/details/78360686
https://m.jb51.net/article/167032.htm
https://my.oschina.net/zhiyonghe/blog/906307
https://blog.csdn.net/hustqb/article/details/80717055
http://www.360doc.com/content/19/0708/06/39062348_847360415.shtml#
https://www.cnblogs.com/yimengtianya1/p/9576431.html
https://blog.csdn.net/hustqb/article/details/80722766
https://blog.csdn.net/qq_37635049/article/details/82012135
https://blog.csdn.net/weixin_39778570/article/details/81157040
https://blog.csdn.net/weixin_39778570/article/details/81157884
https://www.kaggle.com/datasets?search=stock