最近在处理文件时,需要将一行文本按照一定的分隔符拆成多行。例如:
ABC123,1,2,1!#2,3,4!#5!#6
ABC123是ID,后面各列是属性。按“,”分割后,第4列和第6列各含多个属性值,属性值之间用“!#”分隔(如 4!#5!#6)。我想要拆成:
ABC123,1,2,1,3,4
ABC123,,,2,,5
ABC123,,,,,6
直接上代码:
# -*- coding:utf-8 -*-
"""Expand multi-valued columns of a delimited text file into several rows.

Each input line is split on a first-level separator into columns; every
column after the first is split again on a second-level separator into a
list of values.  One output row is written per value position, repeating
the ID (first column) on every row and leaving a cell empty where a column
has no value at that position.

Example with split1=",", split2="!#" and output separator ",":

    ABC123,1,2,1!#2,3,4!#5!#6

becomes

    ABC123,1,2,1,3,4
    ABC123,,,2,,5
    ABC123,,,,,6

Usage: python <script> <input file> <split 1> <split 2> <output split>

The result is written to "<input file>_output"; progress is logged to
"log.txt" in the current working directory.
"""
import sys
import time


def _timestamp():
    """Return the current local time formatted for the log file."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def _log(log, message):
    """Write one timestamped line to *log* and flush it immediately.

    Flushing keeps the progress lines visible while a large file is still
    being processed, which is the purpose of the log.
    """
    log.write("[" + _timestamp() + "]" + message + "\n")
    log.flush()


def expand_line(line, split1, split2, osplit):
    """Return the output rows (without trailing newline) for one input line.

    line   -- raw input line; a trailing newline is ignored.
    split1 -- first-level separator that delimits the columns.
    split2 -- second-level separator that delimits values inside a column.
    osplit -- separator placed between the cells of every output row.

    The ID is the first split2-segment of the first column.  The number of
    rows equals the largest value count found in the remaining columns, so
    a line that consists of the ID column only yields no rows at all.
    """
    columns = [cell.split(split2) for cell in line.rstrip("\n").split(split1)]
    record_id = columns[0][0]
    attributes = columns[1:]
    if not attributes:
        return []

    depth = max(len(values) for values in attributes)
    rows = []
    for level in range(depth):
        # A column with fewer values than `depth` contributes an empty cell.
        cells = [
            values[level] if level < len(values) else ""
            for values in attributes
        ]
        rows.append(record_id + osplit + osplit.join(cells))
    return rows


def main(argv=None):
    """Run the conversion; return 0 on success or 2 on bad arguments.

    argv -- argument list without the program name; defaults to
            sys.argv[1:].  Expected order: input file, first-level
            separator, second-level separator, output separator.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 4:
        sys.stderr.write(
            "Usage: python <script> <input file> <split 1> <split 2> "
            "<output split>\n"
        )
        return 2
    inputfile, split1, split2, osplit = args[:4]
    if not split1 or not split2:
        # str.split() rejects an empty separator; fail early and clearly.
        sys.stderr.write("split 1 and split 2 must not be empty\n")
        return 2

    generated = 0
    # source is the input, log the progress log, output the result file.
    with open(inputfile, "r") as source, \
            open("log.txt", "w") as log, \
            open(inputfile + "_output", "w") as output:
        _log(log, "Start to Process: " + inputfile)
        for count, line in enumerate(source, 1):
            rows = expand_line(line, split1, split2, osplit)
            output.writelines(row + "\n" for row in rows)
            generated += len(rows)
            if count % 1000 == 0:
                # Report progress every 1000 input lines.
                _log(
                    log,
                    "Processed " + str(count)
                    + " inputfile Line! Generated " + str(generated)
                    + " outputfile Line!",
                )
        _log(log, "Finished Processing: " + inputfile)
    return 0


if __name__ == "__main__":
    sys.exit(main())
执行方法:控制台
python ./array.py array.txt "," "!#" "|"
实测70万条记录的文件拆成了1700+万条的文件。
笔记本CPU:Intel® Core™ i7-6600U CPU @ 2.60GHz 2.81GHz,用了将近半小时
服务器CPU:Intel® Xeon® CPU E5-2686 v4 @ 2.30GHz,用了10分钟
然后用shell脚本把文件拆成几份,并行启动多个python进程分别处理(研究了半天python多线程,放弃了),时间缩短到6分钟。脚本如下:
# Split the input file into line-aligned chunks, expand every chunk with a
# background python worker, then concatenate the per-chunk results into
# "<input file>_output".
#
# Requires bash (arithmetic for-loop, arrays) and GNU split (-n l/N).
# Arguments: <input file> <split 1> <split 2> <output split>
# Exits non-zero when the argument count is wrong, when split fails, or
# when any worker fails.
#
# NOTE(review): every worker is started in the same working directory; if
# the python script writes a fixed-name log file there, the workers will
# overwrite each other's log -- confirm against the worker script.
main() {
    if [ $# -ne 4 ]; then
        echo "Process Failed, Need Arguments[Input File, split 1, split 2, output split]."
        # A negative exit status is not portable; use a plain failure code.
        exit 1
    fi

    local input=$1 split1=$2 split2=$3 osplit=$4
    local threadNum=4
    local worker=/data/Operation/iPinYou/data_apend_select/output_appended_attributes_data/array3.py
    local i chunk pid status=0
    local pids=() outputs=()

    echo "$(date +%Y-%m-%d:%H:%M:%S)" Start !

    # -a 2 pins the numeric suffix width so the chunk names built below
    # always match what split actually created.
    split -n "l/${threadNum}" -a 2 -d -- "$input" "${input}_" || exit 1

    for ((i = 0; i < threadNum; i++)); do
        chunk=$(printf '%s_%02d' "$input" "$i")
        # Quote the separators: values such as "*" or " " must reach the
        # worker unchanged instead of being globbed or word-split.
        python "$worker" "$chunk" "$split1" "$split2" "$osplit" &
        pids+=("$!")
        outputs+=("${chunk}_output")
    done

    # Wait for each worker individually so that a failure is not lost.
    for pid in "${pids[@]}"; do
        wait "$pid" || status=1
    done
    if [ "$status" -ne 0 ]; then
        echo "Process Failed, a worker exited with an error." >&2
        exit 1
    fi

    # Concatenate exactly the chunks produced by this run, in order, so
    # stale "*_output" files from an earlier run are never picked up.
    cat -- "${outputs[@]}" > "${input}_output"

    echo "$(date +%Y-%m-%d:%H:%M:%S)" Finished !
}

main "$@"