背景
有一个工作邮箱,会不断收到许多人投递的简历。由于邮件数量比较多,因此产生了一个需求:自动将邮件从邮件服务器取回到本地,并将邮件的基本信息存入本地的sqlite数据库,邮件的正文则以单独文件的形式存放在文件夹下。
实现
备注:在python2.7下测试运行正常,如果用python3,可能需要对代码稍做修改。
1.邮件配置参数文件
mail.conf
[mail163]
#此处应写上你实际的帐号与密码
user = xxxx@163.com
password = xxxxx
pop3_server = pop3.163.com

[sqlite]
dir = sqlite
fileName = mailLog.db
2.sqlite数据表的结构
3.从邮件服务器收取邮件的python代码
mailManager.py
# -*- coding:utf-8 -*-
# 读取邮件并解码存入日志数据库
import
poplib
import
email
import
ConfigParser
import
os
,
sys
,
string
,
time
from
email
.
parser
import
Parser
from
email
.
header
import
decode_header
from
email
.
utils
import
parseaddr
from
logHelper
import
LogHelper
# Return the decoded text body of a message
def getBody(msg, guessedCharset='gb2312'):
    """Return the decoded text of ``msg``, concatenating all of its text parts.

    A multipart message is walked recursively and the text of every part is
    appended in order. A non-multipart part contributes only when its content
    type is ``text/plain`` or ``text/html``; anything else yields ``''``.

    Args:
        msg: an ``email.message.Message`` (or one of its sub-parts).
        guessedCharset: charset used when a part does not declare its own.

    Returns:
        The decoded text, or ``''`` when the message has no text part. The
        value is whatever ``decodeString`` returns, so it may still be the raw
        payload if decoding failed.
    """
    if msg.is_multipart():
        bodyText = ''
        for part in msg.get_payload():
            try:
                # Pass the fallback charset down instead of silently
                # resetting it to the default for nested parts.
                bodyText += getBody(part, guessedCharset)
            except UnicodeDecodeError as e:
                # Best effort: a part that cannot be concatenated is skipped.
                print(e)
        return bodyText

    if msg.get_content_type() not in ('text/plain', 'text/html'):
        return ''

    content = msg.get_payload(decode=True)
    if content is None:
        return ''

    # Each MIME part may declare its own charset; prefer it over the guess
    # made for the message as a whole.
    charset = msg.get_content_charset() or guessedCharset
    return decodeString(content, charset, 'body', guessedCharset)
# Decode a string taken from a mail header
def decode_strInHeader(s, guessedCharset='gb2312'):
    """Decode a (possibly RFC 2047 encoded) header string.

    ``decode_header`` splits the header into ``(value, charset)`` fragments.
    Every fragment is decoded with ``decodeString`` and the results are
    concatenated, so long headers that were split into several encoded-words
    are no longer truncated to their first fragment.

    Args:
        s: the raw header value.
        guessedCharset: charset used for fragments that declare none.

    Returns:
        The decoded header text. If the fragments cannot be concatenated
        (some of them could not be decoded), the first decoded fragment is
        returned.
    """
    pieces = [
        decodeString(value, charset, 'header', guessedCharset)
        for value, charset in decode_header(s)
    ]
    if not pieces:
        return ''
    if len(pieces) == 1:
        return pieces[0]
    try:
        # NOTE: fragments are joined without a separator; whitespace that
        # decode_header dropped between fragments is not restored.
        return ''.join(pieces)
    except (UnicodeDecodeError, TypeError):
        # Mixed decoded/undecoded fragments cannot be joined safely.
        return pieces[0]
# Decode a byte string
def decodeString(s, charset, extra='header', guessedCharset='gb2312'):
    """Decode the byte string ``s`` using ``charset``, with lenient fallbacks.

    Args:
        s: the value to decode. Values that are not byte strings (already
            decoded text) are returned unchanged.
        charset: declared charset name, or None to use ``guessedCharset``.
            Surrounding whitespace and quotes are removed before use.
        extra: label printed in diagnostics to tell where the string came
            from (for example ``'header'`` or ``'body'``).
        guessedCharset: charset used when ``charset`` is None.

    Returns:
        The decoded text, or ``s`` unchanged when no charset is known or
        every decoding attempt failed (a diagnostic is printed in that case).
    """
    value = s
    if charset is None:
        charset = guessedCharset
    if not charset or not isinstance(value, bytes):
        return value

    # Remove noise around the charset name; quotes are the common case.
    charset = charset.strip().strip('"\'').strip()
    if not charset:
        return value

    try:
        return value.decode(charset)
    except (LookupError, ValueError):
        # LookupError: unknown codec name; ValueError covers UnicodeDecodeError.
        pass

    # Compare case-insensitively: charset names such as "GB2312" or "UTF-8"
    # must reach the same fallbacks as their lower-case spellings.
    normalized = charset.lower().replace('_', '-')
    if normalized == 'gb2312':
        # gbk is a superset of gb2312 and is often what the sender really used.
        try:
            return value.decode('gbk')
        except UnicodeDecodeError:
            print("decode error in decodeString!", 'gbk', extra)
    elif normalized in ('utf8', 'utf-8'):
        # Drop the undecodable bytes; with 'ignore' this cannot raise.
        return value.decode('utf8', 'ignore')
    else:
        # In practice the mails seen so far are either gb2312 or utf-8.
        print("decode error in decodeString!", charset, extra)
    return value
# Guess the charset declared by a message
def guess_charset(msg):
    """Return the lower-cased charset name declared by ``msg``, or None.

    The charset is looked up in this order: the message's charset object,
    the ``charset`` parameter of its Content-Type header, and finally a
    plain text search for ``charset=`` in that header (for headers too
    malformed to be parsed as parameters).

    Args:
        msg: an ``email.message.Message``.

    Returns:
        The charset name as a string without quotes or trailing parameters,
        or None when the message declares none.
    """
    charset = msg.get_charset()
    if charset is not None:
        # get_charset() yields a Charset object; return its name so the
        # result can be handled as a plain string.
        return str(charset)

    declared = msg.get_content_charset()
    if declared:
        return declared

    content_type = msg.get('Content-Type', '').lower()
    pos = content_type.find('charset=')
    if pos < 0:
        return None
    # Keep only the charset value: cut at the next parameter separator and
    # drop surrounding quotes/whitespace.
    raw = content_type[pos + 8:].split(';', 1)[0]
    return raw.strip().strip('"\'').strip() or None
# Current date as a string
def today():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    # With no time tuple supplied, strftime formats the current local time.
    return time.strftime("%Y-%m-%d")
# Make sure a directory exists
def ensureDir(dir):
    """Create the directory ``dir`` (and any missing parents) if needed.

    Args:
        dir: path of the directory that must exist afterwards.

    Raises:
        OSError: if the directory cannot be created and does not already
            exist as a directory.
    """
    try:
        # makedirs also creates missing parent directories, which a plain
        # mkdir would reject.
        os.makedirs(dir)
    except OSError:
        # Already existing (possibly created concurrently) is fine;
        # anything else is a real failure.
        if not os.path.isdir(dir):
            raise
# Log a single mail
def logOneMail(server, index, dir, logHelper, parseScope='new'):
    """Fetch mail number ``index``, save its body to a file and log it.

    Args:
        server: a logged-in ``poplib.POP3`` connection.
        index: POP3 message number to retrieve.
        dir: base directory; the body is stored in a sub-directory named
            after today's date.
        logHelper: object providing ``msgExists(messageID)`` and
            ``append(messageID, fromAddr, subject, contentFile, dateStr)``.
        parseScope: when ``'new'``, an already-logged Message-ID marks the
            end of the new mails.

    Returns:
        ``'scopeEnd'`` if ``parseScope`` is ``'new'`` and the mail was
        already logged, otherwise ``'ok'`` after the mail has been logged.
    """
    print('log Mail:', index)
    resp, lines, octets = server.retr(index)

    # ``lines`` holds the raw message line by line; join them back together.
    msgRaw = b'\r\n'.join(lines)
    # Parsing builds the message structure and does the basic MIME handling.
    msg = email.message_from_string(msgRaw)

    guessedCharset = guess_charset(msg)

    # Missing headers are treated as empty strings.
    subject = decode_strInHeader(msg.get("subject", ''), guessedCharset)

    senderName, senderAddr = parseaddr(msg.get("from", ''))
    # The display name is what gets logged; fall back to the bare address so
    # a From header without a display name is not stored as empty.
    fromAddr = decode_strInHeader(senderName or senderAddr, guessedCharset)

    messageID = decode_strInHeader(msg.get("Message-ID", ''), guessedCharset)
    print('mail message id:', messageID)

    dateStr = decode_strInHeader(msg.get("Date", ''), guessedCharset)

    # Stop before touching the disk: an already-logged mail must not be
    # saved again under today's folder.
    if parseScope == 'new' and logHelper.msgExists(messageID):
        return 'scopeEnd'

    # The file name is derived from the Message-ID, which is sender-controlled;
    # neutralise path separators and characters invalid in file names.
    # NOTE(review): a mail without a Message-ID still maps to ".html".
    baseName = messageID.strip().replace('<', '').replace('>', '')
    for unsafe in '/\\:*?"|':
        baseName = baseName.replace(unsafe, '_')

    # Bodies are grouped in one folder per day. The doubled '/' is kept so the
    # recorded path has the same form as before.
    curDir = dir + '/' + today() + '/'
    ensureDir(curDir)
    contentFile = curDir + '/' + baseName + '.html'

    if not os.path.exists(contentFile):
        body = getBody(msg, guessedCharset)
        # Encode explicitly instead of relying on the interpreter's default
        # encoding; an undecoded (byte) body is written as-is.
        if not isinstance(body, bytes):
            body = body.encode('utf-8')
        with open(contentFile, 'wb') as outFile:
            outFile.write(body)

    # Record the mail in the log database.
    logHelper.append(messageID, fromAddr, subject, contentFile, dateStr)
    return 'ok'
# Log the most recent mails from the mail server
def logTheMails(progressKey, parseScope='new'):
    """Fetch recent mails from the POP3 server and log them, newest first.

    Connection settings and the database location are read from
    ``mail.conf`` in the current working directory.

    Args:
        progressKey: identifier of a batch progress counter; currently
            unused and reserved for later.
        parseScope: ``'all'`` to walk every mail, ``'new'`` to stop at the
            first mail that is already logged, or a number (int or numeric
            string) to process that many of the newest mails.

    Returns:
        The number of mails that were logged.

    Raises:
        ValueError: if ``parseScope`` is neither ``'all'``, ``'new'`` nor a
            number.
    """
    # Read the configuration file.
    cf = ConfigParser.ConfigParser()
    cf.read("mail.conf")
    user = cf.get("mail163", "user")
    password = cf.get("mail163", "password")
    pop3_server = cf.get("mail163", "pop3_server")
    mailDir = cf.get("sqlite", "dir")
    dbFileFullName = mailDir + '/' + cf.get("sqlite", "fileName")

    # Connect to the POP3 server.
    # NOTE(review): this is a plain-text connection; consider POP3_SSL if
    # the server supports it.
    server = poplib.POP3(pop3_server)
    receivedCount = 0
    try:
        # Authenticate.
        server.user(user)
        server.pass_(password)

        # Open the log database.
        logHelper = LogHelper(dbFileFullName)

        # list() returns one entry per mail in the mailbox.
        resp, mails, octets = server.list()
        total = len(mails)

        if parseScope in ('all', 'new'):
            logCount = total
        else:
            # Never ask for more mails than the mailbox holds, otherwise the
            # computed index would drop below 1.
            logCount = min(int(parseScope), total)

        # POP3 indices start at 1 and the newest mail has index ``total``.
        for index in range(total, total - logCount, -1):
            flag = logOneMail(server, index, mailDir, logHelper, parseScope)
            if flag == 'scopeEnd':
                break
            receivedCount += 1
    finally:
        # Always close the connection, even when fetching failed.
        server.quit()
    return receivedCount
# Log the mails whose index lies between a start and an end index
def logMailsByIndex(beginIndex, endIndex):
    """Fetch and log the mails numbered ``beginIndex`` .. ``endIndex``.

    Connection settings and the database location are read from
    ``mail.conf`` in the current working directory. Both indices are clamped
    to the valid POP3 range ``1 .. total``.

    Args:
        beginIndex: first POP3 message number to process (inclusive).
        endIndex: last POP3 message number to process (inclusive).

    Returns:
        The number of mails that were logged.
    """
    # Read the configuration file.
    cf = ConfigParser.ConfigParser()
    cf.read("mail.conf")
    user = cf.get("mail163", "user")
    password = cf.get("mail163", "password")
    pop3_server = cf.get("mail163", "pop3_server")
    mailDir = cf.get("sqlite", "dir")
    dbFileFullName = mailDir + '/' + cf.get("sqlite", "fileName")

    # Connect to the POP3 server.
    server = poplib.POP3(pop3_server)
    receivedCount = 0
    try:
        # Authenticate.
        server.user(user)
        server.pass_(password)

        # Open the log database.
        logHelper = LogHelper(dbFileFullName)

        # list() returns one entry per mail in the mailbox.
        resp, mails, octets = server.list()
        total = len(mails)

        # POP3 indices start at 1; clamping also makes an empty mailbox
        # produce an empty range instead of a request for mail 0.
        beginIndex = max(1, min(beginIndex, total))
        endIndex = min(endIndex, total)

        # Walk the requested range in ascending index order.
        # NOTE(review): logOneMail runs with its default scope 'new', so the
        # walk stops at the first mail that is already logged - confirm that
        # this is intended for index-based fetching.
        for index in range(beginIndex, endIndex + 1):
            flag = logOneMail(server, index, mailDir, logHelper)
            if flag == 'scopeEnd':
                break
            receivedCount += 1
    finally:
        # Always close the connection, even when fetching failed.
        server.quit()
    return receivedCount
4.根据命令行参数,读取指定范围(全部邮件、新邮件或最近若干封)的邮件的代码
fetchMails.py
# -*- coding:utf-8 -*-
#读取邮件
import
os
,
sys
,
string
import
time
import
getopt
import
mailManager
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# that UTF-8 becomes the implicit str/unicode conversion codec.
reload(sys)
sys.setdefaultencoding("utf-8")

# Parse the command line:
#   -p/--progKey     key of the progress counter (reserved)
#   -m/--mailBoxIdx  mailbox number (reserved)
#   -s/--scope       'all' parses every mail, 'new' only the newly received
#                    ones; a number parses that many of the newest mails
try:
    opts, args = getopt.getopt(
        sys.argv[1:], 'p:m:s:', ['progKey=', 'mailBoxIdx=', 'scope='])
except getopt.GetoptError:
    print('error:', 'options invalid')
    # Exit with a non-zero status so callers can detect the usage error.
    sys.exit(2)

progressKey = ''
parseScope = 'new'
for k, v in opts:
    if k in ("-p", "--progKey"):
        progressKey = v
    elif k in ("-m", "--mailBoxIdx"):
        mailBoxIndex = int(v)
    elif k in ("-s", "--scope"):
        parseScope = v

print('oldCwd:', os.getcwd())
# Switch the working directory to the one containing this file, so that
# relative paths such as mail.conf resolve next to the script.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
print('newCwd:', os.getcwd())

print('')
print('fetch mails : begin...')
print('')

startTime = time.time()
if progressKey == '':
    progressKey = 'tempKey1'

# Fetch the mails and log them into the sqlite database.
receivedCount = mailManager.logTheMails(progressKey, parseScope)

print('')
print('receivedCount:', receivedCount)
print('')

endTime = time.time()
print('used time/minutes: ', (endTime - startTime) / 60)