A while back, when I was just starting to learn Python, I twice applied to subscribe to the python-chinese mailing list, using a different email address each time, and both attempts went nowhere. On each occasion, apart from the automatic confirmation message (which I replied to), I never heard anything again. The downloadable version of the python-chinese archive comes as gzip files, and after downloading one I couldn't extract it with WinRAR. A look at my copy showed it was ancient, version 3.3, which didn't support the format yet. I then grabbed the latest portable WinRAR 3.71, but it failed the same way, complaining that the archive was corrupt and that no archive could be found, and so on.
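In hindsight, the archive is an ordinary gzip file, and Python's own gzip module should be able to unpack it without any help from WinRAR; a minimal sketch, with a hypothetical archive name:

# -*- coding: cp936 -*-
import gzip

# Hypothetical filename: one monthly archive downloaded from the list page.
archiveName = "2007-January.txt.gz"

f = gzip.open(archiveName, "rb")   # reads the compressed stream transparently
data = f.read()
f.close()

out = open("2007-January.txt", "wb")
out.write(data)
out.close()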
Fine. So I can't subscribe, and the downloadable gzip version won't open for me, but surely nothing stops me from downloading the past mail archives themselves? All I really want to do is learn from them. Last night, after polishing up my program for downloading photos from the Myphoto board on newsmth, I spent a good while reading the newsmth Python board and have pretty much gone through it. So today I wrote another script, this one to download the python-chinese mail archives; once the download is finished I'll turn them into an ebook to study.
The HTML of the python-chinese archive pages is fairly simple and easy to scrape. The source code is below; treat it as another exercise in learning Python. The code allows for the possibility that a network read may fail, and downloading is not multithreaded: it just fetches the pages one after another and lets the machine grind away.
#!/usr/bin/python
# -*- coding: cp936 -*-
# Filename: gethtml.py
import urllib
import os
import winsound                  # audible notification after each month (Windows only)
from sgmllib import SGMLParser
# Parses the top-level archive index at
# http://python.cn/pipermail/python-chinese/
class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []
        self.titles = []
        self.isBody = False

    def start_tr(self, attrs):
        # A new table row: take only the first link inside it.
        self.isBody = True

    def start_a(self, attrs):
        if self.isBody:
            href = [v for k, v in attrs if k == 'href']
            if href:
                self.urls.extend(href)
                self.isBody = False
# Parses a monthly listing page and collects the links to the individual
# messages, which sit in <li> items between the first <p> and the anchor
# named "end".
class MailLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.mailUrls = []
        self.isBody = False
        self.times = 0
        self.inLi = False

    def start_p(self, attrs):
        # The first <p> marks the start of the message listing.
        if self.times == 0:
            self.isBody = True
            self.times = 1

    def start_li(self, attrs):
        self.inLi = self.isBody

    def start_a(self, attrs):
        # The anchor named "end" marks the end of the listing.
        name = [v for k, v in attrs if k == 'name']
        if name and name[0] == "end":
            self.isBody = False
            self.inLi = False

        if self.inLi:
            href = [v for k, v in attrs if k == 'href']
            if href:
                self.mailUrls.extend(href)
                self.inLi = False
def getAllPaperMail(baseUrl="http://python.cn/pipermail/python-chinese/"):
    sock = urllib.urlopen(baseUrl)
    htmlSource = sock.read()
    sock.close()

    parser = URLLister()
    parser.feed(htmlSource)
    parser.close()

    for url in parser.urls:
        # url looks like "<month>/<listing page>.html"
        indexurl = baseUrl + url
        myList = url.split('/')
        path = myList[0].replace(' ', "")
        filename = myList[1]
        newurl = baseUrl + path
        if not os.path.exists(path):
            os.mkdir(path)
        filename = path + '/' + filename

        # download the monthly index page
        urllib.urlretrieve(indexurl, filename)
        # then fetch every message listed on it
        print "parse url= ", newurl
        getPaperMail(newurl, path)
        winsound.Beep(783, 200)
def getPaperMail(baseUrl, path):
    # Fetch the monthly listing page, giving up after 6 failed attempts.
    isOK = False
    try1 = 0
    try2 = 0
    htmlSource = ""
    while try1 < 6 and not isOK:
        try:
            sock = urllib.urlopen(baseUrl)
            htmlSource = sock.read()
            sock.close()
        except:
            try2 = try2 + 1
        if try2 > try1:
            try1 = try1 + 1
        else:
            isOK = True
    if not isOK:
        print "get pageurl fail---", baseUrl
        winsound.Beep(783, 200)
        return

    parser = MailLister()
    parser.feed(htmlSource)
    parser.close()

    for url in parser.mailUrls:
        newurl = baseUrl + '/' + url
        filename = path + '/' + url
        # download each mail, again retrying up to 6 times
        print "get: ", newurl

        isOK = False
        try1 = 0
        try2 = 0
        while try1 < 6 and not isOK:
            try:
                urllib.urlretrieve(newurl, filename)
            except:
                try2 = try2 + 1
            if try2 > try1:
                try1 = try1 + 1
            else:
                isOK = True
if __name__ == "__main__":
    print "Start."
    url = r'http://python.cn/pipermail/python-chinese/'
    getAllPaperMail(url)
    print "The end."