2007年07月05日 星期四 15:31
南方周末是我一直以来坚持看的报纸之一,并且每周四傍晚南方报业会把文章发布在网站上,为了方便拷到手机上看,写了一个小脚本。 # down html from zm and save html to txt # -*- coding:utf-8 -*- import htmllib, formatter, urllib, re website = 'http://www.nanfangdaily.com.cn/zm/' f = urllib.urlopen(website) html = f.read().lower() i = html.find('url=') j = html.find('/',i+4) date = html[i+4:j] website += date f = urllib.urlopen(website) p = htmllib.HTMLParser(formatter.NullFormatter()) p.feed(f.read()) p.close() seen = set() for url in p.anchorlist: if url[-3::] == 'asp': if url in seen: continue seen.add(url) urls=list(seen) k=len(urls) doc=open(u'南方周末'.encode('gb18030')+date+'.txt','a') for l, url in enumerate(urls): f = urllib.urlopen(website+url[1:]) html = f.read() i = html.find('#ff0000') i = html.find('>',i+7) j = html.find('<',i+1) doc.write(html[i+1:j]) i = html.find('content01',j+1) i = html.find('>',i+9) j = html.find(']*>',re.IGNORECASE) doc.write(reobj.sub('\n',content)+'\n------------\n') print l+1,'-->',k doc.close() print u'下载结束' -------------- 下一部分 -------------- 一个HTML附件被移除... URL: http://python.cn/pipermail/python-chinese/attachments/20070705/e6dd9e58/attachment.htm -------------- 下一部分 -------------- # down html from zm and save html to txt # -*- coding:utf-8 -*- import htmllib, formatter, urllib, re website = 'http://www.nanfangdaily.com.cn/zm/' f = urllib.urlopen(website) html = f.read().lower() i = html.find('url=') j = html.find('/',i+4) date = html[i+4:j] website += date f = urllib.urlopen(website) p = htmllib.HTMLParser(formatter.NullFormatter()) p.feed(f.read()) p.close() seen = set() for url in p.anchorlist: if url[-3::] == 'asp': if url in seen: continue seen.add(url) urls=list(seen) k=len(urls) doc=open(u'������������'.encode('gb18030')+date+'.txt','a') for l, url in enumerate(urls): f = urllib.urlopen(website+url[1:]) html = f.read() i = html.find('#ff0000') i = html.find('>',i+7) j = html.find('<',i+1) doc.write(html[i+1:j]) i = html.find('content01',j+1) i = html.find('>',i+9) j = html.find(']*>',re.IGNORECASE) doc.write(reobj.sub('\n',content)+'\n------------\n') print l+1,'-->',k doc.close() print u'������������'
Zeuux © 2025
京ICP备05028076号