2007年06月01日 星期五 09:57
用Firefox的Download All 插件从新浪读书频道下载了文章。想在Treo650里看纯文本。然后就用了以下两个小程序。算是Unix思想的一种体现吧。小工具只做一件事情。大家把自己电脑里的小宝贝拿出来分享啊。或者在网站上多一个Wiki页面? ##################### #html2txt.py ##################### from formatter import AbstractFormatter, NullWriter from htmllib import HTMLParser def _(str, in_encoder="gbk", out_encoder="utf8"): return unicode(str, in_encoder).encode(out_encoder) class myWriter(NullWriter): def __init__(self): NullWriter.__init__(self) self._bodyText = [] def send_flowing_data(self, str): self._bodyText.append(str) def _get_bodyText(self): return '\n'.join(self._bodyText) bodyText = property(_get_bodyText, None, None, 'plain text from body') class myHTMLParser(HTMLParser): def do_meta(self, attrs): self.metas = attrs def convertFile(filename): mywriter = myWriter() absformatter = AbstractFormatter(mywriter) parser = myHTMLParser(absformatter) parser.feed(open(filename).read()) return ( _(parser.title), parser.formatter.writer.bodyText ) import os import os.path OUTPUTDIR = "./txt" INPUTDIR = "." if __name__ == "__main__": if not os.path.exists(OUTPUTDIR): os.mkdir(OUTPUTDIR) for file in os.listdir(INPUTDIR): if file[-4:] == '.htm' or file[-5:] == '.html': print "Coverting", file, outfilename = os.path.splitext(file)[0] a, text = convertFile(file) outfilename = outfilename + '.txt' outfullname = os.path.join(OUTPUTDIR, outfilename) open(outfullname, "wt").write(text) print "Done!" 
################################ #pickupcontent.py ################################ # -*- coding: utf-8 -*- import sys import glob import os import re sys.argv[1:] = [item for arg in sys.argv[1:] for item in glob.glob(arg)] startstr = u"^八十".encode("gb2312") # article title endstr = u"^\[返回".encode("gb2312") # tmp_start = re.compile(startstr) tmp_end = re.compile(endstr) for infile in sys.argv[1:]: # print infile f = open(infile,'r') #print f lines = f.readlines() fout = '' for index, line in enumerate(lines): if tmp_start.match(line): kstart = index if tmp_end.match(line): kend = index break f.close() fout = fout.join(lines[kstart:kend]) tmp = open('tmp','w') tmp.write(fout) tmp.close() os.remove(infile) os.rename('tmp',infile) -------------- 下一部分 -------------- 一个HTML附件被移除... URL: http://python.cn/pipermail/python-chinese/attachments/20070601/2155e424/attachment.htm
2007年06月01日 星期五 10:19
On 6/1/07, Ben Luo <benluo在gmail.com> wrote: > 用Firefox的Download All > 插件从新浪读书频道下载了文章。想在Treo650里看纯文本。然后就用了以下两个小程序。算是Unix思想的一种体现吧。小工具只做一件事情。大家把自己电脑里的小宝贝拿出来分享啊。或者在网站上多一个Wiki页面? > 在啄木鸟维基一直有个"微项目" 的页面收集大家平日随手解决的小需求的开心代码; 你的收集到快乐的六.一节这天的页面了! http://wiki.woodpecker.org.cn/moin/MicroProj/2007-06-01 感谢分享! 提示,进行注释,以便帮助新人快速理解你的技巧,思路呢 ;) > ##################### > #html2txt.py > ##################### > > from formatter import AbstractFormatter, NullWriter > from htmllib import HTMLParser > > def _(str, in_encoder="gbk", out_encoder="utf8"): > return unicode(str, in_encoder).encode(out_encoder) > > > class myWriter(NullWriter): > def __init__(self): > NullWriter.__init__(self) > self._bodyText = [] > > def send_flowing_data(self, str): > self._bodyText.append(str) > > def _get_bodyText(self): > return '\n'.join(self._bodyText) > > bodyText = property(_get_bodyText, None, None, 'plain text from body') > > class myHTMLParser(HTMLParser): > def do_meta(self, attrs): > self.metas = attrs > > def convertFile(filename): > mywriter = myWriter() > absformatter = AbstractFormatter(mywriter) > parser = myHTMLParser(absformatter) > parser.feed(open(filename).read()) > return ( _(parser.title), > parser.formatter.writer.bodyText ) > > import os > import os.path > > OUTPUTDIR = "./txt" > INPUTDIR = "." > if __name__ == "__main__": > if not os.path.exists(OUTPUTDIR): > os.mkdir(OUTPUTDIR) > > for file in os.listdir(INPUTDIR): > if file[-4:] == '.htm' or file[-5:] == '.html': > print "Coverting", file, > outfilename = os.path.splitext(file)[0] > a, text = convertFile(file) > outfilename = outfilename + '.txt' > outfullname = os.path.join(OUTPUTDIR, outfilename) > open(outfullname, "wt").write(text) > print "Done!" 
> > ################################ > #pickupcontent.py > ################################ > > # -*- coding: utf-8 -*- > > import sys > import glob > import os > import re > > sys.argv[1:] = [item for arg in sys.argv[1:] for item in glob.glob(arg)] > startstr = u"^八十".encode("gb2312") # article title > endstr = u"^\[返回".encode("gb2312") # > tmp_start = re.compile(startstr) > tmp_end = re.compile(endstr) > for infile in sys.argv[1:]: > # print infile > f = open(infile,'r') > #print f > lines = f.readlines() > fout = '' > for index, line in enumerate(lines): > if tmp_start.match(line): > kstart = index > if tmp_end.match(line): > kend = index > break > > f.close() > fout = fout.join(lines[kstart:kend]) > tmp = open('tmp','w') > tmp.write(fout) > tmp.close() > os.remove(infile) > os.rename('tmp',infile) > > > > _______________________________________________ > python-chinese > Post: send python-chinese在lists.python.cn > Subscribe: send subscribe to > python-chinese-request在lists.python.cn > Unsubscribe: send unsubscribe to > python-chinese-request在lists.python.cn > Detail Info: > http://python.cn/mailman/listinfo/python-chinese > -- '''Time is unimportant, only life important! http://zoomquiet.org blog在http://blog.zoomquiet.org/pyblosxom/ wiki在http://wiki.woodpecker.org.cn/moin/ZoomQuiet scrap在http://floss.zoomquiet.org douban在http://www.douban.com/people/zoomq/ ____________________________________ Pls. use OpenOffice.org to replace M$ Office. http://zh.openoffice.org Pls. use 7-zip to replace WinRAR/WinZip. http://7-zip.org/zh-cn/ You can get the truely Freedom 4 software. '''
2007年06月01日 星期五 11:07
On 6/1/07, Zoom. Quiet <zoom.quiet在gmail.com> wrote: > > On 6/1/07, Ben Luo <benluo在gmail.com> wrote: > > 用Firefox的Download All > > > 插件从新浪读书频道下载了文章。想在Treo650里看纯文本。然后就用了以下两个小程序。算是Unix思想的一种体现吧。小工具只做一件事情。大家把自己电脑里的小宝贝拿出来分享啊。或者在网站上多一个Wiki页面? > > > 在啄木鸟维基一直有个"微项目" 的页面收集大家平日随手解决的小需求的开心代码; > 你的收集到快乐的六.一节这天的页面了! > http://wiki.woodpecker.org.cn/moin/MicroProj/2007-06-01 > > 感谢分享! > > 提示,进行注释,以便帮助新人快速理解你的技巧,思路呢 ;) html2txt.py 也是从网上下载的。一个中国朋友写的。我只是用,也不是很理解。 下一个文件我在Wiki中以后注释。 -------------- 下一部分 -------------- 一个HTML附件被移除... URL: http://python.cn/pipermail/python-chinese/attachments/20070601/f22cb5d4/attachment.htm
2007年06月01日 星期五 11:19
On 6/1/07, Ben Luo <benluo在gmail.com> wrote: > > > On 6/1/07, Zoom. Quiet <zoom.quiet在gmail.com> wrote: > > On 6/1/07, Ben Luo <benluo在gmail.com> wrote: > > > 用Firefox的Download All > > > > 插件从新浪读书频道下载了文章。想在Treo650里看纯文本。然后就用了以下两个小程序。算是Unix思想的一种体现吧。小工具只做一件事情。大家把自己电脑里的小宝贝拿出来分享啊。或者在网站上多一个Wiki页面? > > > > > 在啄木鸟维基一直有个"微项目" 的页面收集大家平日随手解决的小需求的开心代码; > > 你的收集到快乐的六.一节这天的页面了! > > http://wiki.woodpecker.org.cn/moin/MicroProj/2007-06-01 > > > > 感谢分享! > > > > 提示,进行注释,以便帮助新人快速理解你的技巧,思路呢 ;) > > html2txt.py 也是从网上下载的。一个中国朋友写的。我只是用,也不是很理解。 咔咔咔!!在使用中谅解,在沟通中深入,技艺就是这么获得的 ;) > 下一个文件我在Wiki中以后注释。 > > > > _______________________________________________ > python-chinese > Post: send python-chinese在lists.python.cn > Subscribe: send subscribe to > python-chinese-request在lists.python.cn > Unsubscribe: send unsubscribe to > python-chinese-request在lists.python.cn > Detail Info: > http://python.cn/mailman/listinfo/python-chinese > -- '''Time is unimportant, only life important! http://zoomquiet.org blog在http://blog.zoomquiet.org/pyblosxom/ wiki在http://wiki.woodpecker.org.cn/moin/ZoomQuiet scrap在http://floss.zoomquiet.org douban在http://www.douban.com/people/zoomq/ ____________________________________ Pls. use OpenOffice.org to replace M$ Office. http://zh.openoffice.org Pls. use 7-zip to replace WinRAR/WinZip. http://7-zip.org/zh-cn/ You can get the truely Freedom 4 software. '''
Zeuux © 2025
京ICP备05028076号