2007年07月03日 星期二 12:54
有人在python中使用过libxml2吗，我发现每次解析一个xml文件，都会增加4k左右的虚拟内存，长期使用，内存不堪重负啊。请问有什么好的解决方法没？ 注：是在windows下使用 下面是测试代码： #!/usr/bin/python -u import libxml2 #------------------------------------------------------------------------------ # Memory debug specific #------------------------------------------------------------------------------ def _escape(data): """Escape data for XML""" data=data.replace("&","&") data=data.replace("<","<") data=data.replace(">",">") data=data.replace("'","'") data=data.replace('"',""") return data class callback: def __init__(self): self._head = "" self._tail = "" self._current = "" self._level = 0 self._doc = None self._root = None def startDocument(self): return print "." def endElement(self, tag): return self._current+="" % (tag,) self._level -= 1 if self._level > 1: return if self._level==1: xml=self._head+self._current+self._tail doc=libxml2.parseDoc(xml) try: node = doc.getRootElement().children try: node1 = node.docCopyNode(self._doc, 1) try: pass #self._root.addChild(node1) #self._handler.stanza(self._doc, node1) except: node1.unlinkNode() node1.freeNode() del node1 pass finally: node1.unlinkNode() node1.freeNode() del node1 #del node finally: doc.freeDoc() else: print 'level:%d'%self._level xml=self._head+self._tail doc=libxml2.parseDoc(xml) try: #self._handler.stream_end(self._doc) self._doc.freeDoc() self._doc = None self._root = None finally: doc.freeDoc() def startElement(self, tag, attrs): return #print 'startElement_____________' s = "<"+tag if attrs: for a,v in attrs.items(): s+=" %s='%s'" % (a,_escape(v)) s += ">" if self._level == 0: self._head = s self._tail = "" % (tag,) xml=self._head+self._tail ## if self._doc: ## self._doc.freeDoc() ## self._doc=None self._doc = libxml2.parseDoc(xml) #self._handler.stream_start(self._doc) #self._root = self._doc.getRootElement() elif self._level == 1: self._current = s else: self._current += s self._level += 1 #print self._level def 
characters(self, data): pass def warning(self, msg): pass def error(self, msg): pass def fatalError(self, msg): pass #------------------------------------------------------------------------------ #------------------------------------------------------------------------------ import os import sys programName = os.path.basename(sys.argv[0]) if len(sys.argv) != 2: print "Use: %s" % programName sys.exit(1) inputPath = sys.argv[1] if not os.path.exists (inputPath): print "Error: directory does not exist" sys.exit(1) libxml2.debugMemory(1) inputFileNames = [] dirContent = os.listdir(inputPath) for fichero in dirContent: extension1=fichero.rfind(".htm") extension2=fichero.rfind(".html") dot = fichero.rfind(".") extension = max(extension1,extension2) if extension != -1 and extension == dot: inputFileNames.append (fichero) if len(inputFileNames) == 0: print "Error: no input files" sys.exit(1) handler = callback() NUM_ITERS = 20 isrun=True while isrun: for i in range(NUM_ITERS): for inputFileName in inputFileNames: ctxt = libxml2.createPushParser(handler, "", 0, inputFileName) #libxml2.initParser() #print inputFileName inputFilePath = inputPath + inputFileName f = open(inputFilePath) data = f.read() #print data f.close() ctxt.parseChunk(data, len(data), 1) #libxml2.pythonCleanupParser() #libxml2.cleanupParser() ctxt.clearParserCtxt() #ctxt = None print libxml2.memoryUsed() del ctxt ctxt = None s=raw_input('Quit?') isrun=s!='q' s=raw_input('Press any key...') # Memory debug specific libxml2.cleanupParser() if libxml2.debugMemory(1) == 0: print "OK" else: print "Memory leak %d bytes" % (libxml2.debugMemory(1)) libxml2.dumpMemory() -- python c# and opensource blog:http://www.chyni.cn -------------- 下一部分 -------------- 一个HTML附件被移除... URL: http://python.cn/pipermail/python-chinese/attachments/20070703/2d91522d/attachment.html
2007年07月03日 星期二 14:00
libxml2(http://xmlsoft.org/python.html) C语言版Libxml2(业界标准了)的一个python封装，据说速度非常快. 功能非常强，支持几乎所有的XML处理要求。 包括对Relax NG等的支持。 但是接口不够pythonic，需要考虑内存处理， 在Windows上会出现无故挂掉的情况，不稳定。 ------------------------------ from 潘俊勇的Blog <http://blog.czug.org/panjy> On 7/3/07, 追风逐月 <chinesexu在gmail.com> wrote: > > > 有人在python中使用过libxml2吗，我发现每次解析一个xml文件，都会增加4k左右的虚拟内存，长期使用，内存不堪重负啊。请问有什么好的解决方法没？ > 注：是在windows下使用 > > > 下面是测试代码： > #!/usr/bin/python -u > import libxml2 > > #------------------------------------------------------------------------------ > > > > # Memory debug specific > > > > #------------------------------------------------------------------------------ > > def _escape(data): > """Escape data for XML""" > data=data.replace("&","&") > data=data.replace("<","<") > data=data.replace(">",">") > data=data.replace ("'","'") > data=data.replace('"',""") > return data > class callback: > def __init__(self): > self._head = "" > self._tail = "" > self._current = "" > self._level = 0 > self._doc = None > self._root = None > > def startDocument(self): > return > print "." 
> > def endElement(self, tag): > return > self._current+="" % (tag,) > self._level -= 1 > if self._level > 1: > return > if self._level==1: > xml=self._head+self._current+self._tail > doc=libxml2.parseDoc(xml) > try: > node = doc.getRootElement ().children > try: > node1 = node.docCopyNode(self._doc, 1) > try: > pass > #self._root.addChild(node1) > #self._handler.stanza(self._doc, node1) > except: > node1.unlinkNode() > node1.freeNode() > del node1 > pass > finally: > node1.unlinkNode() > node1.freeNode() > del node1 > #del node > finally: > doc.freeDoc() > else: > print 'level:%d'%self._level > xml=self._head+self._tail > doc=libxml2.parseDoc (xml) > try: > #self._handler.stream_end(self._doc) > self._doc.freeDoc() > self._doc = None > self._root = None > finally: > doc.freeDoc() > > def startElement(self, tag, attrs): > return > #print 'startElement_____________' > s = "<"+tag > if attrs: > for a,v in attrs.items(): > s+=" %s='%s'" % (a,_escape(v)) > s += ">" > if self._level == 0: > self._head = s > self._tail = "" % (tag,) > xml=self._head+self._tail > ## if self._doc: > ## self._doc.freeDoc() > ## self._doc=None > > self._doc = libxml2.parseDoc(xml) > #self._handler.stream_start(self._doc) > #self._root = self._doc.getRootElement() > elif self._level == 1: > self._current = s > else: > self._current += s > self._level += 1 > #print self._level > > > > > def characters(self, data): > pass > > def warning(self, msg): > pass > > def error(self, msg): > pass > > def fatalError(self, msg): > pass > > > #------------------------------------------------------------------------------ > #------------------------------------------------------------------------------ > > > import os > import sys > > programName = os.path.basename(sys.argv[0]) > > if len(sys.argv) != 2: > print "Use: %s" % programName > sys.exit(1) > > inputPath = sys.argv [1] > > if not os.path.exists (inputPath): > print "Error: directory does not exist" > sys.exit(1) > > libxml2.debugMemory(1) > > 
inputFileNames = [] > dirContent = os.listdir(inputPath) > for fichero in dirContent: > extension1=fichero.rfind(".htm") > extension2=fichero.rfind(".html") > dot = fichero.rfind(".") > extension = max(extension1,extension2) > if extension != -1 and extension == dot: > inputFileNames.append (fichero) > > if len(inputFileNames) == 0: > print "Error: no input files" > sys.exit(1) > > > handler = callback() > NUM_ITERS = 20 > isrun=True > while isrun: > for i in range(NUM_ITERS): > for inputFileName in inputFileNames: > ctxt = libxml2.createPushParser(handler, "", 0, inputFileName) > #libxml2.initParser() > #print inputFileName > inputFilePath = inputPath + inputFileName > f = open(inputFilePath) > data = f.read() > #print data > f.close() > > > ctxt.parseChunk(data, len(data), 1) > #libxml2.pythonCleanupParser() > #libxml2.cleanupParser() > ctxt.clearParserCtxt() > #ctxt = None > print libxml2.memoryUsed() > del ctxt > ctxt = None > > s=raw_input('Quit?') > isrun=s!='q' > s=raw_input('Press any key...') > > > # Memory debug specific > libxml2.cleanupParser() > if libxml2.debugMemory(1) == 0: > print "OK" > else: > print "Memory leak %d bytes" % ( libxml2.debugMemory(1)) > libxml2.dumpMemory() > > -- > python c# and opensource > blog:http://www.chyni.cn > _______________________________________________ > python-chinese > Post: send python-chinese在lists.python.cn > Subscribe: send subscribe to python-chinese-request在lists.python.cn > Unsubscribe: send unsubscribe to python-chinese-request在lists.python.cn > Detail Info: http://python.cn/mailman/listinfo/python-chinese > -- 我走到一个陌生的地方, 告诉别人 我要去流浪 哦，我要去疗伤…… Gtalk: iexper(at)gmail.com 域名过期了 -------------- 下一部分 -------------- 一个HTML附件被移除... URL: http://python.cn/pipermail/python-chinese/attachments/20070703/e5755163/attachment-0001.html
Zeuux © 2025
京ICP备05028076号