王锋 · Thursday, December 3, 2009, 22:24
The following is a scraper for http://www.bidders.co.jp. I'm posting the source code here in the hope that people can offer some pointers. The script reads one configuration file that is not listed here, and what it generates is also a configuration file.
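For reference, the unlisted input file (the ctet that sort() reads through get_link) just needs one record per line in the form url|id. A made-up sketch, assuming two category start pages — these URLs and ids are placeholders, not real entries:

    http://www.bidders.co.jp/categ/100|4
    http://www.bidders.co.jp/categ/200|7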
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import urllib2
def get_links(url, id):
    # fetch a page and collect the links in its main listing table
    print url
    html = urllib2.urlopen(url.strip())
    soup = BeautifulSoup(html)
    http = "http://www.bidders.co.jp"
    soup_div = soup.find('div', {"class": "mainlist_box_cup"})
    soup_td = soup_div.findAll('td', colspan="2")
    link = []
    for td in soup_td:
        # hrefs are site-relative, so prepend the host
        link.append(http + td.a['href'].strip())
    print link
    return (link, id)
def judge_link(link, id):
    # breadth-first walk: "list1" pages are leaves that get written out,
    # "categ" pages are expanded into another round of link fetching
    while link != []:
        print "judge_link"
        linkv = []
        for slink in link:
            print "slink=", slink, "id=", id
            link_list = slink.split('/')
            print link_list
            if link_list[-2] == "list1":
                print "text2"
                write("text2", slink, id)
            elif link_list[-2] == "categ":
                linkv.append(slink)
        print linkv, "loop 1"
        tlink = []
        for linkc in linkv:
            (clink, id) = get_links(linkc, id)
            tlink.extend(clink)
        link = tlink
        print link, "loop"
def write(text, http, id):
    # append one "url|id" record to the given file
    content = "%s|%s\n" % (http, id)
    f = open(text, "a")
    f.write(content)
    f.close()
def get_link(ctet):
    # read the "url|id" records back out of a file
    f = open(ctet, 'r')
    lines = f.readlines()
    f.close()
    url = []
    id = []
    for line in lines:
        temlist = line.split("|")
        url.append(temlist[0].strip())
        id.append(temlist[1].strip())
    http = [url, id]
    print http
    return http
def create_file():
    # truncate (or create) the two intermediate output files
    f1 = open("text1", "w")
    f2 = open("text2", "w")
    f1.close()
    f2.close()
    print "files have been created"
def sort():
    create_file()
    http = get_link('ctet')
    for i in range(len(http[0])):
        url = http[0][i]
        id = http[1][i]
        m = 0
        while m <= 5:
            # retry up to six times if the fetch raises an exception
            try:
                print "first"
                (links, id) = get_links(url, id)
                judge_link(links, id)
                break
            except:
                m = m + 1
                continue
    return "done"
def write_begin():
    # truncate (or create) the final XML file
    f = open("bidder_jp.xml", "w")
    f.close()
def build_xml():
    # build the config XML: a <parameters> block with fixed values,
    # then one <category>/<cat> pair per record collected in text2
    from xml.dom import minidom
    doc = minidom.Document()
    config = doc.createElement('config')
    doc.appendChild(config)
    parameters = doc.createElement("parameters")
    config.appendChild(parameters)
    categories = doc.createElement("categories")
    config.appendChild(categories)
    pa0 = doc.createElement("parameter")
    pa1 = doc.createElement("parameter")
    pa2 = doc.createElement("parameter")
    parameters.appendChild(pa0)
    parameters.appendChild(pa1)
    parameters.appendChild(pa2)
    pa0.setAttribute("name", "name")
    pa0.setAttribute("value", "url")
    pa1.setAttribute("name", "sourceid")
    pa1.setAttribute("value", "4")
    pa2.setAttribute("name", "table")
    pa2.setAttribute("value", "Tables/ArticleAdd.xml")
    s = sort()
    if s == "done":
        f = open("text2", "r")
        for eachline in f.readlines():
            print eachline
            url = eachline.split("|")
            http = url[0].strip()
            id = url[1].strip()
            category = doc.createElement("category")
            cat = doc.createElement("cat")
            category.setAttribute("starturl", http)
            cat.setAttribute("id", id)
            categories.appendChild(category)
            category.appendChild(cat)
        print doc.toprettyxml(indent=" ")
        f.close()
        f1 = open("bidder_jp.xml", "w")
        f1.write(doc.toprettyxml())
        f1.close()
        # alternative: write through an explicit utf-8 stream writer instead
        # import codecs
        # writer = codecs.lookup("utf-8")[3](open("bidder_jp.xml", "w"))
        # doc.writexml(writer, "\t", "\t", "\n", encoding="utf-8")
        # writer.close()
def final_result():
    # re-crawl every record in text1 (currently unused by the main flow)
    print "final"
    http = get_link("text1")
    for i in range(len(http[0])):
        url = http[0][i]
        id = http[1][i]
        (links, id) = get_links(url, id)
        judge_link(links, id)
if __name__ == "__main__":
    write_begin()
    build_xml()
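When the crawl finishes, bidder_jp.xml should come out roughly like the sketch below. The <parameters> block is fixed in the code, while the starturl and id values depend on what the crawl writes into text2, so the ones here are made up:

    <?xml version="1.0" ?>
    <config>
        <parameters>
            <parameter name="name" value="url"/>
            <parameter name="sourceid" value="4"/>
            <parameter name="table" value="Tables/ArticleAdd.xml"/>
        </parameters>
        <categories>
            <category starturl="http://www.bidders.co.jp/list1/100">
                <cat id="4"/>
            </category>
        </categories>
    </config>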
peter · Saturday, December 5, 2009, 01:43
Thanks again.

王锋 · Saturday, December 5, 2009, 09:56