Differences between revisions 10 and 16 (spanning 6 versions)
Revision 10 as of 2008-01-30 06:26:53
Size: 7393
Editor: zuroc
Comment:
Revision 16 as of 2009-04-11 16:28:34
Size: 16686
Editor: zuroc
Comment:
Deletions are marked like this. Additions are marked like this.
Line 3: Line 3:
'''
含有章节索引的 *PUG 文章通用模板
'''
::-- ["zuroc"] [[[DateTime(2008-01-23T08:30:45Z)]]]
[[TableOfContents]]
## 默许导航,请保留
[[Include(CPUGnav)]]


= ZSPY =
''用python抓取网络''

代码见
http://zspy.googlecode.com

写作中....
代码见 http://zspy.googlecode.com
Line 24: Line 9:
== 第一天PycURL ==

Pycurl  http://pycurl.sourceforge.net/


外部libcurl的接口,C写的,比urllib快,功能强.支持循环rewrite陷井的安全深度.
用于做网络爬虫,抓网页.


http://pycurl.sourceforge.net/download/
下载
pycurl-ssl-7.16.4.win32-py2.5.exe
安装.
== PycURL ==
Pycurl http://pycurl.sourceforge.net/

外部libcurl的接口,C写的,比urllib快,功能强.支持循环rewrite陷井的安全深度. 用于做网络爬虫,抓网页.

 http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.
Line 70: Line 48:
Line 74: Line 51:
Line 86: Line 64:
    
Line 91: Line 69:
        
Line 97: Line 75:
        
Line 107: Line 85:
    
Line 112: Line 90:
             
Line 147: Line 125:
        
Line 154: Line 132:
Line 156: Line 133:
Line 157: Line 135:
#!/usr/bin/env python
zuroc@aragorn ~/wgetbot/zget $ cat test.py
Line 162: Line 141:
Line 172: Line 150:
    
    def __init__(self,):
        super(UrlOpen,self).__init__()

    def __init__(self, ):
        super(UrlOpen, self).__init__()
Line 176: Line 154:
        self.handle_list=[]
        self.waiting=[]

    def add(self,url,recall,catch=None,writer=StringIO()):
        self.handle_list = []
        self.waiting = []

    def add(self, url, recall, catch=None, writer=StringIO):
Line 184: Line 162:
            def catch(curl,error_no,desp):
                #print "Error:%s - %s"%(error_no,desp)
                pass
            def catch(curl, error_no, desp):
                print "Url:%s\nError:%s - %s"%(curl.url, error_no, desp)
Line 191: Line 168:
        c.url=url
        c.content = writer
        c.url = url
        c.content = writer()
Line 194: Line 171:
        c.catch=catch         c.catch = catch
Line 198: Line 175:
        c.setopt(c.WRITEFUNCTION,c.content.write)
        
        c.setopt(c.WRITEFUNCTION, c.content.write)
        c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.MAXREDIRS, 3)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.FOLLOWLOCATION, 1)
Line 201: Line 182:
    
Line 203: Line 184:
        waiting=self.waiting[:]
        self.waiting=[]
        waiting = self.waiting[:]
        self.waiting = []
Line 208: Line 189:
        
    def _remove(self,c):

    def _remove(self, c):
Line 213: Line 194:
        
    
        del c
Line 218: Line 199:
        num_handle=len(self.handle_list)
       count=1
        num_handle = 0
Line 221: Line 201:
            #print 1
Line 222: Line 203:
                #print "select start"
Line 223: Line 205:
                #print "select end"
Line 225: Line 208:
                        num_handle_pre=num_handle
                        ret, num_handle =self.opener.perform()
                        #print "perform start"
num_handle_pre = num_handle
                        ret, num_handle = self.opener.perform()
                        #print "preform end"
Line 228: Line 213:
                        if num_handle!=num_handle_pre:
                            result=self.opener.info_read()
                        if num_handle != num_handle_pre:
                            result = self.opener.info_read()
Line 233: Line 218:
                                i.recall(i)
Line 234: Line 220:
                                i.recall(i)
Line 241: Line 226:
                            #print "break"
Line 243: Line 229:
                #print "sleep"
Line 244: Line 231:
                #print "sleep end"
Line 246: Line 235:
_opener=None
def urlopen(*arg,**key):
_opener = None
def urlopen(*arg, **key):
Line 250: Line 239:
        _opener=UrlOpen()         _opener = UrlOpen()
Line 252: Line 241:
    _opener.add(*arg,**key)

if __name__=="__main__":
    def show(x):
        print x.content.getvalue()
        print '--'*11
    urlopen("http://www.baidu.com/",show)
    urlopen("http://www.google.com/",show)
    urlopen("http://www.sougou.com/",show)
    urlopen("http://www.yodao.com/",show)
    urlopen("http://www.yahoo.com/",show)
    urlopen("http://www.msn.com/",show)
    _opener.add(*arg, **key)
import time
if __name__ == "__main__":
    link = ['http://www.baidu.com/', 'http://www.sina.com.cn', 'http://www.qq.com', 'http://www.sohu.com', 'http://www.163.com/', 'http://www.ifeng.com/', 'http://www.cctv.com/default.shtml', 'http://www.xinhuanet.com/', 'http://www.people.com.cn/', 'http://cn.msn.com/', 'http://www.google.cn/', 'http://cn.yahoo.com/', 'http://www.amazon.cn/?source=2009hao123famousdaohang', 'http://www.chinamobile.com/', 'http://www.pconline.com.cn/', 'http://www.chinahr.com/', 'http://www.gov.cn/', 'http://www.zhcw.com/', 'http://www.autohome.com.cn/', 'http://www.zhaopin.com/Market/hao123.jsp', 'http://fund.eastmoney.com/', 'http://www.eastmoney.com/', 'http://www.xiaonei.com/', 'http://www.soufun.com/', 'http://www.51.com/', 'http://www.rayli.com.cn/', 'http://youa.baidu.com/', 'http://www.360.cn/', 'http://www.ctrip.com/', 'http://www.xcar.com.cn/', 'http://www.qq163.com', 'http://www.samsung.com/', 'http://www.zol.com.cn/', 'http://www.taobao.com/', 'http://www.icbc.com.cn/', 'http://www.sto.cn', 'http://www.dianping.com', 'http://www.gougou.com', 'http://www.ct10000.com', 'http://www.anjuke.com/?&pi=H-1', 'http://www.360buy.com/union/union_default.asp?union_Id=75', 'http://tl.sohu.com/?rcc_id=061f93406c7a77d6a6e4c8647b09fb56', 'http://www.51job.com/default.php?code=gb2312', 'http://central.dangdang.com/league/leagueref.asp?from=P-227107a&backurl=http://a.oadz.com/link/C/51/52648/ZzZIg.TXwwIV69FJbh3yJe4H7WI_/a/898?home.dangdang.com', 'http://www.jiayuan.com/st/?id=3237&url=http://www.jiayuan.com']
    link +=['http://www.qidian.com/', 'http://www.readnovel.com/', 'http://www.hongxiu.com/', 'http://www.bookge.com/', 'http://www.jjwxc.net/', 'http://hjsm.tom.com/', 'http://www.4yt.net/', 'http://www.cuiweiju.com/', 'http://book.sina.com.cn/', 'http://www.xxsy.net/', 'http://www.wansong.net/', 'http://www.myfreshnet.com/', 'http://www.fmx.cn/', 'http://www.xs8.cn/', 'http://www.rongshuxia.com/', 'http://www.booksky.org/', 'http://www.zhulang.com/', 'http://www.3320.net/', 'http://www.17k.com/', 'http://www.xhsd.net/', 'http://www.qukanshu.com/', 'http://www.fbook.net/', 'http://www.duyidu.com/', 'http://www.soso999.com/', 'http://www.junzitang.com/', 'http://msn.hongxiu.com/', 'http://www.yuanwen.com/', 'http://top.baidu.com/book.html', 'http://www.lcread.com/', 'http://www.sodu.com.cn/', 'http://www.cc222.com/', 'http://www.feiku.com/', 'http://book.hqdoor.com/', 'http://book.sooyuu.com/', 'http://www.52eshu.com/', 'http://bbs.91txt.com/', 'http://book.qq.com/', 'http://book.sohu.com/', 'http://www.baidu.com/search/guoxue/dir/fenlei.html', 'http://wind.yinsha.com/', 'http://www.duzhe.com/', 'http://www.storychina.cn/', 'http://www.shigeku.org/', 'http://www.goodmood.cn/', 'http://www.nlc.gov.cn/', 'http://www.qnwz.cn/', 'http://wenxue.xilu.com/']
    link +=['http://www.ganji.com/', 'http://www.58.com/', 'http://www.baixing.com/', 'http://www.263.com/', 'http://www.kuxun.cn/', 'http://www.mangocity.com/', 'http://www.qunar.com/', 'http://www.dianping.com/', 'http://www.fantong.com/', 'http://www.55bbs.com/', 'http://www.19lou.com/', 'http://www.koubei.com/', 'http://www.nike.com.cn/', 'http://www.li-ning.com.cn/', 'http://www.bosideng.com/', 'http://www.pirateship.com.cn/', 'http://www.goelia.com.cn/', 'http://www.adidas.com/', 'http://www.converse.com.cn/', 'http://www.romon.com/index.php', 'http://www.youngor.com/', 'http://www.etam.com.cn', 'http://www.heilanhome.com/', 'http://www.mizuno.com.cn/', 'http://www.goldlion-china.com/', 'http://www.phland.com.cn/', 'http://www.betu.com.hk/', 'http://www.puma.com.cn/', 'http://www.anta.com/', 'http://www.pierrecardin.com.cn/', 'http://www.bobdog.com.cn/', 'http://www.idaphne.com/', 'http://www.e-giordano.com/', 'http://www.361sport.com/', 'http://www.levi.com.cn/', 'http://www.lee.com.cn/', 'http://www.shanshan.com/', 'http://www.semir.com', 'http://www.versace.com/flash.html', 'http://www.k-boxing.com/', 'http://only.nzn.cn/', 'http://www.pb89.com/%20', 'http://www.aimer.com.cn/', 'http://www.balenciaga.com', 'http://www.ordifen.com.cn/', 'http://www.ochirly.com/', 'http://www.uggaustralia.com/', 'http://www.jshyx.com/', 'http://www.givenchy.com/default.php', 'http://www.thenorthface.com.cn/', 'http://www.tissot.com.hk/', 'http://www.azona.com.hk/', 'http://www.3suisses.com.cn/', 'http://www.valentino.it/', 'http://www.yishion.com.cn/', 'http://www.chowtaiseng.com/', 'http://www.tsljewellery.com/', 'http://www.jeanswest.com/', 'http://www.baoxiniao.com.cn/', 'http://www.qsyr.com/%20', 'http://www.septwolves.com/', 'http://www.baleno.com.hk/', 'http://www.belle.com.cn/', 'http://www.teenmix.com.cn/', 'http://www.fairwhale.com.cn/', 'http://www.swatch.com.cn/', 'http://www.staccato.com/', 'http://www.daphne.com.cn/', 'http://www.c-banner.com/', 
'http://www.xtep.com.cn/', 'http://www1.jeanswest.com.cn/', 'http://www.kappa.com.cn/', 'http://www.laofengxiang.com/', 'http://www.cnhqt.com/', 'http://www.tatashoes.com.cn/', 'http://www.robinhood.com.cn/', 'http://www.doublestar.com.cn/', 'http://www.ozarkgear.com.cn/', 'http://www.aokang.com.cn/', 'http://www.ctf.com.cn/', 'http://www.crpttan.com/', 'http://www.calvinklein.com/', 'http://www.citizen.com.cn/', 'http://www.longines.com/', 'http://www.jackjonescn.net/', 'http://www.famoustone.com/', 'http://www.kfc.com.cn/', 'http://www.bjyoshinoya.com.cn/', 'http://www.starbucks.cn/', 'http://www.icoke.cn/', 'http://www.mengniu.com.cn/', 'http://www.mcdonalds.com.cn/', 'http://www.yonghe.com.cn/', 'http://www.ubccn.com/', 'http://www.dicos.com.cn/', 'http://www.yili.com/', 'http://www.pizzahut.com.cn/', 'http://www.quanjude.com.cn/direct.php', 'http://www.nescafe.com.cn/', 'http://www.masterkong.com.cn/', 'http://www.heinz.com.cn/', 'http://www.origus.com/', 'http://www.xfy.com.cn/', 'http://www.haagendazs.com.cn/', 'http://www.wyeth.com.cn/', 'http://www.moutaichina.com/index.asp', 'http://www.tsingtao.com.cn/', 'http://www.meadjohnson.com.cn/', 'http://www.dumex.com.cn/', 'http://www.wuliangye.com.cn/', 'http://www.zkungfu.com/', 'http://www.dovechocolate.com.cn/', 'http://www.ganso.com.cn/%20%20%20', 'http://www.beingmate.com/', 'http://www.waffleboy.com.cn/', 'http://www.holiland.com.cn/', 'http://www.goldenjaguar.com/', 'http://www.huiyuan.com.cn/%20%20%20', 'http://www.hsufuchifoods.com/%20%20%20%20', 'http://www.maybellinechina.com/', 'http://www.dabao.com/', 'http://www.lorealchina.com/', 'http://www.shiseidochina.com/', 'http://www.esteelauder.com.cn/', 'http://www.avon.com.cn/PRSuite/home/home.jsp', 'http://www.tjoy.biz/', 'http://www.lancome.com.cn/_zh/_cn/index.aspx', 'http://www.kose.co.jp/', 'http://www.h2oplus.com.hk/', 'http://www.yuesai.com.cn/', 'http://www.nivea.com.cn/', 'http://www.chanel.com/', 
'http://www.clinique.com.cn/index.tmpl?ngextredir=1', 'http://www.ponds.com.cn/', 'http://www.vichy.com.cn/', 'http://www.efu.org.cn/', 'http://www.laneigechina.com/Front-Page/index2.jsp', 'http://www.olay.com.cn/', 'http://www.guerlain.com.cn/', 'http://www.aupres-shiseido.com.cn/', 'http://www.dior.com/pcd/International/JSP/Home/prehomeFlash.jsp', 'http://www.herborist.com.cn/', 'http://www.dhc.net.cn/', 'http://www.ysl.com/', 'http://www.kose.com.cn/', 'http://www.liangfei.com/', 'http://www.tayoi.com/', 'http://www.chcedo.com/', 'http://www.head-shoulders.com.cn/', 'http://www.slek.com.cn/', 'http://www.mentholatum.com.cn/', 'http://www.pg.com.cn/', 'http://www.china-ajjj.com/', 'http://www.rejoice.com.cn/', 'http://www.cnnice.com/', 'http://www.watsons.com.cn/', 'http://www.unilever.com.cn/', 'http://www.ikea.com/cn/zh/', 'http://www.pantene.com.cn/', 'http://www.colgate.com.cn/app/Colgate/CN/HomePage.cvsp', 'http://www.auchan.com.cn/', 'http://www.c-bons.com.cn/', 'http://www.carrefour.com.cn/', 'http://www.vs.com.cn/', 'http://www.crest.com.cn/', 'http://www.tongrentang.com/', 'http://www.amway.com.cn/index.aspx', 'http://www.wal-martchina.com/', 'http://www.tupperware.com.cn/', 'http://www.ourlotus.com/', 'http://www.skyworth.com/cn/', 'http://www.sony.com.cn/', 'http://www.siemens.com.cn/', 'http://www.gree.com.cn/', 'http://www.shinco.com/', 'http://www.midea.com.cn/', 'http://www.samsung.com.cn/', 'http://www.hitachi-shha.com.cn/', 'http://www.electrolux.com.cn/', 'http://www.toshiba.com.cn/', 'http://www.panasonic.com.cn/', 'http://www.canon.com.cn/', 'http://www.tcl.com/', 'http://www.lg.com.cn/', 'http://cn.changhong.com/', 'http://www.haier.com/', 'http://www.philips.com.cn/', 'http://www.konka.com/', 'http://www.rsd.com.cn/', 'http://www.supor.com.cn/', 'http://www.fotile.com/', 'http://www.cnsuning.com/', 'http://www.sharp.cn/', 'http://www.galanz.com.cn/', 'http://www.chinamacro.cn/', 'http://www.robam.com/', 'http://www.gome.com.cn/', 
'http://www.joyoung.com.cn/', 'http://www.staccato.com/', 'http://www.meiling.com/', 'http://www.fushibao.com/', 'http://www.sacon.cn/', 'http://www.yongle.com.cn/', 'http://www.xinfei.com/']

    begin = time.time()
    number = 0
    def show(c):
        global number
        number +=1
        print number, "cost time", time.time() - begin
        print c.url
        content = c.content.getvalue()
        pos = content.find('href="http://')
        print content[pos:pos+200]
        if pos!=-1:
            print "find url",content[pos:content.find('"',pos+13)]
    link = set(link)
    print "total link", len(link)
    for i in link:
        urlopen(i, show)
Line 265: Line 265:
    print "cost time", time.time() - begin

Line 270: Line 273:

代码见 http://zspy.googlecode.com

张沈鹏 [email protected] http://zsp.javaeye.com/

2008-1-23 16:42

1. PycURL

Pycurl http://pycurl.sourceforge.net/

外部libcurl的接口,C写的,比urllib快,功能强.支持设置最大重定向次数,可以预防循环重定向陷阱. 用于做网络爬虫,抓网页.

http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.

参考文献1,测试代码

   1 #像操作文件一样操作字符串,也可以from cStringIO import StringIO,性能应该会好一些
   2 import StringIO
   3 
   4 html = StringIO.StringIO()
   5 
   6 import pycurl
   7 c = pycurl.Curl()
   8 
   9 c.setopt(pycurl.URL, 'http://www.baidu.com')
  10 
  11 #写的回调
  12 c.setopt(pycurl.WRITEFUNCTION, html.write)
  13 
  14 c.setopt(pycurl.FOLLOWLOCATION, 1)
  15 
  16 #最大重定向次数,可以预防重定向陷阱
  17 c.setopt(pycurl.MAXREDIRS, 5)
  18 
  19 #访问,阻塞到访问结束
  20 c.perform()
  21 
  22 #打印出 200(HTTP状态码) http://www.baidu.com(生效的url)
  23 print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
  24 
  25 #输出百度首页的html
  26 #print html.getvalue()

然后看看多线程, http://pycurl.cvs.sourceforge.net/pycurl/pycurl/tests/ 有很多例子, 还可以参考 http://pycurl.sourceforge.net/doc/curlmultiobject.html

我自己改写了一个:)

   1 #!/usr/bin/env python
   2 #coding=utf-8
   3 
   4 import threading
   5 import pycurl
   6 from cStringIO import StringIO
   7 
   8 class UrlOpen(threading.Thread):
   9     """异步下载网页"""
  10 
  11     def __init__(self):
  12         super(UrlOpen,self).__init__()
  13         self.opener = pycurl.CurlMulti()
  14         self.handle_list=[]
  15 
  16     def add(self,url,recall,writer=StringIO()):
  17         """
  18         参数:网址,回调函数,存放临时数据的对象
  19         """
  20         c = pycurl.Curl()
  21 
  22         #可以传给回调函数
  23         c.url=url
  24         c.content = writer
  25         c.recall = recall
  26         c.setopt(c.URL,url)
  27         c.setopt(c.WRITEFUNCTION,c.content.write)
  28 
  29         self.handle_list.append(c)
  30         self.opener.add_handle(c)
  31 
  32     def _remove(self,c):
  33         c.close()
  34         self.opener.remove_handle(c)
  35         self.handle_list.remove(c)
  36 
  37 
  38     def run(self):
  39         num_handle=len(self.handle_list)
  40         while 1:
  41             ret = self.opener.select(10.0)
  42             if ret == -1:  continue
  43             while 1:
  44                 num_handle_pre=num_handle
  45                 ret, num_handle =self.opener.perform()
  46                 #活动的连接数改变时
  47                 if num_handle!=num_handle_pre:
  48                     result=self.opener.info_read()
  49                     print result
  50                     for i in result[1]:
  51                         #成功
  52                         i.http_code = i.getinfo(i.HTTP_CODE)
  53                         self._remove(i)
  54                         i.recall(i)
  55                     for i in result[2]:
  56                         #失败,应该记录一下
  57                         self._remove(i)
  58 
  59                 if ret != pycurl.E_CALL_MULTI_PERFORM:
  60                     break
  61 
  62 _opener=None
  63 def urlopen(*arg,**key):
  64     global _opener
  65     if _opener is None:
  66         _opener=UrlOpen()
  67         _opener.add(*arg,**key)
  68         _opener.start()
  69     else:
  70         _opener.add(*arg,**key)
  71 
  72 def show(x):
  73     print x.content.getvalue()
  74 if __name__=="__main__":
  75     urlopen("http://www.baidu.com/",show)
  76     _opener.join()

又封装了一个异步打开网页的类和函数

zuroc@aragorn ~/wgetbot/zget $ cat test.py 
#coding=utf-8

import threading
from cStringIO import StringIO
import pycurl
"""
Asyn open url
Author:[email protected]
2008-1-25 17:14
"""

class UrlOpen(threading.Thread):
    """Download web pages asynchronously on a background thread.

    One ``pycurl.CurlMulti`` object drives every transfer.  Callers queue
    URLs with :meth:`add` from any thread; the worker thread (:meth:`run`)
    attaches the queued handles to the multi object, pumps it, and invokes
    the per-URL callbacks from the worker thread.

    The worker keeps running until :meth:`stop` is called and all queued
    work has been finished.  If :meth:`stop` is never called the thread
    loops forever, so ``join()`` will not return.
    """

    def __init__(self, ):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        # Handles currently attached to ``self.opener``; touched only by the
        # worker thread.
        self.handle_list = []
        # Handles queued by add() and not yet attached to ``self.opener``.
        self.waiting = []
        # Protects ``self.waiting``: add() runs on caller threads while
        # _add() drains the list on the worker thread.
        self._waiting_lock = threading.Lock()
        # Set by stop(); a plain attribute assignment is enough here.
        self._stop_requested = False

    def add(self, url, recall, catch=None, writer=StringIO):
        """Queue ``url`` for download.

        Parameters:
            url:    address to fetch (``str`` or ``unicode``).
            recall: called as ``recall(curl)`` after a successful transfer;
                    ``curl.url``, ``curl.content`` and ``curl.http_code``
                    are available on the handle.
            catch:  called as ``catch(curl, error_no, desp)`` when the
                    transfer fails; defaults to printing the error.
            writer: zero-argument factory for the object that receives the
                    body through its ``write`` method; a fresh one is made
                    for every URL.
        """
        if catch is None:
            def catch(curl, error_no, desp):
                print("Url:%s\nError:%s - %s" % (curl.url, error_no, desp))

        c = pycurl.Curl()

        # Attributes stored on the handle so the callbacks can reach them.
        c.url = url
        c.content = writer()
        c.recall = recall
        c.catch = catch

        # libcurl wants a byte string; ``c.url`` keeps the caller's value.
        target = url
        if isinstance(target, unicode):
            target = target.encode('utf-8')
        c.setopt(c.URL, target)
        c.setopt(c.WRITEFUNCTION, c.content.write)
        c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.MAXREDIRS, 3)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.FOLLOWLOCATION, 1)

        self._waiting_lock.acquire()
        try:
            self.waiting.append(c)
        finally:
            self._waiting_lock.release()

    def stop(self):
        """Ask the worker to exit once all queued downloads have finished.

        Do not call :meth:`add` after this; a handle queued after the worker
        has exited is never processed.
        """
        self._stop_requested = True

    def _add(self):
        """Attach every queued handle to the multi object (worker thread)."""
        # Swap the list out under the lock so a concurrent add() can never
        # append to a list that is about to be discarded.
        self._waiting_lock.acquire()
        try:
            waiting = self.waiting
            self.waiting = []
        finally:
            self._waiting_lock.release()
        for c in waiting:
            self.handle_list.append(c)
            self.opener.add_handle(c)

    def _remove(self, c):
        """Detach a finished handle from the multi object and release it."""
        # Detach first, then close: libcurl documents removing an easy
        # handle from its multi handle before cleaning it up.
        self.opener.remove_handle(c)
        c.close()
        self.handle_list.remove(c)

    def _dispatch_finished(self):
        """Run the callbacks for every transfer libcurl reports as done."""
        while 1:
            queued, succeeded, failed = self.opener.info_read()
            for c in succeeded:
                c.http_code = c.getinfo(c.HTTP_CODE)
                c.recall(c)
                self._remove(c)
            for entry in failed:
                # entry is (curl, error_no, error_message), e.g.
                # (<pycurl.Curl object>, 6, 'Could not resolve host: ...')
                entry[0].catch(*entry)
                self._remove(entry[0])
            if not queued:
                break

    def run(self):
        """Worker loop: pump the multi object and pick up newly queued URLs."""
        import time
        while 1:
            if self.handle_list:
                ret = self.opener.select(1.0)
                if ret >= 0:
                    while 1:
                        ret, num_handle = self.opener.perform()
                        # Check after every perform(): relying on a change
                        # in the running count misses a transfer that ends
                        # in the same round as another one starts.
                        self._dispatch_finished()
                        if ret != pycurl.E_CALL_MULTI_PERFORM:
                            break
            else:
                # Nothing attached yet; avoid spinning while idle.
                time.sleep(1)

            self._add()
            if self._stop_requested and not self.handle_list:
                break

_opener = None
def urlopen(*arg, **key):
    """Queue a download on the shared background opener.

    The module-wide ``UrlOpen`` worker is created and started on first use;
    all arguments are passed straight through to its ``add`` method.
    """
    global _opener
    worker = _opener
    if worker is None:
        worker = UrlOpen()
        _opener = worker
        worker.start()
    worker.add(*arg, **key)
import time
if __name__ == "__main__":
    # Seed URLs for the timing run.  Query strings use a literal "&"; the
    # "&amp;" seen in the rendered page was an HTML-escaping artifact.
    link = [
        'http://www.baidu.com/',
        'http://www.sina.com.cn',
        'http://www.qq.com',
        'http://www.sohu.com',
        'http://www.163.com/',
        'http://www.ifeng.com/',
        'http://www.cctv.com/default.shtml',
        'http://www.xinhuanet.com/',
        'http://www.people.com.cn/',
        'http://cn.msn.com/',
        'http://www.google.cn/',
        'http://cn.yahoo.com/',
        'http://www.amazon.cn/?source=2009hao123famousdaohang',
        'http://www.chinamobile.com/',
        'http://www.pconline.com.cn/',
        'http://www.chinahr.com/',
        'http://www.gov.cn/',
        'http://www.zhcw.com/',
        'http://www.autohome.com.cn/',
        'http://www.zhaopin.com/Market/hao123.jsp',
        'http://fund.eastmoney.com/',
        'http://www.eastmoney.com/',
        'http://www.xiaonei.com/',
        'http://www.soufun.com/',
        'http://www.51.com/',
        'http://www.rayli.com.cn/',
        'http://youa.baidu.com/',
        'http://www.360.cn/',
        'http://www.ctrip.com/',
        'http://www.xcar.com.cn/',
        'http://www.qq163.com',
        'http://www.samsung.com/',
        'http://www.zol.com.cn/',
        'http://www.taobao.com/',
        'http://www.icbc.com.cn/',
        'http://www.sto.cn',
        'http://www.dianping.com',
        'http://www.gougou.com',
        'http://www.ct10000.com',
        'http://www.anjuke.com/?&pi=H-1',
        'http://www.360buy.com/union/union_default.asp?union_Id=75',
        'http://tl.sohu.com/?rcc_id=061f93406c7a77d6a6e4c8647b09fb56',
        'http://www.51job.com/default.php?code=gb2312',
        'http://central.dangdang.com/league/leagueref.asp?from=P-227107a&backurl=http://a.oadz.com/link/C/51/52648/ZzZIg.TXwwIV69FJbh3yJe4H7WI_/a/898?home.dangdang.com',
        'http://www.jiayuan.com/st/?id=3237&url=http://www.jiayuan.com',
    ]
    link +=['http://www.qidian.com/', 'http://www.readnovel.com/', 'http://www.hongxiu.com/', 'http://www.bookge.com/', 'http://www.jjwxc.net/', 'http://hjsm.tom.com/', 'http://www.4yt.net/', 'http://www.cuiweiju.com/', 'http://book.sina.com.cn/', 'http://www.xxsy.net/', 'http://www.wansong.net/', 'http://www.myfreshnet.com/', 'http://www.fmx.cn/', 'http://www.xs8.cn/', 'http://www.rongshuxia.com/', 'http://www.booksky.org/', 'http://www.zhulang.com/', 'http://www.3320.net/', 'http://www.17k.com/', 'http://www.xhsd.net/', 'http://www.qukanshu.com/', 'http://www.fbook.net/', 'http://www.duyidu.com/', 'http://www.soso999.com/', 'http://www.junzitang.com/', 'http://msn.hongxiu.com/', 'http://www.yuanwen.com/', 'http://top.baidu.com/book.html', 'http://www.lcread.com/', 'http://www.sodu.com.cn/', 'http://www.cc222.com/', 'http://www.feiku.com/', 'http://book.hqdoor.com/', 'http://book.sooyuu.com/', 'http://www.52eshu.com/', 'http://bbs.91txt.com/', 'http://book.qq.com/', 'http://book.sohu.com/', 'http://www.baidu.com/search/guoxue/dir/fenlei.html', 'http://wind.yinsha.com/', 'http://www.duzhe.com/', 'http://www.storychina.cn/', 'http://www.shigeku.org/', 'http://www.goodmood.cn/', 'http://www.nlc.gov.cn/', 'http://www.qnwz.cn/', 'http://wenxue.xilu.com/']
    link +=['http://www.ganji.com/', 'http://www.58.com/', 'http://www.baixing.com/', 'http://www.263.com/', 'http://www.kuxun.cn/', 'http://www.mangocity.com/', 'http://www.qunar.com/', 'http://www.dianping.com/', 'http://www.fantong.com/', 'http://www.55bbs.com/', 'http://www.19lou.com/', 'http://www.koubei.com/', 'http://www.nike.com.cn/', 'http://www.li-ning.com.cn/', 'http://www.bosideng.com/', 'http://www.pirateship.com.cn/', 'http://www.goelia.com.cn/', 'http://www.adidas.com/', 'http://www.converse.com.cn/', 'http://www.romon.com/index.php', 'http://www.youngor.com/', 'http://www.etam.com.cn', 'http://www.heilanhome.com/', 'http://www.mizuno.com.cn/', 'http://www.goldlion-china.com/', 'http://www.phland.com.cn/', 'http://www.betu.com.hk/', 'http://www.puma.com.cn/', 'http://www.anta.com/', 'http://www.pierrecardin.com.cn/', 'http://www.bobdog.com.cn/', 'http://www.idaphne.com/', 'http://www.e-giordano.com/', 'http://www.361sport.com/', 'http://www.levi.com.cn/', 'http://www.lee.com.cn/', 'http://www.shanshan.com/', 'http://www.semir.com', 'http://www.versace.com/flash.html', 'http://www.k-boxing.com/', 'http://only.nzn.cn/', 'http://www.pb89.com/%20', 'http://www.aimer.com.cn/', 'http://www.balenciaga.com', 'http://www.ordifen.com.cn/', 'http://www.ochirly.com/', 'http://www.uggaustralia.com/', 'http://www.jshyx.com/', 'http://www.givenchy.com/default.php', 'http://www.thenorthface.com.cn/', 'http://www.tissot.com.hk/', 'http://www.azona.com.hk/', 'http://www.3suisses.com.cn/', 'http://www.valentino.it/', 'http://www.yishion.com.cn/', 'http://www.chowtaiseng.com/', 'http://www.tsljewellery.com/', 'http://www.jeanswest.com/', 'http://www.baoxiniao.com.cn/', 'http://www.qsyr.com/%20', 'http://www.septwolves.com/', 'http://www.baleno.com.hk/', 'http://www.belle.com.cn/', 'http://www.teenmix.com.cn/', 'http://www.fairwhale.com.cn/', 'http://www.swatch.com.cn/', 'http://www.staccato.com/', 'http://www.daphne.com.cn/', 'http://www.c-banner.com/', 
'http://www.xtep.com.cn/', 'http://www1.jeanswest.com.cn/', 'http://www.kappa.com.cn/', 'http://www.laofengxiang.com/', 'http://www.cnhqt.com/', 'http://www.tatashoes.com.cn/', 'http://www.robinhood.com.cn/', 'http://www.doublestar.com.cn/', 'http://www.ozarkgear.com.cn/', 'http://www.aokang.com.cn/', 'http://www.ctf.com.cn/', 'http://www.crpttan.com/', 'http://www.calvinklein.com/', 'http://www.citizen.com.cn/', 'http://www.longines.com/', 'http://www.jackjonescn.net/', 'http://www.famoustone.com/', 'http://www.kfc.com.cn/', 'http://www.bjyoshinoya.com.cn/', 'http://www.starbucks.cn/', 'http://www.icoke.cn/', 'http://www.mengniu.com.cn/', 'http://www.mcdonalds.com.cn/', 'http://www.yonghe.com.cn/', 'http://www.ubccn.com/', 'http://www.dicos.com.cn/', 'http://www.yili.com/', 'http://www.pizzahut.com.cn/', 'http://www.quanjude.com.cn/direct.php', 'http://www.nescafe.com.cn/', 'http://www.masterkong.com.cn/', 'http://www.heinz.com.cn/', 'http://www.origus.com/', 'http://www.xfy.com.cn/', 'http://www.haagendazs.com.cn/', 'http://www.wyeth.com.cn/', 'http://www.moutaichina.com/index.asp', 'http://www.tsingtao.com.cn/', 'http://www.meadjohnson.com.cn/', 'http://www.dumex.com.cn/', 'http://www.wuliangye.com.cn/', 'http://www.zkungfu.com/', 'http://www.dovechocolate.com.cn/', 'http://www.ganso.com.cn/%20%20%20', 'http://www.beingmate.com/', 'http://www.waffleboy.com.cn/', 'http://www.holiland.com.cn/', 'http://www.goldenjaguar.com/', 'http://www.huiyuan.com.cn/%20%20%20', 'http://www.hsufuchifoods.com/%20%20%20%20', 'http://www.maybellinechina.com/', 'http://www.dabao.com/', 'http://www.lorealchina.com/', 'http://www.shiseidochina.com/', 'http://www.esteelauder.com.cn/', 'http://www.avon.com.cn/PRSuite/home/home.jsp', 'http://www.tjoy.biz/', 'http://www.lancome.com.cn/_zh/_cn/index.aspx', 'http://www.kose.co.jp/', 'http://www.h2oplus.com.hk/', 'http://www.yuesai.com.cn/', 'http://www.nivea.com.cn/', 'http://www.chanel.com/', 
'http://www.clinique.com.cn/index.tmpl?ngextredir=1', 'http://www.ponds.com.cn/', 'http://www.vichy.com.cn/', 'http://www.efu.org.cn/', 'http://www.laneigechina.com/Front-Page/index2.jsp', 'http://www.olay.com.cn/', 'http://www.guerlain.com.cn/', 'http://www.aupres-shiseido.com.cn/', 'http://www.dior.com/pcd/International/JSP/Home/prehomeFlash.jsp', 'http://www.herborist.com.cn/', 'http://www.dhc.net.cn/', 'http://www.ysl.com/', 'http://www.kose.com.cn/', 'http://www.liangfei.com/', 'http://www.tayoi.com/', 'http://www.chcedo.com/', 'http://www.head-shoulders.com.cn/', 'http://www.slek.com.cn/', 'http://www.mentholatum.com.cn/', 'http://www.pg.com.cn/', 'http://www.china-ajjj.com/', 'http://www.rejoice.com.cn/', 'http://www.cnnice.com/', 'http://www.watsons.com.cn/', 'http://www.unilever.com.cn/', 'http://www.ikea.com/cn/zh/', 'http://www.pantene.com.cn/', 'http://www.colgate.com.cn/app/Colgate/CN/HomePage.cvsp', 'http://www.auchan.com.cn/', 'http://www.c-bons.com.cn/', 'http://www.carrefour.com.cn/', 'http://www.vs.com.cn/', 'http://www.crest.com.cn/', 'http://www.tongrentang.com/', 'http://www.amway.com.cn/index.aspx', 'http://www.wal-martchina.com/', 'http://www.tupperware.com.cn/', 'http://www.ourlotus.com/', 'http://www.skyworth.com/cn/', 'http://www.sony.com.cn/', 'http://www.siemens.com.cn/', 'http://www.gree.com.cn/', 'http://www.shinco.com/', 'http://www.midea.com.cn/', 'http://www.samsung.com.cn/', 'http://www.hitachi-shha.com.cn/', 'http://www.electrolux.com.cn/', 'http://www.toshiba.com.cn/', 'http://www.panasonic.com.cn/', 'http://www.canon.com.cn/', 'http://www.tcl.com/', 'http://www.lg.com.cn/', 'http://cn.changhong.com/', 'http://www.haier.com/', 'http://www.philips.com.cn/', 'http://www.konka.com/', 'http://www.rsd.com.cn/', 'http://www.supor.com.cn/', 'http://www.fotile.com/', 'http://www.cnsuning.com/', 'http://www.sharp.cn/', 'http://www.galanz.com.cn/', 'http://www.chinamacro.cn/', 'http://www.robam.com/', 'http://www.gome.com.cn/', 
'http://www.joyoung.com.cn/', 'http://www.staccato.com/', 'http://www.meiling.com/', 'http://www.fushibao.com/', 'http://www.sacon.cn/', 'http://www.yongle.com.cn/', 'http://www.xinfei.com/']

    # Wall-clock start of the run and the count of pages finished so far.
    begin = time.time()
    number = 0
    def show(c):
        """Success callback: report progress and the first absolute link.

        Prints the running count with the elapsed time, then the page URL.
        If the body contains ``href="http://``, also prints up to 200
        characters starting there and the link text up to its closing quote.
        Nothing more is printed for a page without such a link.

        Parameters:
            c: finished handle exposing ``url`` and ``content.getvalue()``.
        """
        global number
        number += 1
        print("%s cost time %s" % (number, time.time() - begin))
        print(c.url)
        content = c.content.getvalue()
        marker = 'href="http://'
        pos = content.find(marker)
        if pos == -1:
            # No absolute link: slicing from -1 would print a junk fragment.
            return
        print(content[pos:pos + 200])
        end = content.find('"', pos + len(marker))
        if end == -1:
            # Unterminated attribute: keep everything up to the end instead
            # of letting -1 silently drop the last character.
            end = len(content)
        print("find url %s" % content[pos:end])
    # Drop duplicate URLs so each address is queued only once.
    link = set(link)
    print "total link", len(link)
    # Queue every URL on the shared background opener; show() is passed as
    # the callback for successful downloads.
    for i in link:
        urlopen(i, show)
    # NOTE(review): join() returns only when the worker thread's run()
    # finishes, and nothing in this script asks the worker to finish, so
    # the final timing line below may never be reached -- confirm.
    _opener.join()
    print "cost time", time.time() - begin

1.1. 相关文献

2. 反馈

PageComment2

zspy (last edited 2009-12-25 07:15:17 by localhost)