##language:zh #pragma section-numbers on ''' 爬书虫 ''' ::-- ZoomQuiet [<>] <> ## 默许导航,请保留 <> = 起点爬虫 = {{{朱丹 hide details 3:42 pm (9 minutes ago) reply-to python-cn@googlegroups.com to python-cn@googlegroups.com date Aug 6, 2007 3:42 PM subject [CPyUG:30059] 给爱看小说的朋友_起点小说爬虫 mailed-by googlegroups.com }}} 最近闹书荒,不知道看什么书好. 起点上推荐的,排行榜上靠前的烂书真不少,不能用那个来判定. 而且下的时候麻烦死了,点好多下 我下txt格式(手机上用)的,,经常下不下来. 好不容易下下来了,还贼难看... 下下来的书名还是一堆id数字,还要自己去修改书名.....麻烦.. 干脆用python写了一个程序,下的那叫一个爽.. 稀里哗啦下了200M的书,慢慢看了..这个爬虫也是一顺的就写完了,没做什么修改,也懒得去改了,能用就行. 下的时候需要提供你要下的页面的url,比如http://www.cmfu.com 会找到这个页面上所有的书来下载 {{{#!python #@+leo-ver=4-thin-encoding=gb2312,. #@+node:BIGZHU.20070731160918:@thin d:/bigzhu/python/python_project/get_cmfu.py #@+at #@nonl # 起点小说爬虫 #@-at #@@c #@@language python #@+others #@+node:BIGZHU.20070731161308:import import httplib,urllib2,urllib,cookielib,re,threading import os #@nonl #@-node:BIGZHU.20070731161308:import #@+node:BIGZHU.20070731160928:getCookie def getCookie(): cj = cookielib.CookieJar()#建立Cookie实例 opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))#建立opener与Cookie关联 return opener #@-node:BIGZHU.20070731160928:getCookie #@-others #@<> #@+node:BIGZHU.20070731160918.1:<> def getBookIdList(opener,urlList): BookIdList = [] for i in urlList: url=i print url request = urllib2.Request(url) cmfu = opener.open(request).read() #cmfuURL = re.findall("> #@nl #@<> #@+node:BIGZHU.20070731164705:<> def getBookName(opener,bookId=''): if bookId == '': print "传入BookIdList是空的" bookURL = 'http://www.cmfu.com/readbook.asp?bl_id=%s'%bookId request = urllib2.Request(bookURL) bookPage = opener.open(request).read() opener.close() bookname = re.findall('bookname=\S{1,}',bookPage) bookname = [re.sub("bookname=",'',k) for k in bookname] bookname = [re.sub('"','',k) for k in bookname][0] return bookname #@-node:BIGZHU.20070731164705:<> #@nl #@<> #@+node: BIGZHU.20070731171721:<> def getTextFile(opener,bookId): bookName = getBookName(opener,bookId) #判断文件是否已经存在 if os.path.isfile(os.getcwd()+"\\起点\\%s.txt"%bookName): print "%s 已经存在"%bookName else: url = 'http://download.cmfu.com/pda/%s.txt'%bookId try: bookData = opener.open(url).read() except : print "2 %s"%bookName try: bookData = opener.open(url).read() except : print "last try %s"%bookName try: bookData = opener.open(url).read() except : print "end try %s"%bookName opener.close() f=open(os.getcwd()+"\\起点\\%s.txt"%bookName,"wb") f.write(bookData) f.close() print 'get book %s 完毕'%bookName #@-node:BIGZHU.20070731171721:<> #@nl #@<> #@+node:BIGZHU.20070801172939:<> class runGetFile(threading.Thread): def __init__(self,bookId): threading.Thread.__init__(self) self.bookId = bookId #self.opener = opener def run(self): opener = getCookie() getTextFile(opener,self.bookId) #@nonl #@-node: BIGZHU.20070801172939:<> #@nl #@<> #@+node:BIGZHU.20070802171013:<> class ProcessURL: """对新输入url,save 到ini中 对已有url,忽视 每次使用,自动读取ini的url,提供使用""" def __init__(self): pass #@ <> #@+node:BIGZHU.20070802171013.1:<> def saveURL(self,urlList=[]): '''存储新的url到URL.ini中''' try: f=open(os.getcwd()+"\\起点\\URL.ini","wb")#追加内容 except IOError: print "文件打开错误" #格式化成字符串 s_urlList = ";".join(urlList) f.write(s_urlList) f.close() #@-node:BIGZHU.20070802171013.1:<> #@nl #@ <> #@+node:BIGZHU.20070802171013.2:<> def getURLIni(self): """读取 URL.ini中的url 返回一个URL list""" #判断目录是否存在 if os.path.exists (os.getcwd()+"\\起点"): pass else: print "创建目录 \起点" os.mkdir("起点") iniData='' if os.path.isfile(os.getcwd ()+"\\起点\\URL.ini"): f=open(os.getcwd()+"\\起点\\URL.ini","rb") iniData = f.read() f.close() else: print "URL.txt不存在,创建之" f=open(os.getcwd()+"\\起点\\URL.ini","wb") #iniData = f.read() f.close() return iniData.split(";")#格式化成list #@-node:BIGZHU.20070802171013.2: <> #@nl #@-node:BIGZHU.20070802171013:<> #@nl #@<
> #@+node:BIGZHU.20070731164705.1:<
> if __name__ == '__main__': opener = getCookie() #urlList =["http://www.cmfu.com/index.asp"," http://www.cmfu.com/listbookqb.asp?pageid=2007-8-1%2012:26&status=down","http://www.cmfu.com/listbookqb.asp?pageid=2007-7-31%2023:03&status=down ","http://www.cmfu.com/index_wxxx.asp"] #存放和读取url urlType = ProcessURL() urlList = urlType.getURLIni() saveIni = 0 # 标识是否有url 更新 while True: url = raw_input("要截取的起点的某个页面: ") if url=='': break if url in urlList: print "%s 已有,忽视之"%url else: urlList.extend([url]) print "%s 是新的,添加之"%url saveIni =1 #url = 'http://www.cmfu.com/index.asp' bookIdList=getBookIdList(opener,urlList) for i in bookIdList: thread = runGetFile(i) thread.start() #存储到ini中 if saveIni == 1: urlType.saveURL(urlList) #@-node:BIGZHU.20070731164705.1:<
> #@nl #@nonl #@-node:BIGZHU.20070731160918:@thin d:/bigzhu/python/python_project/get_cmfu.py #@-leo }}} == 反馈 ==