
Sharing a crawler script I wrote in Python a while back - 放逐的记忆

zihua | 2014-01-20 23:01:58 | Views: 702


At the time I had no internet access after work, so to keep up with a few sites I was interested in, I wrote this script to crawl the pages during the day and read them offline in the evening. There is still plenty of room for improvement: for example, it doesn't pretend to be a real browser, so when I pointed it at my own blog it was blocked by CloudFlare (a sketch of that tweak follows the script below). If you're interested, feel free to adapt it yourself. ^_^

__author__ = 'jj'

import urllib2, re, os, sys, time, hashlib
import traceback
import threading
import Queue

rootDir = "F:/tmp/winyay.com"
firstPage = "http://t.winyay.com/"
prefix = ["http://t.winyay.com/"]
downloadThreadCnt = 4
childUrlsQueue = Queue.Queue(10000)
downloadQueue = Queue.Queue(10000)
mutex = threading.RLock()
stopSig = False


class DownloadThread(threading.Thread):
    # Worker thread: pulls (level, url, filePath) jobs from downloadQueue, saves the
    # page to disk, rewrites in-scope links to local paths and feeds the URLs it
    # found back to the spider via childUrlsQueue.
    def __init__(self, depth=3):
        threading.Thread.__init__(self)
        self.depth = depth
        self.level, self.url, self.filePath = 0, None, None
        self.timeout = 60  # exit after this many idle seconds with no URLs left to download/parse

    def download(self):
        # Fetch self.url and write the raw response to self.filePath.
        try:
            import socket
            socket.setdefaulttimeout(120)
            fp = urllib2.urlopen(self.url)
            fDir = os.path.dirname(self.filePath)
            if not os.path.exists(fDir):
                os.makedirs(fDir)
            op = open(self.filePath, 'wb')
            try:
                op.write(fp.read())
            finally:
                op.close()
        except Exception, e:
            print e

    def parseUrls(self):
        # Extract href/src/url(...) references from the saved file and rewrite
        # in-scope links so they point at their local copies.
        newUrls = []
        if os.path.exists(self.filePath):
            op = open(self.filePath, 'rb+')
            try:
                try:
                    content = op.read()
                    newUrls = re.findall(r'href=[\'"]([^#]+?)[\'"]', content)
                    newUrls.extend(re.findall(r'src=[\'"](.+?)[\'"]', content))
                    newUrls.extend(re.findall(r'url\([\'"](.+?)[\'"]\)', content))
                    for url in newUrls:
                        # skip URLs that match none of the allowed prefixes
                        if len([p for p in prefix if not re.search(p, url)]) == len(prefix):
                            continue
                        fPath = genLocalFilePath(url)
                        if fPath and re.search(rootDir + "/*$", fPath) is None:
                            content = content.replace(url + '"', fPath + '"')
                            content = content.replace(url + "'", fPath + "'")
                    op.seek(0)
                    op.write(content)
                    op.truncate()  # drop leftover bytes if the rewritten content is shorter
                except Exception, e:
                    print traceback.format_exc()
            finally:
                op.close()
        return newUrls

    def run(self):
        global downloadThreadCnt, stopSig
        timeoutSig = self.timeout
        while True:
            try:
                self.level, self.url, self.filePath = downloadQueue.get_nowait()
            except Queue.Empty:
                #mutex.acquire()
                #print ' '.join((self.getName(), self.level, self.url, self.filePath))
                #mutex.release()
                if self.level > self.depth:
                    break
                time.sleep(1)
                timeoutSig -= 1
                if not timeoutSig:
                    print ' '.join(('no urls to parse,', self.getName(), 'exit. '))
                    break
                continue
            timeoutSig = self.timeout
            print self.url + ' ---> ' + self.filePath
            self.download()
            childUrls = self.parseUrls()
            childUrlsQueue.put((self.level, self.url, childUrls))
        # Last worker to exit tells the spider thread to stop as well.
        mutex.acquire()
        downloadThreadCnt -= 1
        if not downloadThreadCnt:
            print 'all parse threads exit, stop main thread.'
            stopSig = True
        mutex.release()


def genLocalFilePath(url):
    # Map a URL to a file path under rootDir: query strings become an md5 suffix,
    # reserved Windows device names and illegal path characters are sanitized.
    tmp = re.sub(r'[a-zA-Z]+://((\w+\.)+(\w{1,6}))', rootDir, url.strip())
    ts = str(tmp).split('?')
    if len(ts) > 1:
        tmp = re.sub(r'\?.+$', '_' + hashlib.md5("".join(ts[1:])).hexdigest() + '.html', tmp)
    else:
        tmp = re.sub(r'\?.+$', '', tmp)
    tmp = re.sub(r'/nul/', '/nul1/', tmp)
    tmp = re.sub(r'/aux/', '/aux1/', tmp)
    tmp = re.sub(r'/con/', '/con1/', tmp)
    tmp = re.sub(r'/com1/', '/com11/', tmp)
    tmp = re.sub(r'/lpt1/', '/lpt11/', tmp)
    tmp = re.sub(r'\\|\*|\?|"|<|>|\|', '', tmp)
    tmp = re.sub(r'/+$', '/', tmp)
    if tmp.endswith('/'):
        tmp += 'default-index-page.html'
    elif rootDir == tmp:
        tmp += '/default-index-page.html'
    return tmp


def spider():
    # Dispatcher: resolves relative URLs, filters out-of-scope and already-seen
    # URLs, and queues (level, url, filePath) jobs for the download threads.
    historyUrls = []
    childUrlsQueue.put((0, firstPage, [firstPage]))
    while not stopSig:
        level, parentUrl, urls = None, None, None
        try:
            level, parentUrl, urls = childUrlsQueue.get_nowait()
        except Queue.Empty:
            time.sleep(0.2)
            continue
        #print urls
        for url in urls:
            if not re.search(r'^[a-zA-Z]+://.+?/?', url):
                url = '/'.join(parentUrl.split('/')[:-1] + [url])
            if len([p for p in prefix if not re.search(p, url)]) == len(prefix):
                print 'skip, unknown url:', url
                continue
            if url in historyUrls:
                continue
            filePath = genLocalFilePath(url)
            if filePath:
                if os.path.exists(filePath):
                    print 'skip, exist file:', filePath
                    continue
                historyUrls.append(url)
                downloadQueue.put((level + 1, url, filePath))


if __name__ == '__main__':
    threading.Thread(target=spider).start()
    for i in range(0, downloadThreadCnt):
        DownloadThread().start()
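As mentioned above, the script issues bare urllib2 requests, which simple bot filters (CloudFlare included) may reject. Below is a minimal sketch, not part of the original script, of sending browser-like request headers instead; the helper name open_url and the header values are illustrative assumptions, and a JavaScript challenge would still not be solved by headers alone.

import urllib2

# Hypothetical helper (not in the original script): fetch a URL while presenting
# browser-like headers. The User-Agent and Accept values are only examples.
def open_url(url, timeout=120):
    req = urllib2.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:26.0) Gecko/20100101 Firefox/26.0',
        'Accept': 'text/html,application/xhtml+xml,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.5',
    })
    return urllib2.urlopen(req, timeout=timeout)

# DownloadThread.download() could then call open_url(self.url)
# instead of urllib2.urlopen(self.url).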

Original link: http://www.wumii.com/item/5GoPNxIb
