# Web Crawler Notes

2016-03-26

This post is reorganized from [the previous version](http://rockyblog.cn/blog/37/). The code was written quite a while ago — some of it is not pretty and some of it may no longer run — but the ideas are still the same.

[TOC]

## The super-simple version

This approach works for static pages on sites without much of an anti-scraping mechanism. As an example, here is a script that batch-downloads pictures from meizitu.com. It probably cannot be used directly any more and needs a few tweaks, since it was written long ago. It only uses the most basic knowledge — although, being a bit impatient, I made it multi-threaded.

```python
#-*- coding:utf-8 -*-
'''
Created on 2015-05-25
Batch image downloader.
Depends on BeautifulSoup.
'''
import uuid
import urllib
import threading
import os

from bs4 import BeautifulSoup

dist = "f:\\photo\\"   # output directory (Windows path)
if not os.path.exists(dist):
    os.mkdir(dist)


def parse_first():
    """Parse the front page and collect the tag links."""
    url = "http://www.meizitu.com/"
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    alist = soup.find('div', class_='tags').find_all('a')
    hrefs = [row.get('href') for row in alist]
    return hrefs


def parse_page(soup):
    """Collect the pagination links of a listing page."""
    div = soup.find('div', id="wp_page_numbers")
    page_list = []
    if div:
        alist = div.find_all('a')
        page_list = [row.get('href') for row in alist]
        page_list = ["http://www.meizitu.com/a/" + url for url in page_list]
    return page_list


def page_subpage(soup):
    """Collect the detail-page links of a listing page."""
    page_list = []
    divs = soup.find_all('div', class_="pic")
    for row in divs:
        a = row.find('a')
        href = a.get('href')
        page_list.append(href)
    return page_list


def getimage(all_list):
    """Download every image found on the given detail pages."""
    all_list = list(set(all_list))
    for url in all_list:
        html = urllib.urlopen(url)
        soup = BeautifulSoup(html)
        div = soup.find_all('div', id="picture")[0]
        imgs = div.find_all('img')
        srcs = [row.get('src') for row in imgs]
        for src in srcs:
            name = dist + threading.currentThread().getName() + uuid.uuid4().hex + '.jpg'
            print name
            urllib.urlretrieve(src, name)


def handle_subpage(first_urls):
    """Control flow: tag page -> pagination -> detail pages -> images."""
    all_page_list = []
    for url in first_urls:
        html = urllib.urlopen(url)
        soup = BeautifulSoup(html)
        page_list = parse_page(soup)
        all_page_list.extend(page_list)
    print all_page_list
    for url in all_page_list:
        try:
            html = urllib.urlopen(url)
        except Exception, e:
            print e
            continue
        soup = BeautifulSoup(html)
        ditali_list = page_subpage(soup)
        if ditali_list:
            ditali_list = list(set(ditali_list))
            getimage(ditali_list)


if __name__ == '__main__':
    first_list = parse_first()
    print len(first_list)
    perlen = len(first_list) / 4 + 1
    threads = []
    for i in range(4):
        if i == 3:
            subs = first_list[3 * perlen:len(first_list)]
        else:
            subs = first_list[i * perlen:(i + 1) * perlen]
        th = threading.Thread(target=handle_subpage, name="thread" + str(i), args=(subs,))
        threads.append(th)
    for thr in threads:
        thr.start()
```
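Stripped of the site-specific details, the whole script above boils down to: fetch a listing page, collect links, download what you find. Here is a minimal sketch of that same pattern with the more modern `requests` library — the URL and output directory are placeholders for illustration, not part of the original script:

```python
# Minimal sketch of the static-page pattern: fetch -> parse -> download.
# http://example.com/gallery is a placeholder URL, not a real target site.
import os

import requests
from bs4 import BeautifulSoup


def download_images(list_url, out_dir="photos"):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    html = requests.get(list_url, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")
    for i, img in enumerate(soup.find_all("img")):
        src = img.get("src")
        if not src or not src.startswith("http"):
            continue
        resp = requests.get(src, timeout=10)  # fetch the image itself
        with open(os.path.join(out_dir, "%d.jpg" % i), "wb") as f:
            f.write(resp.content)


if __name__ == "__main__":
    download_images("http://example.com/gallery")
```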
## The slightly more complex version

What makes this one more complex is that you have to trace where the site's requests come from and where they go. That requires some understanding of the HTTP protocol, plus reasonable familiarity with the developer tools in browsers such as Chrome or Firefox. Below is a crawler for Jiayuan (jiayuan.com) as an example.

```python
from pymongo import MongoClient
import requests
import json
import model   # the author's own data-model helpers (not shown in this post)
import util    # the author's own utility helpers (not shown in this post)
import Queue
import copy
import random
import time


class Spider():

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 Chrome/43.0.2357.130 Safari/537.36',
        }
        # self.headers['Cookie'] = cookies
        self.post_data = {
            'sex': 'f',
            'key': '',
            'stc': '1:11,2:18.28,3:155.170,23:1',
            'sn': 'default',
            'sv': 1,
            'p': 1,
            'f': '',
            'listStyle': 'bigPhoto',
            'pri_uid': 0,
            'jsversion': 'v5'
        }
        self.citys = Queue.Queue()
        citys = util.getcitys()
        for city in citys:
            self.citys.put(city)
        self.save_file = util.FileUtile()
        self.starturl = 'http://search.jiayuan.com/v2/search_v2.php'
        self.session = requests.Session()
        self.session.headers = self.headers
        client = MongoClient()
        db = client['jiayuan']
        self.collection = db['meizi']

    def getresource(self):
        """Walk through the search result pages for the current filter."""
        totalpage = 1000
        repeatpages = 0
        for page in xrange(1, totalpage + 1):
            if repeatpages >= 2:
                return False
            self.post_data['p'] = page
            print self.post_data
            response = self.session.post(self.starturl, data=self.post_data)
            result = response.content
            # The endpoint wraps its JSON in extra markers; strip them before parsing.
            result = result.replace('##jiayser##', '').replace('//', '').strip()
            json_result = json.loads(result)
            if totalpage == 1000:
                totalpage = json_result.get('pageTotal')
            print json.dumps(json_result, ensure_ascii=False)
            usersinfo = json_result.get('userInfo')
            indbs = 0
            for row in usersinfo:
                temp_meizi = model.Meizi()
                temp_meizi._id = row.get('realUid')
                print temp_meizi._id
                temp_meizi.infodict = row
                detailurl = 'http://www.jiayuan.com/%d?fxly=search_v2' % (temp_meizi._id)
                print detailurl
                print self.collection.find({'_id': temp_meizi._id}).count()
                if self.collection.find({'_id': temp_meizi._id}).count() > 0:
                    # Already in the database; too many repeats means this filter is exhausted.
                    indbs += 1
                    if indbs >= 20:
                        repeatpages += 1
                    continue
                indbs = 0
                repeatpages = 0
                self.collection.save(temp_meizi.todict())
                detilhtml = requests.get(detailurl)
                self.save_file.save(detilhtml.text, str(temp_meizi._id))

    def handle(self):
        print self.citys.qsize()
        while not self.citys.empty():
            for startage in range(18, 29):
                endage = startage + 2
                city = self.citys.get()
                self.post_data['stc'] = '1:%d,2:%d.%d,3:155.170,23:1' % (city, startage, endage)
                print self.post_data
                try:
                    self.getresource()
                except Exception, e:
                    print e
                    continue


if __name__ == '__main__':
    spider = Spider()
    spider.handle()
```
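The essence of this approach: open the browser's Network panel, find the request that actually returns the data (often a POST returning JSON), and replay it with a `requests` session. A minimal, generic sketch of that idea — the URL and form fields below are placeholders for illustration, not Jiayuan's real API:

```python
# Sketch of replaying a request discovered in the browser's dev tools.
# http://example.com/search.php and the form fields are placeholders.
import json

import requests

session = requests.Session()
# Copy the User-Agent (and any required cookies) from the real browser request.
session.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) Chrome/43.0.2357.130 Safari/537.36'


def fetch_page(page):
    # Send the same form fields the browser sends, with only the page number changing.
    form = {'key': '', 'p': page, 'listStyle': 'bigPhoto'}
    resp = session.post('http://example.com/search.php', data=form, timeout=10)
    # Some endpoints wrap the JSON in extra markers (as search_v2.php above does);
    # strip them before parsing.
    body = resp.text.replace('##jiayser##', '').replace('//', '').strip()
    return json.loads(body)


if __name__ == '__main__':
    print(json.dumps(fetch_page(1), ensure_ascii=False)[:200])
```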
## The even more complex version

Sometimes the target site sets cookies that are very hard to obtain by hand — for example it checks things like your browser window size — so you need a real browser in the loop. With Python I have used two tools for this: [Ghost.py](https://github.com/jeanphix/Ghost.py) and [Selenium](http://selenium-python.readthedocs.org/).

### Ghost.py

```python
#!/usr/bin/env python
# encoding=utf-8
from ghost import Ghost
from ghost import TimeoutError
from bs4 import BeautifulSoup
from pymongo import MongoClient

page = None
resourses = None


class Spider():

    def __init__(self):
        self.starturl = 'http://search.zhenai.com/search/getfastmdata.jsps'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 Chrome/43.0.2357.130 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        self.gh = Ghost(wait_timeout=30, download_images=False)
        self._openurl()
        client = MongoClient()
        db = client['jiayuan']
        self.collect = db['zhenids']
        self.provinces = ['10102000', '10103000', '10101002', '10101201', '10105000',
                          '10104000', '10101000', '10118000', '10131000', '10127000',
                          '10107000', '10124000', '10115000', '10112000', '10125000',
                          '10121000', '10120000', '10117000', '10114000', '10106000',
                          '10119000', '10113000', '10116000', '10109000', '10111000',
                          '10110000', '10130000', '10128000', '10126000', '10108000',
                          '10123000', '10122000', '10129000']

    def _setcitys(self, province, city):
        """Set the province/city form fields."""
        page, resourses = self.gh.set_field_value('input[id="regworkProvince"]', city)
        page, resourses = self.gh.set_field_value('input[id="regworkCity"]', city)
        page, resourses = self.gh.set_field_value('input[id="areaFormWorkProvince"]', province)
        page, resourses = self.gh.set_field_value('input[id="areaForm_workCity"]', city)

    def _setforms(self, agebegin='18', ageend='28'):
        """Set the age range and gender fields."""
        agebegin = str(agebegin)
        ageend = str(ageend)
        page, resourses = self.gh.fill('#hsmore', {'agebegin': agebegin, 'ageend': ageend, 'gender': '1'})
        page, resourses = self.gh.set_field_value('#age1_tmp', agebegin)
        page, resourses = self.gh.set_field_value('#age2_tmp', ageend)

    def _openurl(self):
        page, resourses = self.gh.open(self.starturl, method='post', timeout=200, headers=self.headers)

    def _commit_selform(self):
        """Submit the filter form."""
        page, resourses = self.gh.call("#hsmore", "submit", expect_loading=True)
        self.gh.wait_for_page_loaded()

    def _gotopage(self, page):
        """Jump to the given result page."""
        page, resourses = self.gh.evaluate('window.gotopage(%d);' % page, expect_loading=True)
        print len(resourses)
        self.gh.wait_for_page_loaded()

    def _get_userid(self):
        """Collect profile links from the current result page."""
        soup = BeautifulSoup(self.gh.content)
        ul = soup.find('ul', class_='search_user_photo clearfix')
        lis = ul.find_all('li')
        las = [li.find('a', target='_blank') for li in lis]
        hrefs = [a.get('href') for a in las]
        print hrefs
        for href in hrefs:
            if self.collect.find({'_id': href}).count() > 0:
                continue
            self.collect.save({'_id': href, 'isGet': 0})

    def parse_page(self):
        """Walk provinces, cities and age ranges, paging through each result set."""
        for prov in self.provinces:
            self._setcitys(prov, '-1')
            self._setforms()
            try:
                self._commit_selform()
            except TimeoutError, e:
                print e
                continue
            pagesoup = BeautifulSoup(self.gh.content)
            citys = pagesoup.find('div', class_='city_box')
            als = citys.find_all('a')
            citys = [a.get('v') for a in als]
            for city in citys:
                self._setcitys(prov, city)
                for agebegin in range(18, 29):
                    for ageend in range(18, 29):
                        self._setforms(agebegin, ageend)
                        self._commit_selform()
                        self._get_userid()
                        for page in range(2, 50):
                            try:
                                self._gotopage(page)
                                self._get_userid()
                            except TimeoutError, e:
                                print e
                            except AttributeError, e:
                                print e
                                break
```

### The Selenium version

```python
#!/usr/bin/env python
# encoding=utf-8
from selenium import webdriver
from bs4 import BeautifulSoup
from pymongo import MongoClient
import re


class Spider():

    def __init__(self):
        self.webdriver = webdriver.Firefox(timeout=60)
        self.webdriver.get('http://search.zhenai.com/search/getfastmdata.jsps')
        element = self.webdriver.find_element_by_id('sex')
        element.send_keys(1)
        self.provinces = ['10102000', '10103000', '10101002', '10101201', '10105000',
                          '10104000', '10101000', '10118000', '10131000', '10127000',
                          '10107000', '10124000', '10115000', '10112000', '10125000',
                          '10121000', '10120000', '10117000', '10114000', '10106000',
                          '10119000', '10113000', '10116000', '10109000', '10111000',
                          '10110000', '10130000', '10128000', '10126000', '10108000',
                          '10123000', '10122000', '10129000']
        client = MongoClient()
        db = client['jiayuan']
        self.collect = db['zhenids']

    def click_detail_search(self):
        button = self.webdriver.find_element_by_xpath('//a[@class="btn_orange_L"]')
        button.click()

    def click_down_detail(self):
        down_element = self.webdriver.find_element_by_id('searchSlideDown')
        down_element.click()

    def parse_page(self):
        for province in self.provinces:
            self.click_down_detail()
            pro_element = self.webdriver.find_element_by_id('areaFormWorkProvince')
            pro_element.send_keys(province)
            button = self.webdriver.find_element_by_xpath('//a[@class="btn_search_c"]')
            button.click()
            print "province:%s" % province
            pagesoup = BeautifulSoup(self.webdriver.page_source)
            citys = pagesoup.find('div', class_='city_box')
            als = citys.find_all('a')
            citys = [a.get('v') for a in als]
            for city in citys:
                print "city:%s" % city
                self.click_down_detail()
                pelement = self.webdriver.find_element_by_id('regworkProvince_tmp')
                pelement.send_keys(province)
                p2element = self.webdriver.find_element_by_id('regworkProvince')
                p2element.send_keys(province)
                c_element = self.webdriver.find_element_by_id('regworkCity')
                c_element.send_keys(city)
                self.click_detail_search()
                for beginage in range(18, 29):
                    self.click_down_detail()
                    begin_el = self.webdriver.find_element_by_id('age1')
                    begin_el.send_keys(beginage)
                    self.click_detail_search()
                    for enage in range(18, 29):
                        self.click_down_detail()
                        end_el = self.webdriver.find_element_by_id('age2')
                        end_el.send_keys(enage)
                        self.click_detail_search()
                        soup = BeautifulSoup(self.webdriver.page_source)
                        div_a = soup.find('div', id='setpage')
                        if not div_a:
                            continue
                        all_a = div_a.find_all('a')
                        # Work out the number of result pages from the pagination links.
                        all_num = [int(re.search(r'\d+', row.get('href')).group()) for row in all_a]
                        total_page = max(all_num)
                        print total_page
                        for page_num in range(1, total_page):
                            try:
                                page_element = self.webdriver.find_element_by_xpath('//a[text()=%d]' % page_num)
                                page_element.click()
                                self._get_userid()
                                if page_num == 50:
                                    break
                            except Exception, e:
                                print e
                                break

    def _get_userid(self):
        """Collect profile links from the current result page."""
        soup = BeautifulSoup(self.webdriver.page_source)
        ul = soup.find('ul', class_='search_user_photo clearfix')
        lis = ul.find_all('li')
        las = [li.find('a', target='_blank') for li in lis]
        hrefs = [a.get('href') for a in las]
        loops = 0
        print hrefs
        for href in hrefs:
            if self.collect.find({'_id': href}).count() > 0:
                loops += 1
                if loops == 20:
                    raise Exception('allin db')
                continue
            self.collect.save({'_id': href, 'isGet': 0})
```
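One thing worth noting about the Selenium code above: it clicks and then reads `page_source` right away, which breaks easily when the page has not finished loading. Explicit waits make this much more robust. A small sketch, assuming the same Firefox driver — the URL is a placeholder, and the `setpage` id is borrowed from the pagination block above:

```python
# Sketch: wait explicitly for an element instead of reading the page immediately.
# http://example.com/search is a placeholder URL.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox()
try:
    driver.get('http://example.com/search')
    # Block for up to 15 seconds until the pagination block appears in the DOM.
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, 'setpage'))
    )
    html = driver.page_source  # now safe to hand off to BeautifulSoup
finally:
    driver.quit()
```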
## The JS approach

Besides Python, JavaScript can sometimes be a better fit, and there are plenty of options: PhantomJS, Node.js, CasperJS. Personally, though, I feel you give up a lot of flexibility — on that front it cannot compete with Python. Here is a CasperJS example:

```javascript
var casper = require('casper').create({
    "User-Agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36',
    logLevel: "info",
    verbose: true
});

casper.start('http://search.zhenai.com/search/getfastmdata.jsps', function() {
    // Fill the filter form and submit it.
    this.fill('form#hsmore', {
        'gender': 1
    }, true);
});

casper.then(function() {
    // Die with a message if the expected text is not on the page.
    this.evaluateOrDie(function() {
        return /message sent/.test(document.body.innerText);
    }, 'sending message failed');
});

casper.run(function() {
    this.echo('message sent').exit();
});
```