知乎的反爬虫机制越来越严了,一不小心就被封号,只好使用代理IP,延长休息时间,好不容易爬了4000多条数据,到现在还是被封的。
爬虫主要注意以下内容
使用session模拟登陆
爬取策略,爬取每一个人的关注的人作为我们下一次爬取的目标
使用requests访问,用cokie模拟登陆
使用lxml,xpath解析页面
使用mongodb存储数据
使用redis做为任务队列
crawler.py
import requests from lxml import html from db import Zhihu_User_Profile from red_filter import check_url, re_crawl_url import random import time from bs4 import BeautifulSoup class SpiderProxy(object): “””黄哥Python培训 黄哥所写 Python版本为2.7以上””” headers = { “Host”: “www.xicidaili.com”, “User-Agent”: “Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0”, “Accept”: “text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8”, “Accept-Language”: “en-US,en;q=0.5”, “Accept-Encoding”: “gzip, deflate”, “Referer”: “http://www.xicidaili.com/wt/1”, } def __init__(self, session_url): self.req = requests.session() self.req.get(session_url) def get_pagesource(self, url): html = self.req.get(url, headers=self.headers) return html.content def get_all_proxy(self, url, n): data = [] for i in range(1, n): html = self.get_pagesource(url + str(i)) soup = BeautifulSoup(html, “lxml”) table = soup.find(‘table’, id=”ip_list”) print(type(table)) for row in table.findAll(“tr”): cells = row.findAll(“td”) tmp = [] for item in cells: tmp.append(item.find(text=True)) data.append(tmp[1:3]) return data class Zhihu_Crawler(): ”’ basic crawler ”’ def __init__(self,url,option=”print_data_out”): ”’ initialize the crawler ”’ self.option=option self.url=url self.header={} self.header[“User-Agent”]=”Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:43.0) Gecko/20100101 Firefox/43.0″ # self.header[“Host”]=”www.zhihu.com” self.header[“Referer”]=”www.zhihu.com” #cookie self.cookies={“q_c1″:”a57da482190149c598ea41970ed39289|1472689490000|1472689490000”, “_xsrf”:”bfdb84f58409e2f2b028b44a3e902d0b”, “d_c0″:”AABAwkDceAqPTpwAGa9drbEiAVyxwnKIdLk=|1472689493″, ” _zap”:”56de3379-1be2-4fd4-8e32-e78e16c2d2e5″, “_za”:”01619ad1-d465-4e22-89a4-8ac36b669864″, “a_t”:'”2.0AADAlWcvAAAXAAAAB1vzVwAAwJVnLwAAAIDALVtgcQoXAAAAYQJVTdqD51cA5BWpP3Ia7DK9D31FNsVyKP043vAvISraCDclEqORCO9pgoL7d4uCQA==”‘, “z_c0″:”Mi4wQUFEQWxXY3ZBQUFBZ01BdFcyQnhDaGNBQUFCaEFsVk4yb1BuVndEa0Zha19jaHJzTXIwUGZVVTJ4WElvX1RqZThB|1472974343|dd5d7646eb4742626b9b40fd542bfc3dae0cb33b”, } def send_request(self): ”’ send a request to get HTML source ”’ added_followee_url = self.url.decode(“utf-8”) + “/followees” temp=random.randint(0,10) if temp<5: print("sleeping") time.sleep(3) session_url = 'http://www.xicidaili.com/wt/1' url = 'http://www.xicidaili.com/wt/' #p = SpiderProxy(session_url) proxy_ip ={ "202.43.147.226", "117.166.183.111", "125.33.207.59", "218.94.149.147", "113.5.211.198", "110.73.8.230", "183.254.228.144", "111.1.3.36", "114.232.70.191", "183.224.99.151", "114.113.126.32" } try: r = requests.get(added_followee_url, cookies=self.cookies, headers=self.header, verify=False) except: re_crawl_url(self.url) print("here1") return content = r.text #print(content) if r.status_code == 200: self.parse_user_profile(content) def process_xpath_source(self, source): if source: #print(source[0]) return source[0] else: return '' def parse_user_profile(self, html_source): ''' parse the user's profile to mongo ''' # initialize variances self.user_name = '' self.fuser_gender = '' self.user_location = '' self.user_followees = '' self.user_followers = '' self.user_be_agreed = '' self.user_be_thanked = '' self.user_education_school = '' self.user_education_subject = '' self.user_employment = '' self.user_employment_extra = '' self.user_info = '' self.user_intro = '' tree = html.fromstring(html_source) # parse the html via lxml self.user_name = self.process_xpath_source(tree.xpath("//a[@class='name']/text()")) self.user_location = self.process_xpath_source(tree.xpath("//span[@class='location item']/@title")) self.user_gender = self.process_xpath_source(tree.xpath("//span[@class='item gender']/i/@class")) if "female" in self.user_gender and self.user_gender: self.user_gender = "female" else: self.user_gender = "male" self.user_employment = self.process_xpath_source(tree.xpath("//span[@class='employment item']/@title")) self.user_employment_extra = self.process_xpath_source(tree.xpath("//span[@class='position item']/@title")) self.user_education_school = self.process_xpath_source(tree.xpath("//span[@class='education item']/@title")) self.user_education_subject = self.process_xpath_source( tree.xpath("//span[@class='education-extra item']/@title")) try: self.user_followees = tree.xpath("//div[@class='zu-main-sidebar']//strong")[0].text self.user_followers = tree.xpath("//div[@class='zu-main-sidebar']//strong")[1].text except: return self.user_be_agreed = self.process_xpath_source( tree.xpath("//span[@class='zm-profile-header-user-agree']/strong/text()")) self.user_be_thanked = self.process_xpath_source( tree.xpath("//span[@class='zm-profile-header-user-thanks']/strong/text()")) self.user_info = self.process_xpath_source(tree.xpath("//span[@class='bio']/@title")) self.user_intro = self.process_xpath_source(tree.xpath("//span[@class='content']/text()")) if self.option == "print_data_out": self.print_data_out() else: self.store_data_to_mongo() # find the follower's url #print("here2") url_list = tree.xpath("//h2[@class='zm-list-content-title']/span/a/@href") #print(url_list) for target_url in url_list: #print(target_url) target_url = target_url.replace("https", "http") check_url(target_url) def print_data_out(self): ''' print out the user data ''' print("*" * 60) print('用户名:%s ' % self.user_name) print("用户性别:%s " % self.user_gender) print('用户地址:%s ' % self.user_location) print("被同意:%s " % self.user_be_agreed) print("被感谢:%s " % self.user_be_thanked) print("被关注:%s " % self.user_followers) print("关注了:%s " % self.user_followees) print("工作:%s/%s" % (self.user_employment, self.user_employment_extra)) print("教育:%s/%s" % (self.user_education_school, self.user_education_subject)) print("用户信息:%s" % self.user_info) print("*" * 60) def store_data_to_mongo(self): ''' store the data in mongo ''' new_profile = Zhihu_User_Profile( user_name=self.user_name, user_be_agreed=self.user_be_agreed, user_be_thanked=self.user_be_thanked, user_followees=self.user_followees, user_followers=self.user_followers, user_education_school=self.user_education_school, user_education_subject=self.user_education_subject, user_employment=self.user_employment, user_employment_extra=self.user_employment_extra, user_location=self.user_location, user_gender=self.user_gender, user_info=self.user_info, user_intro=self.user_intro, user_url=self.url ) new_profile.save() print("saved:%s " % self.user_name) 多进程和携程爬取 import gevent.monkey gevent.monkey.patch_all() import sys import gevent import redis import crawler import time from multiprocessing.dummy import Pool import multiprocessing from red_filter import red, red_queue #red_queue="test_the_url_queue" #red_crawled_set="test_url_has_crawled" process_pool=Pool(multiprocessing.cpu_count()*2) #connect to redis server #wrap the class method def create_new_slave(url,option): new_slave=crawler.Zhihu_Crawler(url,option) new_slave.send_request() return "ok" def gevent_worker(option): while True: url=red.lpop(red_queue) if not url: break create_new_slave(url,option) def process_worker(option): jobs=[] for i in range(2): jobs.append(gevent.spawn(gevent_worker,option)) gevent.joinall() if __name__=="__main__": ''' start the crawler ''' start=time.time() count=0 #choose the running way of using database or not try: option=sys.argv[1] except: option='' if "mongo" not in option: option="print_data_out" #the start page red.lpush(red_queue,"https://www.zhihu.com/people/mo-ming-42-91") url=red.lpop(red_queue) create_new_slave(url,option=option) for i in range(2): gevent_worker(option=option) process_pool.map_async(process_worker,option) process_pool.close() process_pool.join() print("crawler has crawled %d people ,it cost %s" % (count,time.time()-start)) 数据分析 我们先从大学用户数据开始看起: 我们可以看到: 清北和武汉大学雄踞前三个,武汉大学多可能因为我关注的武大的比较多的原因。 接着,很明显前面的大学都是综合性大学,说明越是有多的文科专业的越有知乎上的求知欲 接着,我们看看知乎上面女性大学分布 华科的女生为什么那么多,看来数据量还是太小了,不具有代表性 接下来,可以看一看北京大学专业分布 看来经济类专业的人最喜欢上知乎,遥遥领先 知乎公司分布,可能有点不太准确 用户关系 被关注数(粉丝数)分布: 所以可以看到,知乎是一堆大v带着玩的社区 还有的就是知乎的性别分布了,居然不是1:1!!!!!! 这个可能是因为我关注了轮子哥的缘故 使用的脚本代码如下 # -*- coding: utf-8 -*- # encoding:utf-8 import sys #sys.setdefaultencoding("utf-8") import pandas as pd import matplotlib.pylab as plt import seaborn as sns import numpy as np from matplotlib.font_manager import FontProperties data = pd.read_csv("output.csv") font=FontProperties(fname=r'/usr/share/fonts/truetype/字体管家扁黑体.ttf',size=14) ''' sr=data['user_education_school'].value_counts()[:30] fig = plt.figure() ax = fig.add_subplot(111) ax.set_title("知乎大学分布",fontproperties=font) rects1 = ax.barh(range(len(sr.values)),sr.values[::-1], align = 'center',alpha=0.6,color="purple") ax.set_ylim(-1,30) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0 ,fontproperties=font) ax.set_xlim(0,max(sr.values)*1.1) plt.show() ''' ''' sr=data[data['user_gender']=='female']['user_education_school'].value_counts()[:30] fig = plt.figure() ax = fig.add_subplot(111) ax.set_title("知乎女性大学分布",fontproperties=font) rects1 = ax.barh(range(len(sr.values)),sr.values[::-1], align = 'center',alpha=0.6,color="purple") ax.set_ylim(-1,30) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0 ,fontproperties=font) ax.set_xlim(0,max(sr.values)*1.1) plt.show() ''' ''' sr=data[data['user_education_school']=='北京大学']['user_education_subject'].value_counts()[:30] fig = plt.figure() ax = fig.add_subplot(111) ax.set_title("知乎北京大学专业分布",fontproperties=font) rects1 = ax.barh(range(len(sr.values)),sr.values[::-1], align = 'center',alpha=0.6,color="purple") ax.set_ylim(-1,30) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0 ,fontproperties=font) ax.set_xlim(0,max(sr.values)*1.1) plt.show() ''' ''' sr=data['user_employment'].value_counts()[:30] fig = plt.figure() ax = fig.add_subplot(111) ax.set_title("知乎公司分布",fontproperties=font) rects1 = ax.barh(range(len(sr.values)),sr.values[::-1], align = 'center',alpha=0.6,color="purple") ax.set_ylim(-1,30) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0,fontproperties=font) ax.set_xlim(0,max(sr.values)*1.1) plt.show() ''' ''' plt.plot(data[data['user_followers']>50][‘user_followers’].order(),np.arange(len(data[data[‘user_followers’]>50])),color=’blue’,alpha=0.6) plt.scatter(data[data[‘user_followers’]>50][‘user_followers’].order(),np.arange(len(data[data[‘user_followers’]>50])),color=’purple’,alpha=0.6) plt.title(“知乎被关注者数(>50)分布”,fontproperties=font) plt.show() ”’ sr=data[‘user_gender’].value_counts() fig = plt.figure() ax = fig.add_subplot(111) ax.set_title(“知乎性别分布”,fontproperties=font) rects1 = ax.barh(range(len(sr.values)),sr.values, align = ‘center’,alpha=0.4,color=”red”) ax.set_ylim(-1,2) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0 ,fontproperties=font) ax.set_xlim(0,max(sr.values)*1.1) plt.show() ”’ sr=data[((data[‘专业’]==’计算机科学’)|(data[‘专业’]==’软件工程’)|(data[‘专业’]==’计算机’)|(data[‘专业’]==’计算机科学与技术’)|(data[‘专业’]==’通信工程’)|(data[‘专业’]==’计算机科学与技术’))][‘公司’].value_counts()[:30] fig = plt.figure() ax = fig.add_subplot(111) ax.set_title(“知乎计算机相关专业公司分布”) rects1 = ax.barh(range(len(sr.values)),sr.values[::-1], align = ‘center’,alpha=0.6,color=”purple”) ax.set_ylim(-1,len(sr)) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0 ) ax.set_xlim(0,max(sr.values)*1.1) plt.show() ”’ ”’ sr=data[((data[‘专业’]==’金融’)|(data[‘专业’]==’经济学’)|(data[‘专业’]==’计算机’)|(data[‘专业’]==’金融学’)|(data[‘专业’]==’市场营销’)|(data[‘专业’]==’会计’))][‘公司’].value_counts()[:30] fig = plt.figure() ax = fig.add_subplot(111) ax.set_title(“知乎金融相关专业公司分布”) rects1 = ax.barh(range(len(sr.values)),sr.values[::-1], align = ‘center’,alpha=0.6,color=”purple”) ax.set_ylim(-1,len(sr)) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0 ) ax.set_xlim(0,max(sr.values)*1.1) plt.show() ”’ ”’ sr=data[((data[‘专业’]==’计算机科学’)|(data[‘专业’]==’软件工程’)|(data[‘专业’]==’计算机’)|(data[‘专业’]==’计算机科学与技术’)|(data[‘专业’]==’通信工程’)|(data[‘专业’]==’计算机科学与技术’))][‘所在地’].value_counts()[:30] fig = plt.figure() ax = fig.add_subplot(111) ax.set_title(“知乎计算机相关专业所在地分布”) rects1 = ax.barh(range(len(sr.values)),sr.values[::-1], align = ‘center’,alpha=0.6,color=”purple”) ax.set_ylim(-1,len(sr)) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0 ) ax.set_xlim(0,max(sr.values)*1.1) plt.show() ”’ ”’ sr=data[((data[‘专业’]==’金融’)|(data[‘专业’]==’经济学’)|(data[‘专业’]==’计算机’)|(data[‘专业’]==’金融学’)|(data[‘专业’]==’市场营销’)|(data[‘专业’]==’会计’))][‘所在地’].value_counts()[:30] fig = plt.figure() ax = fig.add_subplot(111) ax.set_title(“知乎金融相关专业所在地分布”) rects1 = ax.barh(range(len(sr.values)),sr.values[::-1], align = ‘center’,alpha=0.6,color=”purple”) ax.set_ylim(-1,len(sr)) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0 ) ax.set_xlim(0,max(sr.values)*1.1) plt.show() ”’ ”’ sr=data[data[‘所在地’]==’北京’][‘公司’].value_counts()[:30] fig = plt.figure() ax = fig.add_subplot(111) ax.set_title(“知乎北京公司分布”) rects1 = ax.barh(range(len(sr.values)),sr.values[::-1], align = ‘center’,alpha=0.6,color=”purple”) ax.set_ylim(-1,len(sr)) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0 ) ax.set_xlim(0,max(sr.values)*1.1) plt.show() ”’ ”’ sr = data[data[‘user_location’] == ‘上海’][‘user_employment_extra’].value_counts()[:30] fig = plt.figure() ax = fig.add_subplot(111) ax.set_title(“知乎上海职位分布”,fontproperties=font) rects1 = ax.barh(range(len(sr.values)), sr.values[::-1], align=’center’, alpha=0.6, color=”purple”) ax.set_ylim(-1, len(sr)) ax.set_yticks(range(len(sr.values))) ax.set_yticklabels(sr.index[::-1], rotation=0,fontproperties=font) ax.set_xlim(0, max(sr.values) * 1.1) plt.show() ”’
源代码:知乎爬虫
参考链接
知乎用户信息爬虫
知乎用户图谱