Baidu Zhidao Crawler

Tool:

pyspider

Database:

MongoDB

Approach:

  1. Suppose you want to search Baidu Zhidao for answers matching two keywords, for example "购物" (shopping) and "价格" (price).
  2. Build the crawl URL: both keywords have to be converted to URL-encoded form (GB2312 here, since the site is queried with ie=gbk); a small encoding sketch follows this list.
  3. Extract the URL of every item on the search result list page.
  4. Crawl each URL from step 3 and extract the question and its best answer from the page.
  5. Repeat steps 2 to 4 for every keyword combination and result page.

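To make step 2 concrete, here is a minimal standalone sketch of the keyword encoding. The keywords "购物" and "价格" are only examples; the spider below reads its keywords from class attributes instead.

from urllib.parse import quote, unquote

# The search endpoint expects GB2312/GBK percent-encoding (ie=gbk), not UTF-8.
kw1 = quote("购物".encode("GB2312"))
kw2 = quote("价格".encode("GB2312"))

url_format = "https://zhidao.baidu.com/search?word={}&ie=gbk&site=-1&sites=0&date=0&pn={}"
print(url_format.format(kw1 + "+" + kw2, 0))   # first page of results
print(unquote(kw2, "GB2312"))                  # decodes back to "价格"
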
Code:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from pyspider.libs.base_handler import *
from urllib.parse import quote, unquote
from pymongo import MongoClient
import datetime
import time
import random
client = MongoClient("your MongoDB connection URI")
db = client["your_database_name"]


class Handler(BaseHandler):
    crawl_config = {
    }
    
    key_word1 = quote("your first keyword".encode("GB2312"))
    key_word2_list = ["your second keyword"]
    key_word2_list = [quote(i.encode("GB2312")) for i in key_word2_list]
    url_format = "https://zhidao.baidu.com/search?word={}&ie=gbk&site=-1&sites=0&date=0&pn={}"
    page_num = 76    # maximum page index (exclusive upper bound in range())
    start_page = 0   # result page to start from
    max_random = 5   # upper bound of the random delay factor, in seconds
    
    headers1 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 
        "Accept-Encoding": "gzip, deflate, br", 
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive", 
        "Host": "zhidao.baidu.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "自定义系统的浏览器版本号",
    }
    fetch_count = 0  # number of requests issued so far, used to stagger exetime
    
    @every(minutes=24 * 60)
    def on_start(self):
        start_time = time.time()
        for kw2 in Handler.key_word2_list:
            kw = Handler.key_word1 + "+" + kw2
            for p in range(Handler.start_page, Handler.page_num):
                url = Handler.url_format.format(kw, p*10)
                Handler.headers1["Referer"] = url
                Handler.fetch_count += 1  # count this request
                # push the scheduled execution time further out for every request,
                # so successive fetches are spaced by a random interval
                start_time = start_time + Handler.fetch_count * random.randint(0, Handler.max_random)

                self.crawl(url, callback=self.index_page, headers=Handler.headers1,
                           save={"url": url, "kw2": unquote(kw2, "GB2312"), "start_time": start_time},
                           exetime=start_time)

    @config(age=24 * 60 * 60)
    def index_page(self, response):
        start_time = response.save["start_time"]
        for i in response.doc("#wgt-list > dl").items():
            url = i("dt > a").attr("href")
            Handler.headers1["Referer"] = response.save["url"]
            
            Handler.fetch_count += 1  # count this request
            start_time = start_time + Handler.fetch_count * random.randint(0, Handler.max_random)

            self.crawl(url, callback=self.detail_page, headers=Handler.headers1,
                       save={"kw2": response.save["kw2"]}, exetime=start_time)

    @config(priority=2)
    def detail_page(self, response):
        data = {}
        data["url"] = response.url
        data["question"] = response.doc("#wgt-ask > h1").text().strip()
        data["answer"] = response.doc("div.best-text").text().strip().replace("\n", "").replace("展开全部", "")
        if not data["answer"]:
            data["answer"] = response.doc("div.answer-text").text().strip().replace("\n", "").replace("展开全部", "")
        data["kw2"] = response.save["kw2"]
        data["kw1"] = unquote(Handler.key_word1, "GB2312")
        data["crawl_time"] = datetime.datetime.strptime(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "%Y-%m-%d %H:%M:%S")
        print(data)
        if db["自定义表名"].find({"kw2": data["kw2"], "question": data["question"]}).count() == 0:
            result = db["自定义表名"].insert(data)
        else:
            result = db["自定义表名"].update(data)
        print(result)
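
After a run, a quick sanity check of what landed in MongoDB can be done from a Python shell. This is a minimal sketch, assuming the same placeholder connection URI, database name, and collection name used above.

from pymongo import MongoClient

client = MongoClient("your MongoDB connection URI")
db = client["your_database_name"]

# number of answers stored for one of the secondary keywords
print(db["your_collection_name"].count_documents({"kw2": "your second keyword"}))

# peek at a few stored records
for doc in db["your_collection_name"].find().limit(3):
    print(doc["question"], "->", doc["answer"][:50])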