Baidu Zhidao Crawler
Tool:
pyspider
Database:
MongoDB
Approach:
1. Suppose you want to search Baidu Zhidao answers by two keywords, e.g. "购物" (shopping) and "价格" (price).
2. Build the crawl URL: both keywords have to be converted to URL-encoded form (Baidu Zhidao's search expects GB2312 encoding); see a URL-encoding tutorial for details, and the short sketch after this list.
3. Extract the URLs of all result items on the search results page.
4. Crawl each URL collected in step 3 and extract the question and best answer from the page.
5. Repeat steps 2–4 until all pages are covered.
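A minimal sketch of the URL-encoding in step 2, assuming the example keywords above; the URL template and query parameters are the same ones used in the full script below:

from urllib.parse import quote, unquote

# Baidu Zhidao's search endpoint expects GB2312-encoded keywords (ie=gbk),
# so encode each keyword to GB2312 bytes first, then percent-encode.
kw1 = quote("购物".encode("GB2312"))
kw2 = quote("价格".encode("GB2312"))

# Two keywords are joined with "+"; pn is the result offset (10 results per page).
url = "https://zhidao.baidu.com/search?word={}&ie=gbk&site=-1&sites=0&date=0&pn={}".format(
    kw1 + "+" + kw2, 0)
print(url)
print(unquote(kw1, "GB2312"))  # decodes back to "购物"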
Code:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from pyspider.libs.base_handler import *
from urllib.parse import quote, unquote
from pymongo import MongoClient
import datetime
import time
import random

client = MongoClient("your MongoDB URI")  # e.g. "mongodb://localhost:27017/"
db = client["your_db_name"]


class Handler(BaseHandler):
    crawl_config = {
    }

    key_word1 = quote("your keyword 1".encode("GB2312"))
    key_word2_list = ["your keyword 2"]
    key_word2_list = [quote(i.encode("GB2312")) for i in key_word2_list]
    url_format = "https://zhidao.baidu.com/search?word={}&ie=gbk&site=-1&sites=0&date=0&pn={}"
    page_num = 76    # maximum page number
    start_page = 0   # starting page number
    max_random = 5   # upper bound of the random delay factor
    headers1 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Host": "zhidao.baidu.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "your browser's User-Agent string",
    }
    fetch_count = 0  # number of requests issued so far

    @every(minutes=24 * 60)
    def on_start(self):
        start_time = time.time()
        for kw2 in Handler.key_word2_list:
            kw = Handler.key_word1 + "+" + kw2
            for p in range(Handler.start_page, Handler.page_num):
                url = Handler.url_format.format(kw, p * 10)
                Handler.headers1["Referer"] = url
                Handler.fetch_count += 1  # accumulate the request count
                # push each request further into the future so fetches are spread out
                start_time = start_time + Handler.fetch_count * random.randint(0, Handler.max_random)
                self.crawl(url, callback=self.index_page, headers=Handler.headers1,
                           save={"url": url, "kw2": unquote(kw2, "GB2312"), "start_time": start_time},
                           exetime=start_time)

    @config(age=24 * 60 * 60)
    def index_page(self, response):
        start_time = response.save["start_time"]
        for i in response.doc("#wgt-list > dl").items():
            url = i("dt > a").attr("href")
            Handler.headers1["Referer"] = response.save["url"]
            Handler.fetch_count += 1  # accumulate the request count
            start_time = start_time + Handler.fetch_count * random.randint(0, Handler.max_random)
            self.crawl(url, callback=self.detail_page, headers=Handler.headers1,
                       save={"kw2": response.save["kw2"]}, exetime=start_time)

    @config(priority=2)
    def detail_page(self, response):
        data = {}
        data["url"] = response.url
        data["question"] = response.doc("#wgt-ask > h1").text().strip()
        # best answer first; strip line breaks and the "展开全部" ("show all") button text
        data["answer"] = response.doc("div.best-text").text().strip().replace("\n", "").replace("展开全部", "")
        if not data["answer"]:
            # fall back to an ordinary answer when there is no best answer
            data["answer"] = response.doc("div.answer-text").text().strip().replace("\n", "").replace("展开全部", "")
        data["kw2"] = response.save["kw2"]
        data["kw1"] = unquote(Handler.key_word1, "GB2312")
        data["crawl_time"] = datetime.datetime.now().replace(microsecond=0)  # crawl timestamp, truncated to seconds
        print(data)
        # deduplicate on (kw2, question): insert new records, otherwise update the existing one
        if db["your_collection"].count_documents({"kw2": data["kw2"], "question": data["question"]}) == 0:
            result = db["your_collection"].insert_one(data)
        else:
            result = db["your_collection"].update_one(
                {"kw2": data["kw2"], "question": data["question"]}, {"$set": data})
        print(result)
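To run it, start pyspider (the web UI listens on http://localhost:5000 by default), create a new project, paste the script in, and switch the project status to RUNNING. Once results start arriving, a quick way to inspect what landed in MongoDB; this sketch reuses the placeholder database/collection names from the script above and assumes "价格" was one of your keyword-2 values:

from pymongo import MongoClient

client = MongoClient("your MongoDB URI")              # same placeholder URI as in the spider
collection = client["your_db_name"]["your_collection"]

print(collection.count_documents({}))                 # total stored Q&A documents
for doc in collection.find({"kw2": "价格"}).limit(5):  # sample a few answers for one keyword
    print(doc["question"], "->", doc["answer"][:50])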