Improving the Scrapy novel crawler (MongoDB version)
1. Background: the original program had to crawl a novel successfully in a single run; if anything failed, the crawl had to start over from the beginning, which hurt efficiency.
2. Improvement approach
(1) Check for content that has already been crawled: if a chapter is already stored in MongoDB, do not crawl it again (see the sketch right after this list).
(2) Time the total crawl duration.
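The core of change (1) is to look a chapter url up in the novel's MongoDB collection before requesting it, and to skip the request if a document is already there. Below is a minimal sketch of that check, assuming one document per chapter with a url field as in the code that follows; the helper name already_crawled is only illustrative and not part of the original program.

from pymongo import MongoClient

conn = MongoClient('mongodb://admin:admin@localhost:27017/admin')
collection = conn.novels['sancun']    #one collection per novel, as in the pipeline below

def already_crawled(chapter_url):
    #a chapter counts as crawled if a document with this url already exists
    return collection.find_one({'url': chapter_url}) is not None

#in the spider: skip chapters that are already stored, request the rest
#    if already_crawled(url_c):
#        continue
#    yield scrapy.Request(url_c, callback=self.parse_c)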
3. Code
(1) xbiquge/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import time
from pymongo import MongoClient
class XbiqugePipeline(object):
    conn = MongoClient('mongodb://admin:admin@localhost:27017/admin')
    db = conn.novels    #connection object for the "novels" database
    name_novel = ''
    url_firstchapter = ''
    name_txt = ''
    start_time = time.time()    #recorded when the pipeline class is loaded; used to report the total crawl time

    #class initialisation
    def __init__(self):
        return

    #called when the spider starts
    def open_spider(self, spider):
        return

    def get_collection(self, name_collection):    #return the collection object for the given collection name
        myset = self.db[name_collection]
        return myset

    def process_item(self, item, spider):
        #if self.name_novel == '':
        self.name_novel = item['name']
        self.url_firstchapter = item['url_firstchapter']
        self.name_txt = item['name_txt']
        myset = self.db[self.name_novel]
        myset.insert_one(dict(item))
        # if self.name_novel != '':
        #     exec('self.db.' + self.name_novel + '.insert_one(dict(item))')
        return item
    #read the chapters of a novel from MongoDB and write them into a txt file
    def content2txt(self, dbname, firsturl, txtname):
        myset = self.db[dbname]
        record_num = myset.find().count()    #number of chapters stored for this novel
        print("Total number of chapters:", record_num)
        counts = record_num
        url_c = firsturl
        start_time = time.time()    #start time of the txt-generation step
        f = open(txtname + ".txt", mode='w', encoding='utf-8')    #open <novel name>.txt for writing
        for i in range(counts):    #iterate once per stored chapter
            #-----------alternative: use the integer returned by count() to decide whether data was found-------------
            # record_m_count = myset.find({"url": url_c}, {"content": 1, "_id": 0}).count()
            # if record_m_count == 0:
            #     print("Chapter content not found in the collection.\nOffending url:", url_c)
            #     break
            #--------------------------------------------------------------------------------
            #-----------read the cursor with next() and catch the missing-data case with try/except-----
            try:
                record_m = myset.find({"url": url_c}, {"content": 1, "_id": 0}).next()
            #except Exception as e:
            except StopIteration:
                print("Chapter content not found in the collection.\nOffending url:", url_c)
                break    #leave the for loop and stop generating the txt file
            #--------------------------------------------------------------------------------
            record_content_c2a0 = ''
            #------------alternative: read the cursor with a for loop---------------------------------
            # record_i = myset.find({"url": url_c}, {"content": 1, "_id": 0})
            # for record_m in record_i:
            #     record_content_c2a0 = record_m["content"]    #chapter content
            #---------------------------------------------------------------------------
            record_content_c2a0 = record_m["content"]
            #record_content = record_content_c2a0.replace(u'\xa0', u'')    #strip the special character \xc2\xa0
            record_content = record_content_c2a0
            #print(record_content)
            f.write('\n')
            f.write(record_content + '\n')
            f.write('\n\n')
            url_ct = myset.find({"url": url_c}, {"next_page": 1, "_id": 0})    #cursor holding the link to the next chapter
            for item_url in url_ct:
                url_c = item_url["next_page"]    #the next-chapter url becomes the lookup key for the next iteration
                #print("next page", url_c)
        f.close()
        print("Time to generate the txt file:", time.time() - start_time)
        print("Total crawl time:", time.time() - self.start_time)
        print(txtname + ".txt" + " has been generated!")
        return

    #when the spider closes, call content2txt to generate the txt file
    def close_spider(self, spider):
        if self.name_novel != '' and self.url_firstchapter != '' and self.name_txt != '':
            self.content2txt(self.name_novel, self.url_firstchapter, self.name_txt)
        return
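A compatibility note: cursor.count() and collection.remove(), used above and in the spider below, work with PyMongo 3.x but were removed in PyMongo 4.x. With a newer driver the same lookups can be written roughly as follows; this is a sketch of equivalents, not the original author's code.

# Equivalents for the deprecated calls, assuming PyMongo 3.7 or later:
record_num = myset.count_documents({})                               # instead of myset.find().count()
record_m = myset.find_one({"url": url_c}, {"content": 1, "_id": 0})  # instead of find(...).next()
if record_m is None:                                                 # replaces catching StopIteration
    print("Chapter content not found in the collection.\nOffending url:", url_c)
# and in the spider:
# novelcollection.delete_many({"next_page": url_chapters})           # instead of novelcollection.remove(...)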
(2) Example spider: xbiquge/spiders/sancun.py
# -*- coding: utf-8 -*-
import scrapy
from xbiquge.items import XbiqugeItem
from xbiquge.pipelines import XbiqugePipeline
import pdb
class SancunSpider(scrapy.Spider):
    name = 'sancun'
    allowed_domains = ['www.xbiquge.la']
    #start_urls = ['https://www.xbiquge.la/10/10489/']
    url_ori = "https://www.xbiquge.la"
    url_firstchapter = "https://www.xbiquge.la/10/10489/4534454.html"
    name_txt = "./novels/三寸人間"
    index_FS = url_firstchapter.rfind('/')    #position of the last forward slash, searching from the right
    #url_chapters = url_firstchapter[0:32]    #substring up to and including the trailing slash
    url_chapters = url_firstchapter[0:index_FS+1]    #url of the table-of-contents page, including the trailing slash

    pipeline = XbiqugePipeline()
    novelcollection = pipeline.get_collection(name)    #collection object for this novel; a MongoDB collection is the counterpart of a MySQL table
    #--------------------------------------------
    #If a record's next_page points at the table-of-contents url, delete that record; otherwise a re-crawl would
    #leave several records pointing at the contents page and the newest chapters could never be fetched.
    if novelcollection.find({"next_page": url_chapters}).count() != 0:
        print("Record containing the table-of-contents url:", novelcollection.find({"next_page": url_chapters}, {"_id": 0, "id": 1, "url": 1, "next_page": 1}).next())
        # pdb.set_trace()
        novelcollection.remove({"next_page": url_chapters})
        print("Record containing the table-of-contents url has been deleted.")
    #--------------------------------------------
    novelcounts = novelcollection.find().count()
    novelurls = novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})

    item = XbiqugeItem()
    item['id'] = novelcounts    #initialise id to the number of records already in the collection
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt
    def start_requests(self):
        start_urls = [self.url_chapters]
        print("Novel table-of-contents url:", start_urls)
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):    #extract chapter urls from the contents page and only crawl those not already in MongoDB
        f = open("/root/xbiquge_w/url_list.txt", "w")    #log file for the urls that are actually crawled
        count_bingo = 0    #number of chapters already present in the collection
        dl = response.css('#list dl dd')    #chapter link elements
        for dd in dl:
            count_iterator = 0
            self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[0]    #build the full url of a chapter
            #print("url extracted from the page:", self.url_c)
            self.novelurls = self.novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})    #re-run the query to get a fresh cursor so the for loop can traverse it from the start
            for url in self.novelurls:
                #print("url from mongodb:", url)
                if url["url"] == self.url_c:    #the url extracted from the page is already in the collection, stop searching
                    count_bingo += 1
                    count_iterator += 1
                    break
            if count_iterator != 0:    #already stored, move on to the next chapter without crawling
                continue
            #print("crawling url:", self.url_c)
            f.write("crawling url:" + self.url_c + "\n")
            #yield scrapy.Request(self.url_c, callback=self.parse_c, dont_filter=True)
            yield scrapy.Request(self.url_c, callback=self.parse_c)    #parse_c extracts the chapter url, previous/next page links and the chapter content
            #print(self.url_c)
        f.close()
        print("Number of chapters already in the collection (count_bingo):", count_bingo)
    def parse_c(self, response):
        self.item['id'] += 1
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[1]
        self.item['next_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace('\15', '\n')    #combine chapter title and body; \15 is the octal escape for ^M (carriage return) and is replaced with a newline
        yield self.item    #hand the populated item to the pipeline
        if self.item['url'][self.url_firstchapter.rfind('/')+1:self.url_firstchapter.rfind('.')] == self.item['next_page'][self.url_firstchapter.rfind('/')+1:self.url_firstchapter.rfind('.')]:    #handle chapters that are split across several pages
            self.url_c = self.item['next_page']
            yield scrapy.Request(self.url_c, callback=self.parse_c)
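Neither xbiquge/items.py nor settings.py is shown in the post. For the code above to run, the item has to declare at least the fields the spider and pipeline use, and the pipeline has to be registered. A minimal sketch follows; the field names are taken from the code above, and the pipeline priority 300 is an arbitrary choice.

# xbiquge/items.py  (sketch)
import scrapy

class XbiqugeItem(scrapy.Item):
    id = scrapy.Field()
    name = scrapy.Field()
    url_firstchapter = scrapy.Field()
    name_txt = scrapy.Field()
    url = scrapy.Field()
    preview_page = scrapy.Field()
    next_page = scrapy.Field()
    content = scrapy.Field()

# xbiquge/settings.py  (sketch) -- process_item/close_spider only run if the pipeline is registered
ITEM_PIPELINES = {
    'xbiquge.pipelines.XbiqugePipeline': 300,
}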
