Improving the Scrapy novel crawler (MongoDB version)
1. Background: the original program had to crawl a novel successfully in a single run; if anything failed, the crawl had to start over from the beginning, which hurt efficiency.
2. Improvement approach
(1) Check for content that has already been crawled: if a chapter is already stored in MongoDB, do not crawl it again (see the sketch right after this list).
(2) Time the total crawl duration.
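The core of change (1) is to look a chapter url up in the novel's MongoDB collection before requesting it, and to skip the request if a document is already there. Below is a minimal sketch of that check, assuming one document per chapter with a url field as in the code that follows; the helper name already_crawled is only illustrative and not part of the original program.

from pymongo import MongoClient

conn = MongoClient('mongodb://admin:admin@localhost:27017/admin')
collection = conn.novels['sancun']    #one collection per novel, as in the pipeline below

def already_crawled(chapter_url):
    #a chapter counts as crawled if a document with this url already exists
    return collection.find_one({'url': chapter_url}) is not None

#in the spider: skip chapters that are already stored, request the rest
#    if already_crawled(url_c):
#        continue
#    yield scrapy.Request(url_c, callback=self.parse_c)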
3. Code
(1) xbiquge/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import time
from pymongo import MongoClient
class XbiqugePipeline(object):
    conn = MongoClient('mongodb://admin:admin@localhost:27017/admin')
    db = conn.novels    #connection object for the "novels" database
    name_novel = ''
    url_firstchapter = ''
    name_txt = ''
    start_time = time.time()    #recorded when the pipeline class is loaded; used to report the total crawl time

    #class initialisation
    def __init__(self):
        return

    #called when the spider starts
    def open_spider(self, spider):
        return

    def get_collection(self, name_collection):    #return the collection object for the given collection name
        myset = self.db[name_collection]
        return myset

    def process_item(self, item, spider):
        #if self.name_novel == '':
        self.name_novel = item['name']
        self.url_firstchapter = item['url_firstchapter']
        self.name_txt = item['name_txt']
        myset = self.db[self.name_novel]
        myset.insert_one(dict(item))
        # if self.name_novel != '':
        #     exec('self.db.' + self.name_novel + '.insert_one(dict(item))')
        return item
    #read the chapters of a novel from MongoDB and write them into a txt file
    def content2txt(self, dbname, firsturl, txtname):
        myset = self.db[dbname]
        record_num = myset.find().count()    #number of chapters stored for this novel
        print("Total number of chapters:", record_num)
        counts = record_num
        url_c = firsturl
        start_time = time.time()    #start time of the txt-generation step
        f = open(txtname + ".txt", mode='w', encoding='utf-8')    #open <novel name>.txt for writing
        for i in range(counts):    #iterate once per stored chapter
            #-----------alternative: use the integer returned by count() to decide whether data was found-------------
            # record_m_count = myset.find({"url": url_c}, {"content": 1, "_id": 0}).count()
            # if record_m_count == 0:
            #     print("Chapter content not found in the collection.\nOffending url:", url_c)
            #     break
            #--------------------------------------------------------------------------------
            #-----------read the cursor with next() and catch the missing-data case with try/except-----
            try:
                record_m = myset.find({"url": url_c}, {"content": 1, "_id": 0}).next()
            #except Exception as e:
            except StopIteration:
                print("Chapter content not found in the collection.\nOffending url:", url_c)
                break    #leave the for loop and stop generating the txt file
            #--------------------------------------------------------------------------------
            record_content_c2a0 = ''
            #------------alternative: read the cursor with a for loop---------------------------------
            # record_i = myset.find({"url": url_c}, {"content": 1, "_id": 0})
            # for record_m in record_i:
            #     record_content_c2a0 = record_m["content"]    #chapter content
            #---------------------------------------------------------------------------
            record_content_c2a0 = record_m["content"]
            #record_content = record_content_c2a0.replace(u'\xa0', u'')    #strip the special character \xc2\xa0
            record_content = record_content_c2a0
            #print(record_content)
            f.write('\n')
            f.write(record_content + '\n')
            f.write('\n\n')
            url_ct = myset.find({"url": url_c}, {"next_page": 1, "_id": 0})    #cursor holding the link to the next chapter
            for item_url in url_ct:
                url_c = item_url["next_page"]    #the next-chapter url becomes the lookup key for the next iteration
                #print("next page", url_c)
        f.close()
        print("Time to generate the txt file:", time.time() - start_time)
        print("Total crawl time:", time.time() - self.start_time)
        print(txtname + ".txt" + " has been generated!")
        return

    #when the spider closes, call content2txt to generate the txt file
    def close_spider(self, spider):
        if self.name_novel != '' and self.url_firstchapter != '' and self.name_txt != '':
            self.content2txt(self.name_novel, self.url_firstchapter, self.name_txt)
        return
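A compatibility note: cursor.count() and collection.remove(), used above and in the spider below, work with PyMongo 3.x but were removed in PyMongo 4.x. With a newer driver the same lookups can be written roughly as follows; this is a sketch of equivalents, not the original author's code.

# Equivalents for the deprecated calls, assuming PyMongo 3.7 or later:
record_num = myset.count_documents({})                               # instead of myset.find().count()
record_m = myset.find_one({"url": url_c}, {"content": 1, "_id": 0})  # instead of find(...).next()
if record_m is None:                                                 # replaces catching StopIteration
    print("Chapter content not found in the collection.\nOffending url:", url_c)
# and in the spider:
# novelcollection.delete_many({"next_page": url_chapters})           # instead of novelcollection.remove(...)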
(2) Example spider: xbiquge/spiders/sancun.py
# -*- coding: utf-8 -*-
import scrapy
from xbiquge.items import XbiqugeItem
from xbiquge.pipelines import XbiqugePipeline
import pdb
class SancunSpider(scrapy.Spider):
    name = 'sancun'
    allowed_domains = ['www.xbiquge.la']
    #start_urls = ['https://www.xbiquge.la/10/10489/']
    url_ori = "https://www.xbiquge.la"
    url_firstchapter = "https://www.xbiquge.la/10/10489/4534454.html"
    name_txt = "./novels/三寸人間"
    index_FS = url_firstchapter.rfind('/')    #position of the last forward slash, searching from the right
    #url_chapters = url_firstchapter[0:32]    #substring up to and including the trailing slash
    url_chapters = url_firstchapter[0:index_FS+1]    #url of the table-of-contents page, including the trailing slash

    pipeline = XbiqugePipeline()
    novelcollection = pipeline.get_collection(name)    #collection object for this novel; a MongoDB collection is the counterpart of a MySQL table
    #--------------------------------------------
    #If a record's next_page points at the table-of-contents url, delete that record; otherwise a re-crawl would
    #leave several records pointing at the contents page and the newest chapters could never be fetched.
    if novelcollection.find({"next_page": url_chapters}).count() != 0:
        print("Record containing the table-of-contents url:", novelcollection.find({"next_page": url_chapters}, {"_id": 0, "id": 1, "url": 1, "next_page": 1}).next())
        # pdb.set_trace()
        novelcollection.remove({"next_page": url_chapters})
        print("Record containing the table-of-contents url has been deleted.")
    #--------------------------------------------
    novelcounts = novelcollection.find().count()
    novelurls = novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})

    item = XbiqugeItem()
    item['id'] = novelcounts    #initialise id to the number of records already in the collection
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt
    def start_requests(self):
        start_urls = [self.url_chapters]
        print("Novel table-of-contents url:", start_urls)
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):    #extract chapter urls from the contents page and only crawl those not already in MongoDB
        f = open("/root/xbiquge_w/url_list.txt", "w")    #log file for the urls that are actually crawled
        count_bingo = 0    #number of chapters already present in the collection
        dl = response.css('#list dl dd')    #chapter link elements
        for dd in dl:
            count_iterator = 0
            self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[0]    #build the full url of a chapter
            #print("url extracted from the page:", self.url_c)
            self.novelurls = self.novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})    #re-run the query to get a fresh cursor so the for loop can traverse it from the start
            for url in self.novelurls:
                #print("url from mongodb:", url)
                if url["url"] == self.url_c:    #the url extracted from the page is already in the collection, stop searching
                    count_bingo += 1
                    count_iterator += 1
                    break
            if count_iterator != 0:    #already stored, move on to the next chapter without crawling
                continue
            #print("crawling url:", self.url_c)
            f.write("crawling url:" + self.url_c + "\n")
            #yield scrapy.Request(self.url_c, callback=self.parse_c, dont_filter=True)
            yield scrapy.Request(self.url_c, callback=self.parse_c)    #parse_c extracts the chapter url, previous/next page links and the chapter content
            #print(self.url_c)
        f.close()
        print("Number of chapters already in the collection (count_bingo):", count_bingo)
    def parse_c(self, response):
        self.item['id'] += 1
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[1]
        self.item['next_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace('\15', '\n')    #combine chapter title and body; \15 is the octal escape for ^M (carriage return) and is replaced with a newline
        yield self.item    #hand the populated item to the pipeline
        if self.item['url'][self.url_firstchapter.rfind('/')+1:self.url_firstchapter.rfind('.')] == self.item['next_page'][self.url_firstchapter.rfind('/')+1:self.url_firstchapter.rfind('.')]:    #handle chapters that are split across several pages
            self.url_c = self.item['next_page']
            yield scrapy.Request(self.url_c, callback=self.parse_c)
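Neither xbiquge/items.py nor settings.py is shown in the post. For the code above to run, the item has to declare at least the fields the spider and pipeline use, and the pipeline has to be registered. A minimal sketch follows; the field names are taken from the code above, and the pipeline priority 300 is an arbitrary choice.

# xbiquge/items.py  (sketch)
import scrapy

class XbiqugeItem(scrapy.Item):
    id = scrapy.Field()
    name = scrapy.Field()
    url_firstchapter = scrapy.Field()
    name_txt = scrapy.Field()
    url = scrapy.Field()
    preview_page = scrapy.Field()
    next_page = scrapy.Field()
    content = scrapy.Field()

# xbiquge/settings.py  (sketch) -- process_item/close_spider only run if the pipeline is registered
ITEM_PIPELINES = {
    'xbiquge.pipelines.XbiqugePipeline': 300,
}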
