Assignment 1
(1) Requirements
- Requirement: pick a website and crawl all of its images, for example the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl in both a single-threaded and a multi-threaded way. Be sure to cap the crawl, e.g. the total number of pages (last two digits of the student ID) and the total number of downloaded images (last three digits of the student ID).
- Output: print the downloaded URL information to the console, store the downloaded images in an images subfolder, and provide screenshots.
- Gitee folder link
(2) Code and Screenshots
- Multi-threaded
import scrapy
from ..items import Work1Item
from concurrent.futures import ThreadPoolExecutor

class MySpider(scrapy.Spider):
    # Spider name; this is the value used when running the crawler
    name = 'MySpider'
    start_urls = []
    for i in range(1, 3):
        url = f"https://www.amazon.cn/s?k=%E4%B9%A6%E5%8C%85&page={i}&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&crid=1RAID9NTPCARM&qid=1698238172&sprefix=%E4%B9%A6%E5%8C%85%2Caps%2C154&ref=sr_pg_{i}"
        start_urls.append(url)

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        # Thread pool used to hand downloads off to worker threads
        self.executor = ThreadPoolExecutor(max_workers=4)

    def parse(self, response):
        # Extract every image URL on the page
        src = response.xpath('//img/@src').extract()
        img = Work1Item(src=src)
        yield img

    def process_request(self, request, spider):
        # Submit the download to the thread pool so requests are sent asynchronously
        self.executor.submit(spider.crawler.engine.download, request, spider)
- Single-threaded
import scrapy
from ..items import Work1Item

class MySpider(scrapy.Spider):
    # Spider name
    name = 'MySpider'
    start_urls = []
    # Build start_urls
    for i in range(1, 3):
        url = f"https://www.amazon.cn/s?k=%E4%B9%A6%E5%8C%85&page={i}&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&crid=1RAID9NTPCARM&qid=1698238172&sprefix=%E4%B9%A6%E5%8C%85%2Caps%2C154&ref=sr_pg_{i}"
        start_urls.append(url)

    def parse(self, response):
        # Extract the image URLs
        src = response.xpath('//img/@src').extract()
        img = Work1Item(src=src)
        yield img
    # No process_request method is needed here
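The items.py and pipelines.py that both spiders import are not shown above. Below is a minimal sketch that is consistent with the spider code: the field name src matches what the spiders populate, while the pipeline class name, the cap of 123 images, and the urllib-based download are illustrative assumptions rather than the original implementation.

```python
# items.py (sketch)
import scrapy

class Work1Item(scrapy.Item):
    src = scrapy.Field()   # list of image URLs extracted by the spider


# pipelines.py (sketch) -- prints each URL and saves the file into an images/ subfolder
import os
import urllib.request

class Work1Pipeline:
    count = 0
    MAX_IMAGES = 123   # placeholder cap (e.g. last three digits of the student ID)

    def process_item(self, item, spider):
        os.makedirs("images", exist_ok=True)
        for url in item["src"]:
            if self.count >= self.MAX_IMAGES:
                break
            if url.startswith("//"):          # protocol-relative URL
                url = "https:" + url
            if not url.startswith("http"):
                continue
            print(url)                        # echo the downloaded URL to the console
            path = os.path.join("images", f"{self.count}.jpg")
            try:
                urllib.request.urlretrieve(url, path)
                self.count += 1
            except Exception as err:
                spider.logger.warning(f"download failed for {url}: {err}")
        return item
```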





(3) Reflections
Building the MySpider class gave me a much clearer picture of Scrapy's core components and how the framework runs. The name attribute defines the spider's unique identifier, and constructing the start_urls list taught me how to determine the initial set of pages to crawl, which is the starting point of the whole crawler. The parse method is where the core logic of data extraction and link following lives: using XPath expressions to extract the image URLs and next-page links precisely from the page's HTML structure greatly improved my command of web data extraction, and made me appreciate how powerful and convenient XPath is for handling structured web pages.
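The caps on page count and image count required by the assignment, as well as the degree of download concurrency, are normally expressed in the project's settings.py rather than in the spider itself. A possible sketch is shown below; all names and values are placeholders (a project called work1 with the hypothetical Work1Pipeline sketched earlier), not the settings actually used.

```python
# settings.py (sketch) -- names and values are placeholders
BOT_NAME = "work1"

# Scrapy's built-in download concurrency (an alternative to the ThreadPoolExecutor approach)
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8

# CloseSpider extension: stop after a fixed number of pages / items,
# e.g. the last digits of the student ID as the assignment requires
CLOSESPIDER_PAGECOUNT = 23
CLOSESPIDER_ITEMCOUNT = 123

# Register the (hypothetical) pipeline that prints URLs and saves images
ITEM_PIPELINES = {
    "work1.pipelines.Work1Pipeline": 300,
}
```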
Assignment 2
(1) Requirements
- Requirement: become proficient with serializing and outputting data through Scrapy's Item and Pipeline classes; crawl stock information using the Scrapy framework + XPath + MySQL storage.
- Candidate sites: Eastmoney: https://www.eastmoney.com/ ; Sina Finance: http://finance.sina.com.cn/stock/
- Output:
The MySQL storage and output format are as follows.
English column names, e.g. id for the serial number, bStockNo for the stock code, and so on, are to be designed by the students themselves.
| No. | Stock Code | Stock Name | Latest Price | Change % | Change | Volume | Amplitude | High | Low | Open | Prev. Close |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 688093 | N世華 | 28.47 | +10.92% | +2.99 | 760M | 22.34% | 32.00 | 28.08 | 30.20 | 17.55 |
(2) Code and Screenshots
Main code
- 爬起股票.py
import sqlite3
import requests
import re

def getHtml(url):
    # Request headers that mimic a real browser (User-Agent + Cookie)
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47",
        "Cookie": "qgqp_b_id=4a3c0dd089eb5ffa967fcab7704d27cd; st_si=19699330068294; st_asi=delete; st_pvi=76265126887030; st_sp=2021-12-18%2022%3A56%3A16; st_inirUrl=https%3A%2F%2Fcn.bing.com%2F; st_sn=2; st_psi=20231007141245108-113200301321-7681547675"}
    resp = requests.get(url, headers=header)
    html = resp.text
    return html

def getContent(html):
    # Pull the array that sits inside "diff":[ ... ] out of the JSONP response
    stocks = re.findall(r"\"diff\":\[(.*?)]", html)
    stocks = list(eval(stocks[0]))
    num = 0
    result = []
    for stock in stocks:
        num += 1
        daima = stock["f12"]            # stock code
        name = stock["f14"]             # stock name
        newprice = stock["f2"]          # latest price
        diefu = stock["f3"]             # change (%)
        dieer = stock["f4"]             # change (amount)
        chengjiaoliang = stock["f5"]    # volume
        chengjiaoer = stock["f6"]       # turnover
        zhenfu = stock["f7"]            # amplitude
        max = stock["f15"]              # daily high
        min = stock["f16"]              # daily low
        today = stock["f17"]            # today's open
        yesterday = stock["f18"]        # previous close
        result.append([num, daima, name, newprice, diefu, dieer, chengjiaoliang, chengjiaoer, zhenfu, max, min, today, yesterday])
    return result

class stockDB:
    def openDB(self):
        self.con = sqlite3.connect("stocks.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table stocks (Num varchar(16), stockCode varchar(16),stockName varchar(16),Newprice varchar(16),RiseFallpercent varchar(16),RiseFall varchar(16),Turnover varchar(16),Dealnum varchar(16),Amplitude varchar(16),max varchar(16),min varchar(16),today varchar(16),yesterday varchar(16))")
        except:
            self.cursor.execute("delete from stocks")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, Num, stockcode, stockname, newprice, risefallpercent, risefall, turnover, dealnum, Amplitude, max, min, today, yesterday):
        try:
            self.cursor.execute("insert into stocks(Num,stockCode,stockName,Newprice,RiseFallpercent,RiseFall,Turnover,Dealnum,Amplitude,max,min,today,yesterday) values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                                (Num, stockcode, stockname, newprice, risefallpercent, risefall, turnover, dealnum, Amplitude, max, min, today, yesterday))
        except Exception as err:
            print(err)

# Console column layout; chr(12288) is a full-width space used as the padding character
s = "{0:}\t{1:{13}^8}\t{2:{13}^10}\t{3:{13}^10}\t{4:{13}^10}\t{5:{13}^10}\t{6:{13}^10}\t{7:{13}^10}\t{8:{13}^10}\t{9:{13}^10}\t{10:{13}^10}\t{11:{13}^10}\t{12:{13}^10}"
print(s.format("序號","股票代碼","股票名稱","最新價(jià)","漲跌幅","漲跌額","成交量","成交額","振幅","最高","最低","今開","昨收",chr(12288)))

stockdb = stockDB()   # create the database helper
stockdb.openDB()      # open the database

for page in range(1, 3):
    url = "http://45.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124030395806868839914_1696659472380&pn=" + str(page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696659472381"
    html = getHtml(url)
    stocks = getContent(html)
    for stock in stocks:
        print(s.format(stock[0], stock[1], stock[2], stock[3], stock[4], stock[5], stock[6], stock[7], stock[8], stock[9], stock[10], stock[11], stock[12], chr(12288)))
        # store the row in the database
        stockdb.insert(stock[0], stock[1], stock[2], stock[3], stock[4], stock[5], stock[6], stock[7], stock[8], stock[9], stock[10], stock[11], stock[12])

stockdb.closeDB()
- run.py
from scrapy import cmdline
cmdline.execute("scrapy crawl MySpider -s LOG_ENABLED=True".split())
Output


(3) Reflections
I used the requests library for the HTTP requests; by setting a suitable User-Agent and Cookie I could reliably fetch the HTML of the target pages. This taught me how to imitate browser behaviour and get past simple anti-crawling checks so that the data can be retrieved stably. I then used the re module to extract the key stock fields from the response text: a precise pattern such as "diff":[(.*?)] efficiently isolates the stock list from the returned data, which is then turned into a structured format. This exercised both my command of regular expressions and my ability to parse complex text data.
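As a small illustration of the extraction step described above, the snippet below runs the same pattern against a shortened, made-up sample of the JSONP-style response (only the fields used in the output table are included):

```python
import re

# Shortened, invented sample of the quote API's JSONP response
sample = 'jQuery123({"data":{"diff":[{"f12":"688093","f14":"N世華","f2":28.47}]}});'

# Same pattern as in the crawler: capture everything inside "diff":[ ... ]
payload = re.findall(r'"diff":\[(.*?)\]', sample)[0]

# The crawler evaluates the captured text into Python objects;
# json.loads on the full JSON body would be a safer alternative to eval
stock = eval(payload)
print(stock["f12"], stock["f14"], stock["f2"])   # 688093 N世華 28.47
```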
Assignment 3
(1) Requirements
- Requirement: become proficient with serializing and outputting data through Scrapy's Item and Pipeline classes; crawl foreign-exchange data using the Scrapy framework + XPath + MySQL storage.
- Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/
- Output:

(2) Code and Screenshots
Main code
- MySpider.py
import scrapy
from work3.items import Work3Item
from work3.pipelines import Work3Pipeline

class MySpider(scrapy.Spider):
    name = 'MySpider'
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        waihuidb = Work3Pipeline()   # create the pipeline / database helper
        waihuidb.openDB(MySpider)    # open the database
        # Every table row except the header
        items = response.xpath('//tr[position()>1]')
        for i in items:
            item = Work3Item()
            item['Currency'] = i.xpath('.//td[1]/text()').get()
            item['TBP'] = i.xpath('.//td[2]/text()').get()
            item['CBP'] = i.xpath('.//td[3]/text()').get()
            item['TSP'] = i.xpath('.//td[4]/text()').get()
            item['CSP'] = i.xpath('.//td[5]/text()').get()
            item['Time'] = i.xpath('.//td[8]/text()').get()
            print(item)
            waihuidb.process_item(item, MySpider)   # write the row to the database
            yield item
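The work3.items and work3.pipelines modules imported above are not listed. Below is a minimal sketch consistent with the calls made in parse (openDB, process_item, and the six fields). SQLite is used here for brevity, in the same style as the stockDB class from Assignment 2, whereas the assignment itself asks for MySQL, so treat the storage layer as an assumption.

```python
# items.py (sketch) -- field names match those assigned in MySpider.parse
import scrapy

class Work3Item(scrapy.Item):
    Currency = scrapy.Field()
    TBP = scrapy.Field()   # buying-price columns from the BOC table
    CBP = scrapy.Field()
    TSP = scrapy.Field()   # selling-price columns
    CSP = scrapy.Field()
    Time = scrapy.Field()


# pipelines.py (sketch) -- exposes the openDB/process_item methods the spider calls
import sqlite3

class Work3Pipeline:
    def openDB(self, spider):
        self.con = sqlite3.connect("waihui.db")
        self.cursor = self.con.cursor()
        self.cursor.execute(
            "create table if not exists waihui "
            "(Currency text, TBP text, CBP text, TSP text, CSP text, Time text)")

    def process_item(self, item, spider):
        self.cursor.execute(
            "insert into waihui values (?,?,?,?,?,?)",
            (item["Currency"], item["TBP"], item["CBP"],
             item["TSP"], item["CSP"], item["Time"]))
        self.con.commit()
        return item
```

Registering the pipeline in settings.py (ITEM_PIPELINES = {"work3.pipelines.Work3Pipeline": 300}) would let Scrapy call process_item on every yielded item automatically, instead of instantiating the pipeline inside parse as the spider above does.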
- middlewares.py
from scrapy import signals
from itemadapter import is_item, ItemAdapter

class Work3SpiderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

class Work3DownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
Screenshots


(3) Reflections
In the MySpider class I defined the spider's name and the start_urls list, fixing the starting point of the crawl. In the parse method I used XPath expressions to parse the target page (the Bank of China exchange-rate table), locating each data row and extracting the currency (Currency), the various buying and selling prices (TBP, CBP, TSP, CSP) and the quote time (Time), then packing them into a Work3Item object. This deepened my understanding of XPath's strengths for web data extraction and made me more fluent with it: I can now adjust the extraction strategy quickly when the page structure changes, which strengthens my ability to parse different kinds of pages.