python 爬取原力文檔日語學習資料
參考
https://blog.csdn.net/weixin_46184311/article/details/115291441
代碼
import requests, json, re, time, urllib.request
import time
import wget
def getParameter(url):
    """Fetch a document page and extract the values the preview API needs.

    Downloads the HTML at ``url`` and pulls three fields out of the page text
    with regular expressions. The request is sent with the module-level
    ``headers`` dict, which must be defined before this function is called.

    Args:
        url: Address of the document page to read.

    Returns:
        A tuple ``(actual_page, aid, view_token)``; ``actual_page`` is the
        page count converted to ``int``, the other two are the matched
        strings.

    Raises:
        ValueError: If any of the three fields is missing from the page.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    text_response = requests.get(url=url, headers=headers, timeout=30).text

    # Raw strings: ``\d`` must reach the regex engine untouched instead of
    # being treated as an (invalid) string escape sequence.
    patterns = {
        'actual_page': r'actual_page: (\d+), //真實頁數',
        'aid': r'aid: (\d+), //解密后的id',
        'view_token': r"view_token: '(.*?)'",
    }
    found = {}
    for name, pattern in patterns.items():
        match = re.search(pattern, text_response)
        if match is None:
            # Fail with a readable message rather than an AttributeError on
            # ``None.group`` when the page layout is not what we expect.
            raise ValueError(f'{name} not found in page: {url}')
        found[name] = match.group(1)

    actual_page = int(found['actual_page'])
    aid = found['aid']
    view_token = found['view_token']
    print('actual_page:', actual_page, '\naid:', aid, '\nview_token:', view_token)
    return actual_page, aid, view_token
def requests_data(parameter, page):
    """Request preview data for ``page`` and return the decoded ``data`` field.

    Calls the ``getPreview.html`` endpoint, strips the ``jsonpReturn(...)``
    wrapper from the response text and decodes the JSON inside it. The
    request is sent with the module-level ``headers`` dict.

    Args:
        parameter: Sequence as returned by ``getParameter``; index 1 is sent
            as ``aid`` and index 2 as ``view_token``.
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The value stored under ``'data'`` in the decoded JSON payload
        (the caller iterates it with ``.items()``, so presumably a dict of
        page id -> image URL; confirm against the live API).

    Raises:
        ValueError: If the response does not contain a ``jsonpReturn(...)``
            wrapper.
    """
    url = 'https://openapi.book118.com/getPreview.html'
    params = {
        'project_id': '1',
        'aid': parameter[1],
        'view_token': parameter[2],
        'page': page,
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, params=params, timeout=30).text

    # Raw string so ``\(`` is a regex escape, not an invalid string escape.
    match = re.search(r'jsonpReturn\((.*?)\);', response)
    if match is None:
        # Show the start of the body so an unexpected reply is diagnosable.
        raise ValueError(
            f'unexpected preview response for page {page}: {response[:200]!r}'
        )
    return json.loads(match.group(1))['data']
if __name__ == '__main__':
    # Script entry point: read the document parameters, then download every
    # preview image into the ``imgs`` directory.
    import os  # only the script entry point needs it

    # NOTE: ``headers`` has to stay a module-level name because
    # getParameter() and requests_data() read it as a global.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    text_url = 'https://max.book118.com/html/2023/0208/6230222112005044.shtm'
    parameter = getParameter(text_url)
    print(parameter)

    # The download target directory must exist before files are written to it.
    os.makedirs('imgs', exist_ok=True)

    # Step of 6: presumably one preview response covers up to six pages
    # -- TODO confirm against the API.
    for page in range(1, parameter[0] + 1, 6):
        print(page)
        result = requests_data(parameter, page)
        print(result)
        for page_id, url in result.items():
            if not url:
                # Without a URL there is nothing to fetch; prepending the
                # scheme would only produce the unusable string 'https:'.
                print(f'skipping: {page_id}, no image URL returned')
                continue
            url = 'https:' + url
            print(f'downloading: {page_id}, {url} ')
            wget.download(url=url, out=f'imgs/{page_id}.png')
            # Pause between downloads to avoid hammering the server.
            time.sleep(1)

浙公網安備 33010602011771號