import json
import os.path
import time
from jsonpath import *
# import jsonpath as jsonpath
import pandas as pd
import requests
# url = "http://www.whggzy.com/front/search/category"
def get_resp(url, name, i, timeout=10):
    """POST one page of a category search and return the decoded JSON body.

    Args:
        url: Search endpoint to POST to.
        name: Value sent as ``categoryCode`` in the request body.
        i: Page number; it is sent as a string in ``pageNo``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    headers = {
        "Referer": "http://www.whggzy.com/PoliciesAndRegulations/index.html?utm=sites_group_front.26a79a93.0.0.715108e02e0e11ee837be5c5ca3fd993",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept": "*/*",
        "Content-Type": "application/json",
        "X-Requested-With": "XMLHttpRequest",
    }
    payload = {
        "utm": "sites_group_front.26a79a93.0.0.715108e02e0e11ee837be5c5ca3fd993",
        "categoryCode": f"{name}",
        "pageSize": 15,
        "pageNo": f"{i}",
    }
    # Passing the dict via ``json=`` makes requests serialise it as the JSON
    # request body.
    resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    resp.raise_for_status()
    return resp.json()
def save_json(content):
    """Write *content* as JSON to ``wh_data.json`` in the working directory.

    The file is opened in ``'w'`` mode, so each call replaces the previous
    snapshot. Non-ASCII text (e.g. Chinese titles) is written verbatim as
    UTF-8 instead of being turned into ASCII escape sequences, which keeps
    the snapshot human-readable; the parsed content is the same either way.

    Args:
        content: Any JSON-serialisable object.

    Raises:
        TypeError: If *content* contains a value ``json`` cannot serialise.
    """
    # Serialise before opening the file so that a serialisation error cannot
    # truncate an existing snapshot.
    text = json.dumps(content, ensure_ascii=False)
    with open("wh_data.json", 'w', encoding="utf-8") as fh:
        fh.write(text)
def _first_match(node, expr):
    """Return the first value *expr* selects in *node*, or None if no match."""
    found = jsonpath(node, expr)
    return found[0] if found else None


def get_data(data_list, csv_path, i):
    """Extract the fields of each search hit and append them to the CSV.

    For every record, one row ``[pathName, title, date, attachmentUrl, url]``
    is appended through ``save_csv``. A field that is absent from the record
    is written as None rather than aborting the page.

    Args:
        data_list: Iterable of hit records (nested dict/list structures).
        csv_path: Path of the CSV file rows are appended to.
        i: Page number, used only in the progress message.
    """
    base_url = 'http://www.whggzy.com/'
    pathName = ''
    for data in data_list:
        pathName = _first_match(data, '$..pathName')
        title = _first_match(data, '$..title')
        publishDate = _first_match(data, '$..publishDate')
        # publishDate is divided by 1000 before formatting, i.e. it is
        # treated as epoch milliseconds. A record without one previously
        # raised TypeError on ``None / 1000``; leave the date empty instead.
        date = None
        if publishDate is not None:
            date = time.strftime('%Y-%m-%d', time.localtime(publishDate / 1000))
        attachmentUrl = _first_match(data, '$..attachmentUrl')
        relative_url = _first_match(data, '$..url')
        # Only build the absolute address when the record carries a url.
        url = base_url + relative_url if relative_url is not None else None
        csv_list = [pathName, title, date, attachmentUrl, url]
        save_csv(csv_list, csv_path)
    # Reports the pathName of the last record processed ('' if the page
    # contained no records).
    print(f'政策法規-->>{pathName}-->> 第{i}頁下爬取完畢 !!!')
def judge_csv_file():
    """Return the path of ``wh_data.csv`` beside this script, creating it if needed.

    The path is printed on every call. When the file does not exist yet it is
    created containing only the header row; an existing file is left as is.

    Returns:
        Absolute path of the CSV file.
    """
    # Directory that holds this script file.
    script_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
    csv_path = os.path.join(script_dir, 'wh_data.csv')
    print(csv_path)

    if os.path.exists(csv_path):
        return csv_path

    # First run: start the file with the column headings.
    columns = ['項目', '標題', '日期', '附件網址', '內容地址']
    header_line = ",".join(columns) + '\n'
    with open(csv_path, 'w', encoding="utf-8") as out:
        out.write(header_line)
    return csv_path
def save_csv(data_list, csv_path):
    """Append *data_list* as one row to the CSV file at *csv_path*.

    Args:
        data_list: Sequence of cell values for a single row.
        csv_path: Path of the CSV file to append to.
    """
    # Wrapping the values in an outer list yields a one-row frame.
    row_frame = pd.DataFrame([data_list])
    # mode='a' appends; the index column and header line are both omitted so
    # only the raw row is written.
    row_frame.to_csv(
        csv_path,
        mode='a',
        header=False,
        index=False,
        encoding='utf-8',
    )
def run(url):
    """Crawl every category page by page and append the hits to the CSV.

    For each category code, pages are requested starting at 1 until a page
    comes back with an empty hit list. Each response is also passed to
    ``save_json``, and each non-empty page is passed to ``get_data``.

    Args:
        url: Search endpoint passed through to ``get_resp``.
    """
    csv_path = judge_csv_file()
    categories = [
        "GovernmentProcurement",
        "BidAndEngineerConstruction",
        "LandAndMineralRightsTransaction",
        "TransactionOfPropertyRights",
        "TransactionOfPublicResources",
    ]
    for category in categories:
        page = 1
        while True:
            response = get_resp(url, category, page)
            save_json(response)
            hits = response['hits']['hits']
            if not hits:
                # An empty page marks the end of this category.
                break
            get_data(hits, csv_path, page)
            page += 1
if __name__ == '__main__':
    # Script entry point: crawl the category search endpoint.
    run("http://www.whggzy.com/front/search/category")