Scraping Shenzhen-listed companies' historical investor-research announcements from cninfo (巨潮资讯网)
Whereas an earlier post scraped listed companies' financial reports from cninfo, this run targets the "investor research" (调研) announcements that are specific to the Shenzhen market. Starting from the listed-company information table 2021年上市公司基本信息表.xlsx obtained in a previous post, the main changes relative to the financial-report scraper are:

- change `tabName` in the `data` POSTed to http://www.cninfo.com.cn/new/hisAnnouncement/query to `relation`;
- since every research announcement has to be collected, add pagination over the result pages (see the sketch after this list);
- use the listing platform returned by http://www.cninfo.com.cn/new/information/topSearch/detailOfQuery to decide whether a company is listed on the Shenzhen exchange, and skip the remaining steps if it is not;
- add support for downloading `doc` and `docx` attachments in addition to PDFs.
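As a sketch of the first two changes, the core query loop boils down to the snippet below. The stock code and orgId are placeholders, and the field set is trimmed for readability; whether the server accepts this reduced set of fields and headers is untested, so the full script that follows keeps the complete request.

```python
import json
from urllib import parse

import requests

URL = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
STOCK = '000001,gssz0000001'  # placeholder 'code,orgId'; substitute real values

page, total = 1, None
while total is None or (page - 1) * 30 < total:
    payload = parse.urlencode({
        'stock': STOCK,
        'tabName': 'relation',  # research announcements instead of financial reports
        'pageSize': 30,
        'pageNum': page,        # pagination: walk pages until the total is exhausted
        'column': 'szse',
        'plate': 'sz',
        'isHLtitle': 'true',
    })
    hd = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
    resp = json.loads(requests.post(URL, headers=hd, data=payload).content)
    total = resp['totalAnnouncement']
    # resp['announcements'] holds this page's entries (title, adjunctUrl, adjunctType, ...)
    page += 1
```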
These changes all came out of inspecting the site's network requests. The main code is as follows:
```python
import json
import os
from time import sleep
from urllib import parse

import requests
import xlrd

dir = 'D:\\爬取巨潮资讯网\\'  # output directory; create it manually before running
def get_adress(bank_name):
    """Query cninfo's top-search API for a company's orgId, listing board and stock code."""
    url = "http://www.cninfo.com.cn/new/information/topSearch/detailOfQuery"
    data = {
        'keyWord': bank_name,
        'maxSecNum': 10,
        'maxListNum': 5,
    }
    hd = {
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip,deflate',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json,text/plain,*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    r = requests.post(url, headers=hd, data=data)
    pk = json.loads(str(r.content, encoding="utf-8"))
    orgId = pk["keyBoardList"][0]["orgId"]  # parameters reused by the announcement query
    plate = pk["keyBoardList"][0]["plate"]
    code = pk["keyBoardList"][0]["code"]
    is_szse = plate == 'szse'               # True if the company is listed in Shenzhen
    return orgId, plate, code, is_szse
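
# Illustrative shape of the response (field values here are made up; only the
# keys read above are confirmed by the code):
#   {"keyBoardList": [{"orgId": "gssz0000001", "plate": "szse",
#                      "code": "000001", ...}, ...]}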
def download_PDF(url, file_name, company, orgId):   # download a PDF attachment
    r = requests.get(url)
    with open(dir + company + '\\' + file_name.upper() + "_" + orgId + ".pdf", "wb") as f:
        f.write(r.content)

def download_DOC(url, file_name, company, orgId):   # download a DOC attachment
    r = requests.get(url)
    with open(dir + company + '\\' + file_name.upper() + "_" + orgId + ".doc", "wb") as f:
        f.write(r.content)

def download_DOCX(url, file_name, company, orgId):  # download a DOCX attachment
    r = requests.get(url)
    with open(dir + company + '\\' + file_name.upper() + "_" + orgId + ".docx", "wb") as f:
        f.write(r.content)
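
# The three functions above differ only in the extension they write. A
# consolidated helper (hypothetical; not called by the original script)
# could take the extension as a parameter instead:
def download_file(url, file_name, company, orgId, ext):
    r = requests.get(url)
    path = dir + company + '\\' + file_name.upper() + "_" + orgId + "." + ext.lower()
    with open(path, "wb") as f:
        f.write(r.content)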
def get_FILE_num(orgId, plate, code, company, sz_or_sh):
    """Return the total number of investor-research announcements for one company."""
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    data = {
        'stock': '{},{}'.format(code, orgId),
        'tabName': 'relation',   # 'relation' selects the investor-research tab
        'pageSize': 30,
        'pageNum': 1,
        'column': plate,
        'category': '',
        'plate': sz_or_sh,
        'seDate': '',
        'searchkey': '',
        'secid': '',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }
    hd = {
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip,deflate',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json,text/plain,*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = parse.urlencode(data)
    r = requests.post(url, headers=hd, data=data)
    r = json.loads(str(r.content, encoding="utf-8"))
    return r['totalAnnouncement']
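
# Worked example: totalAnnouncement = 95 with pageSize = 30 spans
# ceil(95 / 30) = 4 pages, which matches range(95 // 30 + 1) -> pageNum 1..4
# in the driver loop below. When the total is an exact multiple of 30, the
# last requested page comes back empty, hence the guard in get_FILE.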
def get_FILE(orgId, plate, code, company, sz_or_sh, pageNum):
    """Download every PDF/DOC/DOCX attachment on one result page."""
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    data = {
        'stock': '{},{}'.format(code, orgId),
        'tabName': 'relation',
        'pageSize': 30,
        'pageNum': pageNum,
        'column': plate,
        'category': '',
        'plate': sz_or_sh,
        'seDate': '',
        'searchkey': '',
        'secid': '',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }
    hd = {
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip,deflate',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json,text/plain,*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = parse.urlencode(data)
    r = requests.post(url, headers=hd, data=data)
    r = json.loads(str(r.content, encoding="utf-8"))
    reports_list = r['announcements']
    if not reports_list:  # the last page can be empty when total % 30 == 0
        return
    for report in reports_list:
        # optionally filter by title, e.g. if '投资者关系' in report['announcementTitle']
        # attachment URL pattern: http://static.cninfo.com.cn/finalpage/2019-03-29/1205958883.PDF
        file_url = "http://static.cninfo.com.cn/" + report['adjunctUrl']
        file_name = report['announcementTitle']
        if report['adjunctType'] == 'PDF':
            download_PDF(file_url, file_name, company, orgId)
        elif report['adjunctType'] == 'DOC':
            download_DOC(file_url, file_name, company, orgId)
        elif report['adjunctType'] == 'DOCX':
            download_DOCX(file_url, file_name, company, orgId)
    sleep(2)  # throttle between pages
file = '2021年上市公司基本信息表.xlsx'
wb = xlrd.open_workbook(filename=file)  # spreadsheet of listed companies
sheet1 = wb.sheet_by_index(0)           # first worksheet
errors = open('C:\\Users\\15617\\Desktop\\' + '异常信息_巨潮资讯网.txt', 'a', encoding='utf-8')
empties = open('C:\\Users\\15617\\Desktop\\' + '空白信息_巨潮资讯网.txt', 'a', encoding='utf-8')
for i in range(1577, sheet1.nrows):     # start row chosen to resume an interrupted run
    print(i, end='\t')
    company = sheet1.row(i)[1].value.strip('*').lower()
    # company = 'st星源'
    try:
        orgId, plate, code, is_szse = get_adress(company)
        sz_or_sh = 'sz' if is_szse else 'sh'
        print(orgId, plate, code, is_szse)
        filenum = get_FILE_num(orgId, plate, code, company, sz_or_sh)
        if filenum:
            os.makedirs(dir + company, exist_ok=True)  # tolerate re-runs
            for page in range(filenum // 30 + 1):
                print(page + 1)
                get_FILE(orgId, plate, code, company, sz_or_sh, page + 1)
        else:
            empties.write(company + '\n')   # companies with no research announcements
    except Exception:
        print("Exception for: " + company)
        errors.write(company + '\n')        # failed companies, to retry later
    sleep(2)
errors.close()
empties.close()
```
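Before launching the full loop, it is worth smoke-testing the two query helpers on a single company. A minimal check (平安银行 is just an example of a Shenzhen-listed name; any row from the sheet works):

```python
orgId, plate, code, is_szse = get_adress('平安银行')
print(orgId, plate, code, is_szse)
print(get_FILE_num(orgId, plate, code, '平安银行', 'sz' if is_szse else 'sh'))
```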
That's it for this little section; the scraping skills have leveled up again~ 😆😆