Scraping Historical Investor-Relations (调研) Announcements of Shenzhen-Listed Companies from cninfo (巨潮资讯网)

Compared with the earlier post on scraping listed companies' financial reports from cninfo, this crawl targets the "investor relations" (调研) announcements that are specific to the Shenzhen market. Starting from the basic-information table of listed companies, 2021年上市公司基本信息表.xlsx, obtained by following that earlier article, the main changes relative to the financial-report crawler are:

  1. In the data posted to http://www.cninfo.com.cn/new/hisAnnouncement/query, change tabName to relation;
  2. Since every investor-relations report is wanted, add switching across result pages (a minimal sketch follows this list);
  3. Use the listing platform returned by http://www.cninfo.com.cn/new/information/topSearch/detailOfQuery to check whether the company is listed on the Shenzhen exchange, and skip all further steps if it is not;
  4. Add the ability to download doc and docx attachments.
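
For the page switching in item 2, the query interface returns 30 announcements per page, so the number of pages follows from the total announcement count. A minimal sketch, using the same function names as the full code below:

# Minimal sketch of the page sweep: 30 announcements per page, so
# totalAnnouncement // 30 + 1 requests are enough to cover them all.
total = get_FILE_num(orgId, plate, code, company, sz_or_sh)
for page in range(total // 30 + 1):
    get_FILE(orgId, plate, code, company, sz_or_sh, page + 1)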

These changes follow from analyzing how the site's requests work; the main code is as follows:

import json
import os
from time import sleep
from urllib import parse
import requests
import xlrd
import time
 
dir = 'D:\\爬取巨潮资讯网\\'  # create this directory yourself before running

def get_adress(bank_name):    
    url = "http://www.cninfo.com.cn/new/information/topSearch/detailOfQuery"
    data = {
        'keyWord': bank_name,
        'maxSecNum': 10,
        'maxListNum': 5,
    }
    hd = {
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip,deflate',
        'Connection': 'keep-alive',
        'Content-Length': '70',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json,text/plain,*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    r = requests.post(url, headers=hd, data=data)
    r = r.content
    m = str(r, encoding="utf-8")
    pk = json.loads(m)
    orgId = pk["keyBoardList"][0]["orgId"]   # pull the parameters we need out of the response
    plate = pk["keyBoardList"][0]["plate"]
    code = pk["keyBoardList"][0]["code"]
    is_szse = plate == 'szse'
    return orgId, plate, code, is_szse
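
As a quick sanity check, the lookup can be tried on a single company first; the name below is only an example, and the returned values are whatever cninfo reports:

# Hypothetical one-off lookup; only Shenzhen-listed companies are processed further.
orgId, plate, code, is_szse = get_adress('平安银行')
print(orgId, plate, code, is_szse)
if not is_szse:
    print('not a Shenzhen-listed company, skipping')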

def download_PDF(url, file_name, company, orgId):   # download a PDF attachment
    r = requests.get(url)
    with open(dir + company + '\\' + file_name.upper() + "_" + orgId + ".pdf", "wb") as f:
        f.write(r.content)

def download_DOC(url, file_name, company, orgId):   # download a DOC attachment
    r = requests.get(url)
    with open(dir + company + '\\' + file_name.upper() + "_" + orgId + ".doc", "wb") as f:
        f.write(r.content)

def download_DOCX(url, file_name, company, orgId):   # download a DOCX attachment
    r = requests.get(url)
    with open(dir + company + '\\' + file_name.upper() + "_" + orgId + ".docx", "wb") as f:
        f.write(r.content)
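
The three downloaders differ only in the file extension, so they could be merged into a single helper keyed by the attachment type. A minimal sketch, where download_file and its ext parameter are my own additions rather than part of the original script:

# Hypothetical unified downloader; ext would be report['adjunctType'].lower(),
# i.e. 'pdf', 'doc' or 'docx'.
def download_file(url, file_name, company, orgId, ext):
    r = requests.get(url)
    with open(dir + company + '\\' + file_name.upper() + "_" + orgId + "." + ext, "wb") as f:
        f.write(r.content)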

def get_FILE_num(orgId, plate, code, company, sz_or_sh):
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    data = {
        'stock': '{},{}'.format(code, orgId),
        'tabName': 'relation',
        'pageSize': 30,
        'pageNum': 1,
        'column': plate,
        'category': '',
        'plate': sz_or_sh,
        'seDate': '',
        'searchkey': '',
        'secid': '',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }
 
    hd = {
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip,deflate',
        'Connection': 'keep-alive',
        # 'Content-Length': '216',
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json,text/plain,*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'X-Requested-With': 'XMLHttpRequest',
        # 'Cookie': cookies
    }
    data = parse.urlencode(data)
    r = requests.post(url, headers=hd, data=data)
    r = str(r.content, encoding="utf-8")
    r = json.loads(r)
    return r['totalAnnouncement']

def get_FILE(orgId, plate, code, company, sz_or_sh, pageNum):
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    data = {
        'stock': '{},{}'.format(code, orgId),
        'tabName': 'relation',
        'pageSize': 30,
        'pageNum': pageNum,
        'column': plate,
        'category': '',
        'plate': sz_or_sh,
        'seDate': '',
        'searchkey': '',
        'secid': '',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }
 
    hd = {
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip,deflate',
        'Connection': 'keep-alive',
        # 'Content-Length': '216',
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json,text/plain,*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'X-Requested-With': 'XMLHttpRequest',
        # 'Cookie': cookies
    }
    data = parse.urlencode(data)
    r = requests.post(url, headers=hd, data=data)
    r = str(r.content, encoding="utf-8")
    r = json.loads(r)
    reports_list = r['announcements']
    for report in reports_list:
        # if '投资者关系' in report['announcementTitle']:  # http://static.cninfo.com.cn/finalpage/2019-03-29/1205958883.PDF
        if report['adjunctType'] == 'PDF':
            pdf_url = "http://static.cninfo.com.cn/" + report['adjunctUrl']
            file_name = report['announcementTitle']
            download_PDF(pdf_url, file_name, company, orgId)
        if report['adjunctType'] == 'DOC':
            doc_url = "http://static.cninfo.com.cn/" + report['adjunctUrl']
            file_name = report['announcementTitle']
            download_DOC(doc_url, file_name, company, orgId)
        if report['adjunctType'] == 'DOCX':
            doc_url = "http://static.cninfo.com.cn/" + report['adjunctUrl']
            file_name = report['announcementTitle']
            download_DOCX(doc_url, file_name, company, orgId)
        sleep(2)
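
One caveat here: announcementTitle is used directly as part of the file name, but titles may contain characters that Windows does not allow in paths (such as *, ?, / or :), and since isHLtitle is 'true' they might also carry <em> highlight tags. A small hypothetical sanitizer (sanitize_title is my own addition) could be applied to file_name before the download calls:

import re

# Hypothetical helper: drop <em> highlight tags and replace characters that are
# illegal in Windows file names before using a title as a file name.
def sanitize_title(title):
    title = title.replace('<em>', '').replace('</em>', '')
    return re.sub(r'[\\/:*?"<>|]', '_', title)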

file = '2021年上市公司基本信息表.xlsx'
wb = xlrd.open_workbook(filename=file)  # open the workbook

errors = open('C:\\Users\\15617\\Desktop\\'+'异常信息_巨潮资讯网.txt', 'a', encoding='utf-8')
empties = open('C:\\Users\\15617\\Desktop\\'+'空白信息_巨潮资讯网.txt', 'a', encoding='utf-8')

sheet1 = wb.sheet_by_index(0)  # get the first sheet by index
for i in range(1577, sheet1.nrows):# sheet1.nrows
    print(i, end='\t')
    company = sheet1.row(i)[1].value.strip('*').lower()
    # company= 'st星源'
    try:
        orgId, plate, code, is_szse = get_adress(company)
        sz_or_sh = 'sh'
        if is_szse:
            sz_or_sh = 'sz'
            print(orgId, plate, code, is_szse)
            filenum = get_FILE_num(orgId, plate, code, company, sz_or_sh)
            if filenum:
                os.makedirs(dir + company, exist_ok=True)  # don't fail if the folder already exists
                for page in range(filenum//30+1):
                    print(page+1)
                    get_FILE(orgId, plate, code, company, sz_or_sh, page+1)
            else:
                empties.write(company+'\n')
    except Exception as e:
        print("exception for " + company + ": " + repr(e))
        errors.write(company + '\n')
    time.sleep(2)

errors.close()
empties.close()

And that wraps up this little section; my scraping skills have leveled up again~ 😆😆

