处理 PDF、doc、docx 文件:截取文本并导出为 Excel
本节对上一节爬取到的深市企业调研公告进行处理:从包含表格的文件中截取文本,按句号切分成句子,再写入 Excel。主要工作是先把 doc 文件转换为 docx 再处理,并针对不同格式(PDF、docx)的文件采用不同的读取方法,整体是一些比较繁琐的操作。
主要代码如下:
import pdfplumber
import pandas as pd
import os
import xlwt
from docx import Document
from win32com.client import Dispatch
# Root input directory; the driver loop at the bottom of the file lists its
# entries and treats each one as a per-company folder.
inputdir = 'D:\\爬取巨潮资讯网\\' # this directory must be created and specified by the user
# Root directory under which one result sub-directory per company is written.
outputdir = 'D:\\爬取结果整理\\'
# Error log, opened in append mode as UTF-8. The paths of files that could not
# be read or converted are written to it, one per line.
errors = open('C:\\Users\\15617\\Desktop\\'+'异常信息_excel处理.txt', 'a', encoding='utf-8')
def content_process(content):
    """Return the text that follows the first colon or question mark.

    Colons take precedence over question marks. If *content* contains an
    ASCII colon or a full-width colon (U+FF1A), everything after the
    earliest one is returned. Otherwise the same rule is applied to the
    ASCII and full-width (U+FF1F) question marks. Text containing none of
    these characters is returned unchanged.

    Only the first delimiter is consumed, so later ones stay in the result
    (e.g. the colon inside a time such as ``10:30`` is preserved).

    Args:
        content: One sentence of extracted text.

    Returns:
        The part of *content* after the leading label/question, or
        *content* itself when no delimiter is present.
    """
    # Each group holds the ASCII character and its full-width counterpart;
    # escapes are used so the full-width forms survive re-encoding.
    for delimiters in (':\uff1a', '?\uff1f'):
        positions = [content.find(mark) for mark in delimiters if mark in content]
        if positions:
            return content[min(positions) + 1:]
    return content
def pdf_read(filename, excelname):
    """Extract the investor-relations record from a PDF into an .xls sheet.

    The first two columns of every table on every page are concatenated.
    If the first column contains the heading '投资者关系活动类别', the text
    of the second column is stripped of spaces and newlines, split on the
    Chinese full stop '。', passed through ``content_process`` and written
    one sentence per row to column A of 'Sheet1'.

    Args:
        filename: Path of the PDF file to read.
        excelname: Path of the .xls workbook to create.

    Returns:
        None. No workbook is written when the PDF cannot be parsed (its
        path is appended to the module-level ``errors`` log instead) or
        when the expected heading is missing.
    """
    labels = []
    values = []
    try:
        # The context manager closes the PDF on every path, including the
        # early returns below.
        with pdfplumber.open(filename) as pdf:
            for page in pdf.pages:
                for rows in page.extract_tables():
                    for row in rows:
                        if not row:
                            continue
                        # Empty cells come back as None, and a row may have
                        # fewer than two columns; neither should abort the
                        # whole file.
                        labels.append(row[0] or '')
                        if len(row) > 1:
                            values.append(row[1] or '')
    except Exception:
        # Best effort: record the unreadable file and move on.
        errors.write(filename + '\n')
        return

    if '投资者关系活动类别' not in ''.join(labels):
        return

    text = ''.join(values).replace(' ', '').replace('\n', '')
    wb = xlwt.Workbook()
    sheet = wb.add_sheet('Sheet1')
    row_index = 0
    for sentence in text.split('。'):
        if sentence == '':
            continue
        sheet.write(row_index, 0, content_process(sentence))
        row_index += 1
    wb.save(excelname)
def word_read(filename, excelname):
    """Extract the investor-relations record from a .docx into an .xls sheet.

    Reads the first table of the document. The cell at row 5, column 1 is
    taken as the body text; it is stripped of spaces and newlines, split on
    the Chinese full stop '。', passed through ``content_process`` and
    written one sentence per row to column A of 'Sheet1'.

    Args:
        filename: Path of the .docx file to read.
        excelname: Path of the .xls workbook to create.

    Returns:
        None. No workbook is written when the document cannot be read or
        has no such table cell (its path is appended to the module-level
        ``errors`` log instead), or when the top-left cell does not
        contain '投资'.
    """
    try:
        table = Document(filename).tables[0]
        header = table.cell(0, 0).text
        body = table.cell(5, 1).text
    except Exception:
        # Best effort: record the unreadable file and move on. Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop a batch.
        errors.write(filename + '\n')
        return

    if '投资' not in header:
        return

    wb = xlwt.Workbook()
    sheet = wb.add_sheet('Sheet1')
    row_index = 0
    for sentence in body.replace(' ', '').replace('\n', '').split('。'):
        if sentence == '':
            continue
        sheet.write(row_index, 0, content_process(sentence))
        row_index += 1
    wb.save(excelname)
# Example single-file invocations:
# pdf_read('000001平安银行调研活动信息20210129(1)_gssz0000001.pdf', '000001平安银行调研活动信息20210129(1)_gssz0000001.xls')
# word_read('000032深桑达A调研活动信息20210118_gssz0000032.docx', '000032深桑达A调研活动信息20210118_gssz0000032.xls')

# Word is driven over COM only to convert legacy .doc files to .docx so that
# word_read can open them.
wd = Dispatch("Word.application")
wd.Visible = 0
wd.DisplayAlerts = 0
try:
    for companyname in os.listdir(inputdir):
        print(companyname)
        companyinputdir = inputdir + companyname + '\\'
        companyoutputdir = outputdir + companyname + '\\'
        # Stray files in the input root are not company folders.
        if not os.path.isdir(companyinputdir):
            continue
        inputfiles = os.listdir(companyinputdir)
        if not inputfiles:
            continue
        # exist_ok lets the script be re-run over a partially processed tree,
        # and makedirs also creates the output root if it is missing.
        os.makedirs(companyoutputdir, exist_ok=True)
        for inputfile in inputfiles:
            inputpath = companyinputdir + inputfile
            # splitext keeps dots inside the name and yields the real
            # extension, instead of matching '.pdf'/'.doc' anywhere in it.
            stem, extension = os.path.splitext(inputfile)
            extension = extension.lower()
            outputfile = companyoutputdir + stem + '.xls'
            if extension == '.pdf':
                pdf_read(inputpath, outputfile)
            elif extension == '.docx':
                word_read(inputpath, outputfile)
            elif extension == '.doc':
                # Word must be given an absolute path.
                if not os.path.exists(inputpath):
                    continue
                converted = inputpath + 'x'
                try:
                    doc = wd.Documents.Open(inputpath)
                    try:
                        if not os.path.exists(converted):
                            doc.SaveAs(converted, 12)  # 12 = .docx format
                    finally:
                        doc.Close()
                except Exception:
                    # Conversion failed: log it and keep the original .doc.
                    errors.write(inputpath + '\n')
                    continue
                os.remove(inputpath)
                word_read(converted, outputfile)
finally:
    # Always shut down the Word process and flush the error log, even if
    # the batch is interrupted.
    try:
        wd.Quit()
    finally:
        errors.close()
做一个简单记录吧,就当练手啦😆😆