处理PDF doc docx截取文本得到excel


处理PDF doc docx截取文本得到excel

根据上一节爬取到的深市企业调研公告,本节对其进行处理:从包含表格的文件中提取文本,按句号切分成句子,再逐句写入 Excel。本节的主要工作是先把 doc 文件转换为 docx 再处理,并针对不同格式的文件采用不同的处理方法,操作上比较繁琐。

主要代码如下:

import pdfplumber
import pandas as pd
import os
import xlwt
from docx import Document
from win32com.client import Dispatch

# Root folder that is scanned for input files; it is expected to contain one
# sub-folder per company, each holding that company's PDF/doc/docx files.
inputdir = 'D:\\爬取巨潮资讯网\\' # this directory must be created and set by hand
# Root folder under which a per-company sub-folder of .xls results is written.
outputdir = 'D:\\爬取结果整理\\'
# Append-mode log that receives the path of every file that could not be processed.
errors = open('C:\\Users\\15617\\Desktop\\'+'异常信息_excel处理.txt', 'a', encoding='utf-8')

def content_process(content):
    """Strip the leading label (e.g. a "Q:" style prefix) from one sentence.

    The sentence is cut at the first delimiter that occurs in it, trying the
    delimiters in priority order: colon first, then question mark, each in its
    half-width and full-width form. Everything after that first occurrence is
    returned, so later colons or question marks inside the sentence are kept
    instead of being silently dropped.

    Args:
        content: A single sentence of extracted text.

    Returns:
        The text following the first matching delimiter, or ``content``
        unchanged when it contains none of the delimiters.
    """
    # Chinese documents normally use the full-width forms, so both widths are
    # checked; the half-width form keeps priority within each pair.
    for delimiter in (':', ':', '?', '?'):
        _, found, remainder = content.partition(delimiter)
        if found:
            return remainder
    return content

def pdf_read(filename, excelname):
    """Extract table text from a PDF and write it sentence by sentence to .xls.

    The first two columns of every table on every page are concatenated. The
    file is only converted when the first column contains the marker
    '投资者关系活动类别'; otherwise nothing is written. The second column is
    stripped of spaces and newlines, split on '。', passed through
    ``content_process`` and written to column A of 'Sheet1', one sentence per
    row.

    Args:
        filename: Path of the PDF file to read.
        excelname: Path of the .xls workbook to create.

    Returns:
        None. On any read/parse failure the PDF path is appended to the
        module-level ``errors`` log and no workbook is written.
    """
    columns = ([], [])
    try:
        # The context manager releases the file handle on every exit path,
        # including parse errors and the early return below.
        with pdfplumber.open(filename) as pdf:
            for page in pdf.pages:
                for extracted in page.extract_tables():
                    for row in extracted:
                        # Empty cells come back as None and some rows have a
                        # single cell; skip those instead of failing the file.
                        for position in (0, 1):
                            if position < len(row) and row[position] is not None:
                                columns[position].append(row[position])
    except Exception:
        errors.write(filename + '\n')
        return

    if '投资者关系活动类别' not in ''.join(columns[0]):
        return

    wb = xlwt.Workbook()
    sheet = wb.add_sheet('Sheet1')
    index = 0
    body = ''.join(columns[1]).replace(' ', '').replace('\n', '')
    for content in body.split('。'):
        if content == '':
            continue
        sheet.write(index, 0, content_process(content))
        index += 1

    wb.save(excelname)

def word_read(filename, excelname):
    """Extract one table cell from a .docx file and write it to .xls by sentence.

    Reads cell (row 5, column 1) of the document's first table, strips spaces
    and newlines, splits the text on '。', passes each sentence through
    ``content_process`` and writes it to column A of 'Sheet1', one sentence
    per row. The file is only converted when cell (0, 0) of that table
    contains '投资'.

    Args:
        filename: Path of the .docx file to read.
        excelname: Path of the .xls workbook to create.

    Returns:
        None. When the document cannot be opened or lacks the expected table
        layout, the path is appended to the module-level ``errors`` log and no
        workbook is written.
    """
    try:
        table = Document(filename).tables[0]
        # NOTE(review): the fixed (5, 1) position presumably holds the main
        # body of the announcement form -- confirm against sample documents.
        raw_text = table.cell(5, 1).text
        header_text = table.cell(0, 0).text
    except Exception:
        # Only ordinary failures are logged; KeyboardInterrupt/SystemExit
        # are allowed to propagate.
        errors.write(filename + '\n')
        return

    if '投资' not in header_text:
        return

    wb = xlwt.Workbook()
    sheet = wb.add_sheet('Sheet1')
    index = 0
    for content in raw_text.replace(' ', '').replace('\n', '').split('。'):
        if content == '':
            continue
        sheet.write(index, 0, content_process(content))
        index += 1

    wb.save(excelname)

# pdf_read('000001平安银行调研活动信息20210129(1)_gssz0000001.pdf', '000001平安银行调研活动信息20210129(1)_gssz0000001.xls')
# word_read('000032深桑达A调研活动信息20210118_gssz0000032.docx', '000032深桑达A调研活动信息20210118_gssz0000032.xls')

# Driver: walk every company folder under inputdir and convert each PDF/docx/doc
# file into an .xls workbook under the matching folder in outputdir.
# A single hidden Word instance is reused for all .doc -> .docx conversions.
wd = Dispatch("Word.application")
wd.Visible = 0
wd.DisplayAlerts = 0

try:
    for companyname in os.listdir(inputdir):
        print(companyname)
        companyinputdir = inputdir + companyname + '\\'
        companyoutputdir = outputdir + companyname + '\\'
        # Stray files in the input root are not company folders.
        if not os.path.isdir(companyinputdir):
            continue
        inputfiles = os.listdir(companyinputdir)
        if not inputfiles:
            continue
        # Tolerate reruns and a missing output root instead of crashing.
        os.makedirs(companyoutputdir, exist_ok=True)
        for inputfile in inputfiles:
            inputpath = companyinputdir + inputfile
            # Use the real extension (case-insensitive) and keep the full stem,
            # so names containing extra dots are neither misclassified nor
            # truncated into colliding output names.
            stem, extension = os.path.splitext(inputfile)
            extension = extension.lower()
            outputfile = companyoutputdir + stem + '.xls'
            if extension == '.pdf':
                pdf_read(inputpath, outputfile)
            elif extension == '.docx':
                word_read(inputpath, outputfile)
            elif extension == '.doc':
                # Word needs an absolute path to open the document.
                if not os.path.exists(inputpath):
                    continue
                try:
                    doc = wd.Documents.Open(inputpath)
                except Exception:
                    errors.write(inputpath + '\n')
                    continue
                creatfile = inputpath + 'x'
                try:
                    if not os.path.exists(creatfile):
                        doc.SaveAs(creatfile, 12)  # 12 = docx format
                except Exception:
                    # Conversion failed: log it and keep the original .doc.
                    errors.write(inputpath + '\n')
                    continue
                finally:
                    # Always release the document, even when SaveAs fails.
                    doc.Close()
                # The .doc is replaced by its .docx counterpart.
                os.remove(inputpath)
                word_read(creatfile, outputfile)
finally:
    # Always shut the hidden Word process down and persist the error log,
    # even when the run is interrupted.
    wd.Quit()
    errors.close()

做一个简单记录吧,就当练手啦😆😆


评论
  目录