-
Notifications
You must be signed in to change notification settings - Fork 1
/
table_recognition_2.0.py
183 lines (133 loc) · 5.27 KB
/
table_recognition_2.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 20 16:11:45 2019
Tkinter GUI that recognises tables in pictures with Tencent Cloud OCR and exports them to Excel.
@author: situ
"""
from pandas import Series,DataFrame,ExcelWriter
import os
import re
from json import loads
from base64 import b64encode
##导入腾讯AI api
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models
from tkinter import filedialog,Tk,Label,Button,Menu,Entry
# OCR helper; adapted from the official documentation.
def excelFromPictures(path,picture):
    """Run Tencent Cloud table OCR on one picture and export the result to Excel.

    The picture is opened relative to the current working directory, sent to
    the ``TableOCR`` endpoint as base64, and every recognised text is placed
    in a grid cell addressed by the ``RowTl`` / ``ColTl`` values of the
    response. The workbook is written to ``<path>/tables/<picture stem>.xlsx``.
    This function does not create the ``tables`` directory itself.

    Args:
        path: Directory that holds the ``tables`` output folder.
        picture: File name (or path) of the picture to recognise.

    Returns:
        None. When the OCR request raises ``TencentCloudSDKException`` the
        error is printed and no workbook is written.
    """
    # SecretId / SecretKey are applied for in the Tencent Cloud console.
    SecretId = ""
    SecretKey = ""

    with open(picture, "rb") as f:
        img_base64 = b64encode(f.read())

    cred = credential.Credential(SecretId, SecretKey)
    httpProfile = HttpProfile()
    httpProfile.endpoint = "ocr.tencentcloudapi.com"
    clientProfile = ClientProfile()
    clientProfile.httpProfile = httpProfile
    client = ocr_client.OcrClient(cred, "ap-shanghai", clientProfile)

    req = models.TableOCRRequest()
    # Base64 text contains no characters that need JSON escaping.
    params = '{"ImageBase64":"' + str(img_base64, 'utf-8') + '"}'
    req.from_json_string(params)

    try:
        resp = client.TableOCR(req)
    except TencentCloudSDKException as err:
        print("错误[",err,"]\n可重试")
        # Without a response there is nothing to export; stop here instead of
        # failing later on an unbound ``resp``.
        return

    # RowTl is the row index of a cell, ColTl its column index, Text its content.
    detections = loads(resp.to_json_string())['TextDetections']

    row_labels = sorted({item['RowTl'] for item in detections})
    col_labels = sorted({item['ColTl'] for item in detections})
    data = DataFrame(index=row_labels, columns=col_labels)
    for item in detections:
        # Spaces inside a recognised cell are dropped; a later detection for
        # the same cell overwrites an earlier one.
        data.loc[item['RowTl'], item['ColTl']] = item['Text'].replace(" ", "")

    # splitext also copes with picture names that have no extension.
    stem = os.path.splitext(picture)[0]
    out_file = os.path.join(path, "tables", stem + ".xlsx")
    # The context manager saves and closes the workbook on every pandas version.
    with ExcelWriter(out_file, engine='xlsxwriter') as writer:
        data.to_excel(writer, sheet_name='Sheet1', index=False, header=False)

    print("已经完成" + picture + "的提取")
# Single-picture mode: takes the path of one table picture.
def one_pic():
    """Recognise the single picture whose path is in the first entry box.

    Changes the working directory to the picture's folder, makes sure a
    ``tables`` sub-folder exists there, and hands the picture to
    ``excelFromPictures``. Does nothing (apart from a message) when the
    entry box is empty.
    """
    picture_path = entry_filename1.get().strip()
    if not picture_path:
        print("请先选择一张表格图片")
        return
    print(picture_path)

    # abspath keeps the directory non-empty even for a bare file name, so the
    # chdir below cannot be called with "".
    path, picture_name = os.path.split(os.path.abspath(picture_path))
    os.chdir(path)
    os.makedirs(os.path.join(path, "tables"), exist_ok=True)
    excelFromPictures(path, picture_name)
# Batch mode: takes the list of picture file paths from the second entry box.
def batch():
    """Recognise every picture listed in the second entry box.

    The entry text is treated as a space-separated list in which a path that
    contains spaces is wrapped in ``{...}``. All pictures are assumed to sit
    in the folder of the first one. The ``tables`` output folder is created
    in the parent of that folder. Does nothing (apart from a message) when
    no path is given.
    """
    file_str = entry_filename2.get()

    # A braced item is taken whole so embedded spaces survive; anything else
    # is split on spaces exactly as before.
    tokens = re.findall(r"\{([^{}]*)\}|([^{} ]+)", file_str)
    file_names = [(braced or bare).strip() for braced, bare in tokens]
    file_names = [name for name in file_names if name]
    if not file_names:
        print("请先选择表格图片")
        return

    # abspath keeps the directory non-empty even for a bare file name.
    pictures_path = os.path.dirname(os.path.abspath(file_names[0]))
    path = os.path.dirname(pictures_path)
    os.chdir(pictures_path)

    os.makedirs(os.path.join(path, "tables"), exist_ok=True)
    for name in file_names:
        excelFromPictures(path, os.path.basename(name))
# Root window of the application: title and fixed initial size.
window = Tk()
window.wm_title('表格识别神器')
window.wm_geometry('300x200')
def file_input_one():
    """Ask for one picture file and put its path into the first entry box.

    The previous entry content is replaced, so picking a file twice does not
    glue two paths together. A cancelled dialog leaves the entry untouched.
    """
    filename = filedialog.askopenfilename(title='导入图片文件')
    if not filename:
        # Nothing was chosen (the dialog was presumably cancelled).
        return
    entry_filename1.delete(0, 'end')
    entry_filename1.insert('insert', filename)
def file_input_batch():
    """Ask for several picture files and put their paths into the second entry box.

    The previous entry content is replaced, so a second selection is not
    appended to the first without a separator. A cancelled dialog leaves the
    entry untouched.
    """
    filename = filedialog.askopenfilenames(title='导入图片文件')
    if not filename:
        # Nothing was chosen (the dialog was presumably cancelled).
        return
    entry_filename2.delete(0, 'end')
    # The selection is inserted unchanged; batch() parses the resulting text.
    entry_filename2.insert('insert', filename)
# Menu bar with one "File" cascade offering the two file pickers.
menubar = Menu(window)
filemenu = Menu(menubar, tearoff=0)
menubar.add_cascade(menu=filemenu, label='File')
filemenu.add_command(command=file_input_one, label='Open_one_file')
filemenu.add_command(command=file_input_batch, label='Open_files')
window.configure(menu=menubar)

# Rows 0-1: single-picture recognition (caption, path entry, start button).
l1 = Label(
    window,
    font=("宋体", 10, 'bold'),
    text="单张表格图片识别",
)
l1.grid(row=0, column=0)
entry_filename1 = Entry(window, font=("arial", 10), width=30)
entry_filename1.grid(row=1, column=0)
b1 = Button(window, command=one_pic, text="开始识别")
b1.grid(row=1, column=1)

# Rows 2-3: batch recognition (caption and path entry).
l2 = Label(
    window,
    font=("宋体", 10, 'bold'),
    text="批量表格图片识别",
)
l2.grid(row=2, column=0)
entry_filename2 = Entry(window, font=("arial", 10), width=30)
entry_filename2.grid(row=3, column=0)
def test_batch():
    """Debug helper: print the raw batch entry text and the tokens it splits into."""
    raw_text = entry_filename2.get()
    print(raw_text)
    tokens = re.split(r"[{} ]", raw_text)
    print(tokens)
# Start button for batch recognition, placed next to the batch entry box.
b2 = Button(window, command=batch, text="开始识别")
b2.grid(row=3, column=1)

# Usage hint shown under the batch row.
tips = Label(
    window,
    font=("仿宋", 8),
    text="注:图片名称中不允许有空格",
)
tips.grid(row=4, column=0)

# Hand control to Tk's event loop.
window.mainloop()
# Packaging command:
#pyinstaller -p C:/Users/situ/Anaconda2/envs/py3/Lib/site-packages -D table_recognition_2.0.py
# The generated exe file is located at:
#cd E:/self_programming/table_tkinter/dist/table_recognition_2.0.exe