-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
196 lines (149 loc) · 6.91 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
"""
Team members: Pitnaree, Shafna, Eva, Cassie
Hackther Date: 5th Jul - 7th Jul
"""
#----------------------------------------------------------------------------------------------------------------
# LIBRARY
#----------------------------------------------------------------------------------------------------------------
import streamlit as st
import pandas as pd
import geopandas as gpd
import plotly.express as px
import json
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import fitz # PyMuPDF
import pytesseract
import cv2
from PIL import Image
import io
import numpy as np
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
import tempfile
# import os
# from transformers import GenerationConfig
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
# Location of the Tesseract binary that pytesseract shells out to.
# Adjust the path to your Tesseract installation.
pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(layout="wide")

# Load Hugging Face GPT model and tokenizer
model_name = "facebook/opt-1.3b"  # or use a different model like "distilgpt2"


@st.cache_resource
def _load_language_model(name):
    """Load the tokenizer and causal language model for ``name`` once.

    Streamlit re-executes this script on every user interaction; caching the
    loaded objects as a shared resource avoids re-instantiating the model on
    each rerun.

    Args:
        name: Hugging Face model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(name)
    return loaded_tokenizer, loaded_model


# Module-level names used by analyze_document().
tokenizer, model = _load_language_model(model_name)
#----------------------------------------------------------------------------------------------------------------
# FUNCTION
#----------------------------------------------------------------------------------------------------------------
@st.cache_data
def lambeth_resi_map():
    """Render the housing-estate table and a choropleth map of the estates.

    Reads ``./data/Housing_Estates.geojson`` and writes it to the page, then
    reads ``./data/Housing_Estates.shp`` and draws its geometries on a Plotly
    mapbox choropleth centred on the mean of the geometry centroids.

    Returns:
        None. Output is written directly to the Streamlit page.
    """
    # Tabular view of the GeoJSON version of the dataset.
    estates_table = gpd.read_file('./data/Housing_Estates.geojson')
    st.write(estates_table)

    # The shapefile version supplies the geometries that are drawn.
    estates = gpd.read_file('./data/Housing_Estates.shp')
    estates_geojson = json.loads(estates.to_json())

    # Centre the map on the average centroid.
    # NOTE(review): treats centroid y/x as latitude/longitude; confirm the
    # shapefile is stored in a geographic CRS.
    centroids = estates.geometry.centroid
    map_centre = {"lat": centroids.y.mean(), "lon": centroids.x.mean()}

    # Create a Plotly map using the GeoJSON data
    figure = px.choropleth_mapbox(
        estates,
        geojson=estates_geojson,
        locations=estates.index,
        # color="ES",  # Replace with the column you want to use for coloring
        mapbox_style="carto-positron",
        center=map_centre,
        zoom=10,
        opacity=0.5,
    )
    st.plotly_chart(figure, use_container_width=True)
def extract_text_from_pdf(pdf_file):
    """OCR every page of an uploaded PDF and return the concatenated text.

    The upload is written to a temporary ``.pdf`` file, rasterised with
    ``convert_from_path`` and each page image is passed through Tesseract.

    Args:
        pdf_file: File-like object exposing ``read()`` that yields the PDF
            bytes (the Streamlit upload in this script).

    Returns:
        The OCR text of all pages that were processed, in page order. If an
        error occurs it is reported with ``st.error`` and whatever text was
        extracted before the failure (possibly ``""``) is returned.
    """
    # Local import: the module-level ``import os`` is commented out, so the
    # cleanup below would otherwise fail with NameError.
    import os

    page_texts = []
    tmp_file_path = None
    try:
        # Save the uploaded PDF to a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(pdf_file.read())
        # Convert PDF to images
        images = convert_from_path(tmp_file_path)
        for image in images:
            page_texts.append(pytesseract.image_to_string(image))
    except Exception as e:
        st.error(f"Error reading PDF file: {e}")
    finally:
        # Remove the temporary file even when conversion or OCR failed.
        if tmp_file_path is not None:
            try:
                os.remove(tmp_file_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the
                # OCR result or the original error.
                pass
    return "".join(page_texts)
def extract_text_from_image(image_file):
    """Run Tesseract OCR on an uploaded image and return the recognised text.

    Args:
        image_file: Path or file-like object accepted by ``Image.open``.

    Returns:
        The OCR text, or ``""`` if opening or reading the image fails (the
        error is reported on the page with ``st.error``).
    """
    try:
        return pytesseract.image_to_string(Image.open(image_file))
    except Exception as err:
        st.error(f"Error reading image file: {err}")
        return ""
# def extract_entities(text):
# model_name = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name)
# input_text=text+" Act as an expert in building, real estate, construction industry, what is the type of this document"
# encoded_text = tokenizer(input_text, truncation=True, padding=True,return_tensors="pt")
# predictions = model(encoded_text)
# predicted_label = predictions[0][0]
# return predicted_label
def analyze_document(text):
    """Ask the language model to classify a document and list key-value pairs.

    Builds an instruction prompt around ``text``, feeds it to the module-level
    ``model`` via the module-level ``tokenizer`` and returns the decoded
    output sequence.

    Args:
        text: Text extracted from the submitted file.

    Returns:
        The decoded model output as a string.
        NOTE(review): the first generated sequence is decoded in full, which
        for a causal LM presumably includes the prompt itself — confirm
        whether callers want the prompt stripped.
    """
    # Total token budget shared by prompt and generated continuation.
    context_window = 2048
    # Tokens held back for the answer. Without this reserve, a long document
    # fills the whole window and generate() has no room left to produce output.
    reserved_for_answer = 256
    prompt_budget = context_window - reserved_for_answer

    header = (
        "You are an expert in the building, real estate, and construction industry. "
        "Analyze the following document to identify its type and extract key-value pairs. "
        "The document can be one of the following types: utility bills, drawings, maintenance records, quality assurance reports, contracts, inspection reports, or other types of documents related to the building industry. "
        "Based on the text extracted from the submitted file, identify the type of document and provide key-value pairs if applicable.\n\n"
        "Document Text:\n"
    )
    footer = (
        "\n\n"
        "Please provide your analysis in the following format:\n"
        "1. Document Type: [Type of the document]\n"
        "2. Key-Value Pairs (if any):\n"
        " - Key: Value\n"
        " - Key: Value\n"
        " - Key: Value\n"
    )

    # Trim only the document text when the prompt is too long. Truncating the
    # assembled prompt would cut from the end and drop the format instructions.
    overhead = len(tokenizer.encode(header + footer))
    document_budget = max(prompt_budget - overhead, 0)
    document_ids = tokenizer.encode(f"{text}", add_special_tokens=False)
    if len(document_ids) > document_budget:
        document = tokenizer.decode(document_ids[:document_budget], skip_special_tokens=True)
    else:
        document = f"{text}"

    prompt = header + document + footer

    # truncation=True stays as a safety net in case re-tokenising the trimmed
    # document yields slightly more tokens than were kept.
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=prompt_budget, truncation=True)
    outputs = model.generate(inputs, max_length=context_window, num_return_sequences=1, no_repeat_ngram_size=2)
    analysis = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return analysis
#----------------------------------------------------------------------------------------------------------------
# VARIABLES
#----------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------------------
# MAIN
#----------------------------------------------------------------------------------------------------------------
# Page header and the estate overview map.
st.title('Building Intel Buddy')
st.write('Welcome to the page, this is a unified platform we collective submit our building data for future measurement')
lambeth_resi_map()
#----------------------------------------------------------------------------------------------------------------
# SIDE BAR
#----------------------------------------------------------------------------------------------------------------
# The selected role is captured but not used further in this script.
user = st.sidebar.radio('Who are you?', options=['Owner of Unit', 'Owner of Building', 'Contractor', 'Developer'])
st.sidebar.divider()
st.sidebar.subheader('Building Thread Submission')
submit = st.sidebar.file_uploader('Upload your file', label_visibility='hidden')

if submit:
    file_type = submit.type
    if file_type == 'application/pdf':
        st.write('Extracting text from PDF...')
        col1, col2 = st.columns(2)

        # extract_text_from_pdf() returns only the OCR text, so the page
        # previews are rendered here. getvalue() leaves the upload's read
        # position untouched for the OCR call below.
        try:
            page_images = convert_from_bytes(submit.getvalue())
        except (PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError) as e:
            st.error(f"Error rendering PDF preview: {e}")
            page_images = []
        for image in page_images:
            col1.image(image)

        text = extract_text_from_pdf(submit)
        # Analyse the extracted text; shown beside the previews, as in the
        # image branch.
        entities = analyze_document(text)
        col2.text_area('Extracted Text', entities, height=1000)
    elif file_type.startswith('image/'):
        st.write('Extracting text from Image...')
        text = extract_text_from_image(submit)
        col1, col2 = st.columns(2)
        col1.image(submit)
        # Extract and display entities
        entities = analyze_document(text)
        col2.text_area('Extracted Text', entities, height=1000)
    else:
        st.error('Unsupported file type. Please upload a PDF or image file.')