-
Notifications
You must be signed in to change notification settings - Fork 0
/
Summarizing-Academic-Export-to-Excel Independently
113 lines (92 loc) · 4 KB
/
Summarizing-Academic-Export-to-Excel Independently
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import glob
import time
import sys
from transformers import pipeline
import tqdm
from datetime import datetime
import openpyxl
from openpyxl.styles import Alignment, Font, Border, Side, PatternFill
import nltk
# Fetch the sentence-tokenizer data that nltk.sent_tokenize needs.
# Older NLTK releases load the pickled "punkt" models, while NLTK 3.9+ loads
# the "punkt_tab" resource instead, so request both to work on either.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)
def progress_bar(current, total, message):
    """Draw a one-line text progress bar on stdout, overwriting the current line.

    The line is written with a leading carriage return and no trailing
    newline, so successive calls redraw the same terminal line.

    Args:
        current: Number of units of work completed so far.
        total: Total number of units of work. A value of zero or less is
            treated as "nothing left to do" and drawn as 100% instead of
            raising ZeroDivisionError.
        message: Label printed in front of the bar.
    """
    bar_length = 50
    if total <= 0:
        progress = 1.0
    else:
        # Clamp so an out-of-range `current` cannot over- or under-fill the bar.
        progress = min(max(current / total, 0.0), 1.0)
    filled = int(bar_length * progress)
    # Trim and pad to exactly bar_length characters so the closing bracket
    # stays in the same column at every percentage, including 100%.
    bar = ("=" * filled + ">")[:bar_length].ljust(bar_length)
    sys.stdout.write(f"\r{message}: [{bar}] {int(progress * 100)}%")
    sys.stdout.flush()
# Summarize every "<author>-<title>-<year>-<journal>.txt" file in the current
# directory into its own formatted Excel workbook named
# "summary_of_<author>-<title>-<year>-<journal>.xlsx".

# Summarization pipeline, loaded once and reused for every chunk of every file.
summarizer = pipeline("summarization", model="lidiya/bart-large-xsum-samsum")

# Upper bound on the summed sentence lengths (in characters) of one chunk.
CHUNK_SIZE = 1024

HEADERS = ["ID", "Author", "Title", "Year", "Journal", "Summary ID", "Summary", "Date"]
SUMMARY_COLUMN = 7  # 1-based position of "Summary" in HEADERS (sheet column G)

# Style objects are built once and shared by all cells instead of being
# re-created for every cell of every workbook.
_THIN_SIDE = Side(style="thin")
_CELL_BORDER = Border(left=_THIN_SIDE, right=_THIN_SIDE, top=_THIN_SIDE, bottom=_THIN_SIDE)
_HEADER_FONT = Font(bold=True)
_HEADER_FILL = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")
_HEADER_ALIGNMENT = Alignment(horizontal="center", vertical="center", wrap_text=True)
_BODY_ALIGNMENT = Alignment(vertical="top", wrap_text=True)
_SUMMARY_FONT = Font(size=11)


def _split_into_chunks(sentences, chunk_size):
    """Group consecutive sentences into space-joined chunks.

    Sentences are added to the current chunk while the sum of their lengths
    stays within chunk_size (the joining spaces are not counted). A single
    sentence longer than chunk_size becomes a chunk of its own. Empty chunks
    are never produced.
    """
    chunks = []
    current = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(sentence)
        # Flush only a non-empty chunk: an over-long first sentence must not
        # leave an empty string behind to be sent to the model.
        if current and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current))
            current = []
            current_length = 0
        current.append(sentence)
        current_length += sentence_length
    if current:
        chunks.append(" ".join(current))
    return chunks


def _write_header_row(sheet):
    """Write the styled HEADERS titles into the first row of sheet."""
    for col_number, title_text in enumerate(HEADERS, 1):
        cell = sheet.cell(row=1, column=col_number)
        cell.value = title_text
        cell.font = _HEADER_FONT
        cell.fill = _HEADER_FILL
        cell.border = _CELL_BORDER
        cell.alignment = _HEADER_ALIGNMENT


def _append_summary_row(sheet, row_values):
    """Append row_values to sheet as a bordered, top-aligned, wrapped row."""
    sheet.append(row_values)
    row_number = sheet.max_row
    for col_number in range(1, len(row_values) + 1):
        cell = sheet.cell(row=row_number, column=col_number)
        cell.border = _CELL_BORDER
        cell.alignment = _BODY_ALIGNMENT
        if col_number == SUMMARY_COLUMN:
            cell.font = _SUMMARY_FONT
    sheet.row_dimensions[row_number].height = 80  # room for wrapped summary text


text_files = glob.glob("*.txt")
for text_file in text_files:
    # The metadata columns come from the file name; skip files that do not
    # follow the naming scheme instead of aborting the whole run.
    name_parts = os.path.splitext(text_file)[0].split("-")
    if len(name_parts) != 4:
        print(f"Skipping {text_file}: expected a name like author-title-year-journal.txt.")
        continue
    author, title, year, journal = name_parts

    summary_file = f"summary_of_{author}-{title}-{year}-{journal}.xlsx"
    if os.path.exists(summary_file):
        print(f"Summary file for {text_file} already exists. Skipping summarization.")
        continue

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    _write_header_row(sheet)

    with open(text_file, "r", encoding="utf-8") as f:
        content = f.read()
    chunks = _split_into_chunks(nltk.sent_tokenize(content), CHUNK_SIZE)

    # Report real progress: the bar advances as chunks are actually summarized.
    total_chunks = len(chunks)
    summaries = []
    for chunk_number, chunk_text in enumerate(chunks, 1):
        progress_bar(
            chunk_number - 1,
            total_chunks,
            f"Summarizing chunk {chunk_number}/{total_chunks} of {text_file}",
        )
        # truncation=True asks the pipeline to cut input that exceeds the
        # model's maximum length (e.g. one very long sentence) rather than fail.
        result = summarizer(chunk_text, truncation=True)
        summaries.append(result[0]["summary_text"])
    if chunks:
        progress_bar(
            total_chunks,
            total_chunks,
            f"Summarizing chunk {total_chunks}/{total_chunks} of {text_file}",
        )
        sys.stdout.write("\n")  # progress_bar leaves the cursor on its own line

    current_date = datetime.today().strftime('%Y-%m-%d')
    file_id = "1"  # each workbook holds a single source file
    for summary_number, summary in enumerate(summaries, 1):
        summary_id = f"{file_id}-{summary_number}"
        _append_summary_row(
            sheet,
            [file_id, author, title, year, journal, summary_id, summary, current_date],
        )

    # Widen the "Summary" column and keep the header row visible when scrolling.
    sheet.column_dimensions["G"].width = 80
    sheet.freeze_panes = "A2"

    # Save before announcing success so the message is only shown once the
    # workbook actually exists on disk.
    workbook.save(summary_file)
    print(f"Summary of {text_file} added to the Excel file.")