forked from olovholm/NIME
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfextractor_oldmethod.py
89 lines (70 loc) · 3.2 KB
/
pdfextractor_oldmethod.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/python
#-*- coding: utf-8
#
# This script runs through the pdf files in the nime2012 folder and tries to extract text from them. The text is written to the screen.
# TODO: limit how much of each file is printed. Should also look for formatting.
#
# Alternative python pdf libraries: pyPdf, PDFMiner
# Tip from stackoverflow: https://code.google.com/p/pdfssa4met/
import os
import re
from pyPdf import PdfFileReader
import subprocess
# Marker words for the sections of interest in a paper.
abstract_word = "abstract"
introduction_word = {"intro", "introduction"}

# Matches the span "abstract ... keywords ... introduction" (case-insensitive).
# re.DOTALL lets '.' cross line breaks so the three headings may sit on
# different lines of the extracted page text; re.MULTILINE alone only changes
# how '^' and '$' behave, and the pattern uses neither.
SEARCHSTRING = re.compile(
    '(abstract)(.*)(keywords)(.*)(INTRODUCTION)',
    flags=re.MULTILINE | re.IGNORECASE | re.DOTALL,
)

# Root folder; each sub-folder of it is scanned for .pdf files.
dir_path = "nime_archive/web/"

# Per-folder tallies, keyed by folder name: PDFs in which SEARCHSTRING
# matched, and PDFs in which it did not.
found = {}
found_nothing = {}
class PdfDoc:
    """Hold a PDF reader object together with its producer metadata.

    Attributes:
        pdf: the reader object passed to the constructor.
        filename: label used in the printed messages, or None.
        producer: value of the document's ``/Producer`` entry, or None
            when the metadata could not be read.
    """

    def __init__(self, pdf, filename=None):
        """Instantiate the PdfDoc object and read the document metadata.

        Args:
            pdf: reader object exposing ``getDocumentInfo()``.
            filename: optional name shown in the printed messages. It is
                passed in explicitly so the class does not depend on loop
                variables defined elsewhere in the script.
        """
        self.pdf = pdf
        self.filename = filename
        self.producer = None
        label = filename if filename is not None else "<unknown>"
        try:
            meta = pdf.getDocumentInfo()
            self.producer = meta['/Producer']
        except Exception as e:
            # Best effort: a document with missing or unreadable metadata
            # is reported and kept, with producer left as None.
            print("error extracting DocumentInfo from file:%s, %s" % (label, e))
        else:
            print("filename: %s \t \t producer = %s " % (label, self.producer))

    def search(self):
        """Return True when the module-level ``found`` mapping is non-empty.

        NOTE(review): this reads the script-wide tally, not any state of
        this document; it looks like a placeholder - confirm the intent.
        """
        return bool(found)
# Walk every sub-folder of dir_path and tally, per folder, how many PDFs have
# a page matching SEARCHSTRING (found) and how many have none (found_nothing).
for folder in os.listdir(dir_path):
    # Every entry of dir_path gets a counter, even plain files, because the
    # summary at the end of the script indexes both dicts by every entry.
    found[folder] = 0
    found_nothing[folder] = 0
    folder_path = os.path.join(dir_path, folder)
    if not os.path.isdir(folder_path):
        continue
    for infile in os.listdir(folder_path):
        if not infile.endswith(".pdf"):
            continue
        workfile = os.path.join(folder_path, infile)
        try:
            stream = open(workfile, "rb")
        except (IOError, OSError) as e:
            print("Could not open file: %s, %s" % (workfile, e))
            # Skip only this file; the rest of the folder is still scanned.
            continue
        # The handle stays open while pages are read and is closed afterwards.
        with stream:
            try:
                input1 = PdfFileReader(stream)
            except Exception as e:
                print("Could not open file: %s, %s" % (workfile, e))
                continue
            try:
                pagenum = input1.getNumPages()
            except Exception as e:
                print("Error extracting pagenumber from pdf file: %s with error: %s" % (workfile, e))
                continue
            # Each PDF is counted exactly once: as found when any page
            # matches, otherwise as found_nothing.
            matched = False
            for i in range(pagenum):
                try:
                    text = input1.getPage(i).extractText()
                except Exception as e:
                    # An unreadable page is reported and skipped.
                    print("error extracting text: %s \t as %s" % (workfile, e))
                    continue
                if SEARCHSTRING.search(text):
                    matched = True
                    break
            if matched:
                print("Found %s" % infile)
                found[folder] += 1
                print(found[folder])
            else:
                print("Found nothing: %s" % infile)
                found_nothing[folder] += 1
                print(found_nothing[folder])
# Print one summary line per entry of dir_path: how many PDFs matched, how
# many did not, and the share of matches as a whole-number percentage.
for folder in os.listdir(dir_path):
    completed = int(found.get(folder, 0))
    missed = int(found_nothing.get(folder, 0))
    total = completed + missed
    # Folders without any counted PDF report 0 instead of dividing by zero.
    percentage = (100 * completed) // total if total else 0
    print("Number completed: %i, not completed: %i, percentage extracted: %i for %s" % (completed, missed, percentage, folder))