OCR fix #6

Open · wants to merge 16 commits into master
184 changes: 120 additions & 64 deletions LocateText.py
100755 → 100644
@@ -1,79 +1,135 @@
#!/usr/bin/python
# vim: set ts=2 expandtab:
"""
Module: LocateText
Desc:
Author: John O'Neil
Email: [email protected]
DATE: Saturday, Sept 14th 2013

Segment raw manga scan and output image
with text areas outlined in red.

"""
#import clean_page as clean

import image_io as imgio
import connected_components as cc
import run_length_smoothing as rls
import clean_page as clean
import ocr
import segmentation as seg
import furigana
import arg
import defaults

import numpy as np
import cv2
import argparse
import sys
import os
import scipy.ndimage
import datetime
import time

def process_image(infile, outfile, path, anno_dir_exists=False):

  t0 = time.perf_counter()

  # get correct output paths
  path = imgio.normalize_path(path)
  img_out, txt_out = imgio.get_output_directory(path, infile, path, os.path.splitext(infile)[0] + '.txt', outfile)
  html_path = os.path.split(img_out)[0]

  img = cv2.imread(path + infile)
  img_copy = img.copy()
  gray = clean.grayscale(img)

  binary_threshold = arg.integer_value('binary_threshold', default_value=defaults.BINARY_THRESHOLD)
  if arg.boolean_value('verbose'):
    print('Binarizing with threshold value of ' + str(binary_threshold))
  inv_binary = cv2.bitwise_not(clean.binarize(gray, threshold=binary_threshold))

  # segment the page; the text estimate lives in channel 2 of the result
  segmented_image = seg.segment_image(gray)
  segmented_image = segmented_image[:, :, 2]

  components = cc.get_connected_components(segmented_image)

  # draw bounding boxes around detected text and save the annotated image
  cc.draw_bounding_boxes(img, components, color=(255, 0, 0), line_size=2)
  imgio.save_image(img, img_out)

  if arg.boolean_value('verbose'):
    t1 = time.perf_counter()
    print(f"Image processing done in {t1 - t0:.2f} seconds")

  # without --html there is nothing further to generate
  if not arg.boolean_value('html'):
    return anno_dir_exists

  texts = ocr.ocr_on_bounding_boxes(img_copy, components)

  if arg.boolean_value('verbose'):
    t2 = time.perf_counter()
    print(f"OCR done in {t2 - t1:.2f} seconds")

  if arg.boolean_value('debug'):
    imgio.save_text(texts, txt_out)

  html_out = imgio.generate_html(os.path.split(img_out)[1], components, texts)
  if not anno_dir_exists:
    # copy the annotorious viewer assets next to the generated html only once
    imgio.copytree_('./annotorious/', imgio.normalize_path(html_path) + 'annotorious/')
    anno_dir_exists = True

  imgio.save_webpage(html_out, os.path.splitext(img_out)[0] + '.html')

  return anno_dir_exists

def main():

  # Working commands
  # python LocateText.py -i <infile> -o <outfile>
  # python LocateText.py -p <inpath> -o <outpath>
  # python LocateText.py -p <inpath> --default_directory

  parser = argparse.ArgumentParser(description='Generate text file containing OCR\'d text.')
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument('-i', '--infile', help='Input image to annotate.', dest='infile', type=str)
  group.add_argument('-p', '--inpath', help='Path to directory containing image(s).', dest='inpath', type=str)
  parser.add_argument('-o', '--output', help='Output file or filepath.', dest='outfile')
  parser.add_argument('-s', '--scheme', help='Output naming scheme. Appended to input filename.', dest='scheme', type=str, default='_text')
  parser.add_argument('-v', '--verbose', help='Verbose operation. Print status messages during processing.', action="store_true")
  #parser.add_argument('-d','--debug', help='Overlay input image into output.', action="store_true")
  parser.add_argument('--html', help='Display output in an html file.', action="store_true")
  parser.add_argument('--furigana', help='Attempt to suppress furigana characters which interfere with OCR.', action="store_true")
  parser.add_argument('--sigma', help='Std dev of gaussian preprocessing filter.', type=float, default=None)
  parser.add_argument('--binary_threshold', help='Binarization threshold value from 0 to 255.', type=int, default=defaults.BINARY_THRESHOLD)
  parser.add_argument('--additional_filtering', help='Attempt to filter false text positives by histogram processing.', action="store_true")
  parser.add_argument('--default_directory', help='Store output in predefined folders.', action="store_true")
  arg.value = parser.parse_args()

  infile = arg.string_value('infile')
  inpath = arg.string_value('inpath')
  scheme = arg.string_value('scheme')
  outfile = arg.string_value('outfile')

  # single-file mode
  if infile:
    if not os.path.isfile(infile):
      print('Please provide a regular existing input file. Use -h option for help.')
      sys.exit(-1)
    if arg.boolean_value('verbose'):
      print('File exists. Performing OCR . . .')
    path_, infile_ = os.path.split(infile)
    process_image(infile_, outfile, path_)
    sys.exit(0)

  # directory mode: validate the input path before listing it
  if not os.path.isdir(inpath):
    print('Please provide an existing input directory. Use -h option for help.')
    sys.exit(-1)

  infiles = os.listdir(inpath)
  if not infiles:
    print('Directory is empty. Place images in the desired folder. Use -h option for help.')
    sys.exit(-1)

  if arg.boolean_value('verbose'):
    print('Non-empty directory. Attempting to perform ocr on all files . . .')

  anno_dir_exists = False
  for infile_ in infiles:
    try:
      outfile_ = outfile
      anno_dir_exists = process_image(infile_, outfile_, inpath, anno_dir_exists)
    except AttributeError:
      # cv2.imread returns None for non-images; skip them rather than abort the batch
      if arg.boolean_value('verbose'):
        print('Input file "', infile_, '" is not an image', sep='')

if __name__ == '__main__':
main()
74 changes: 30 additions & 44 deletions README.md
@@ -8,6 +8,21 @@ Overview
--------
This repository holds some experiments I did in summer 2013 during a sudden interest in text detection in images. It uses some standard techniques (run length smoothing, connected component analysis) and some experimental stuff. Overall, I was able to get in the neighborhood of where I wanted to be, but the results are very processing intensive and not terribly reliable.

Usage
-----
Provide an input file or path using the `-i` and `-p` arguments, respectively. When an input path is provided, all files in the directory are processed. A corresponding output file or path may be supplied with the `-o` argument; if no output argument is given, output is saved to the default file/directory.
```
python LocateText.py -i <infile> -o <outfile>
python LocateText.py -p <inpath> -o <outpath>
python LocateText.py -p <inpath> --default_directory
```
Other useful arguments:
```
--html : Display output in an html file.
--default_directory : Store output in predefined folders. This is recommended when processing a directory of images, since otherwise input files may be overwritten.
--additional_filtering : Attempt to filter false text positives by histogram processing.
```
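
For example, to process every image in a directory into the predefined output folders, with HTML annotation and progress messages (`./scans/` is just a placeholder path):
```
python LocateText.py -p ./scans/ --default_directory --html --verbose
```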

State
-----
I haven't bothered to form this into a python library. It's just a series of scripts each trying out various things, such as:
@@ -19,17 +34,17 @@ I haven't bothered to form this into a python library. It's just a series of scr
Text Location Example
---------------------
Here's an example run of a page from Weekly Young Magazine #31 2013. The input image is as follows (jpg).
![Input image](doc/194.jpg)

An initial estimate of text locations can be found by the `LocateText.py` script:

```
python LocateText.py -i 194.jpg
```

With the results as follows (estimated text marked with red boxes):

![Locate text output](doc/194_text_areas_no_filter.jpg)

Note that in the output above you see several of the implementation deficiencies. For example, there are several small false positives scattered around, and some major false positives on the girl's sleeve and eyes in panels 2 and 3.
Also note that many large areas of text were not detected (false negatives). Despite how pleased I was with the results (and I was more pleased than you could possibly believe), significant improvements are needed.
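
The `--additional_filtering` argument attempts to prune false positives like these via histogram processing; a run on the same page might look like this (the output filename is just an example):
```
python LocateText.py -i 194.jpg --additional_filtering -o 194_text_areas_filtered.png
```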
Expand All @@ -39,63 +54,33 @@ Text Segmentation Example
To more easily separate text from background you can also segment the image, with text areas and non text being separated into different (RGB) color channels. This easily allows you to remove estimated text from image entirely or vice-versa.
Use the command:
```
python segmentation.py 194.jpg -o 194_segmentation.png
```
The results follow:

![Segmentation image](doc/194_segmentation.png)
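
Since `LocateText.py` reads the text estimate out of channel 2 of the segmented image (`segmented_image[:,:,2]`), a small Python sketch along these lines could split text from background (file names are illustrative):
```python
import cv2

# OpenCV loads images in BGR order, so channel 2 holds the text estimate
seg = cv2.imread('194_segmentation.png')
text_only = seg[:, :, 2]          # keep only the estimated text pixels
no_text = seg.copy()
no_text[:, :, 2] = 0              # zero the text channel to erase estimated text
cv2.imwrite('194_text_only.png', text_only)
cv2.imwrite('194_no_text.png', no_text)
```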

OCR and Html Generation
-----------------------
I did take the time to run simple OCR on some of the located text regions. This uses pytesseract, the Python wrapper for Google's Tesseract OCR engine. Unlike the original version of the `ocr.py` script, text is assumed to be vertically aligned.
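
A minimal sketch of the kind of call involved (the real crop-and-recognize loop lives in `ocr.py`; the bounding box below is made up, and `--psm 5` is Tesseract's page-segmentation mode for a single uniform block of vertical text):
```python
import cv2
import pytesseract

img = cv2.imread('194.jpg')
x, y, w, h = 100, 50, 60, 300                # hypothetical box from connected-component analysis
region = img[y:y + h, x:x + w]
# requires the jpn_vert trained data to be installed
text = pytesseract.image_to_string(region, lang='jpn_vert', config='--psm 5')
print(text)
```
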
I also embedded those results in an HTML output, allowing "readers" to hover over the Japanese text, revealing the OCR output, which can be edited/copied/pasted. This is enabled by the optional argument `--html`. Some examples can be seen below:

![OCR output success](doc/194_html_ocr_success.png)
![OCR output failure](doc/194_html_ocr_failure.png)

Even with the `jpn_vert` trained data, results are still unreliable. Most of the wider bounding boxes return successful results; errors mostly occur when a bounding box contains only a single line of vertically aligned text.

Dependencies
-----------------------
You should be able to install most of the dependencies via pip, or through your operating system's package manager (e.g. Homebrew for Mac OS X, http://brew.sh/).
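
For instance, a plausible one-shot install of the Python-level dependencies (PyPI package names; untested, adjust versions as needed):
```
pip install numpy scipy opencv-python pytesseract
```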

### Python 3+

https://www.python.org/

Install as per OS instructions.

### Pip

http://pip.readthedocs.org/en/latest/index.html

Install as per OS instructions.

### Numpy

http://www.numpy.org/
@@ -141,8 +126,9 @@ Install as per OS instructions, this should also include the python bindings.
https://code.google.com/p/tesseract-ocr/

Install as per OS instructions, then use pip to install the python bindings.
Don't forget to include your target language's trained data sets and to set the `tesseract_cmd` path (see the `ocr.py` script).

```
pip install pytesseract
```
Then edit line 7 of the `ocr.py` script to point at your Tesseract binary:
```
pytesseract.pytesseract.tesseract_cmd = <path>
```
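
On Debian/Ubuntu, for example, the Japanese trained data (including the vertical variant) is typically packaged as follows, though package names vary by distribution:
```
sudo apt-get install tesseract-ocr tesseract-ocr-jpn tesseract-ocr-jpn-vert
```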