"""OCR in Python using the Tesseract engine from Google
http://code.google.com/p/pytesser/
by Michael J.T. O'Kelly
V 0.0.1, 3/10/07"""
import Image
import subprocess
import util
import errors
# Name of the executable to be called at the command line.
tesseract_exe_name = "tesseract"
# Scratch image file; must be .bmp or another Tesseract-compatible format.
scratch_image_name = 'temp.bmp'
# Root of the scratch text file name (leave out the .txt extension).
scratch_text_name_root = 'temp'
# When true, temporary files are cleaned up after an OCR operation.
cleanup_scratch_flag = True
def call_tesseract(input_filename, output_filename):
    """Run the external Tesseract executable on an image file.

    The command line is built from tesseract_exe_name, input_filename and
    output_filename, in that order. The original notes state that the input
    file type is restricted and that the recognized text is written to
    output_filename plus a 'txt' extension.

    Blocks until the child process exits. When the exit status is non-zero,
    errors.check_for_errors() is called. Nothing is returned.
    """
    command = [tesseract_exe_name, input_filename, output_filename]
    exit_status = subprocess.Popen(command).wait()
    if exit_status == 0:
        return
    # Non-zero exit status: delegate to errors.check_for_errors().
    errors.check_for_errors()
def image_to_string(im, cleanup=cleanup_scratch_flag):
    """Run Tesseract on an image object and return the resulting text.

    im is written to the scratch image file with util.image_to_scratch,
    Tesseract is called on that file, and the text is fetched with
    util.retrieve_text.

    If cleanup is true, util.perform_cleanup is called on the scratch
    files afterwards, whether or not the steps above raised.
    """
    try:
        util.image_to_scratch(im, scratch_image_name)
        call_tesseract(scratch_image_name, scratch_text_name_root)
        # Returning from inside the try still runs the finally clause below.
        return util.retrieve_text(scratch_text_name_root)
    finally:
        if cleanup:
            util.perform_cleanup(scratch_image_name, scratch_text_name_root)
def image_file_to_string(filename, cleanup=cleanup_scratch_flag, graceful_errors=True):
    """Run Tesseract directly on an image file and return the resulting text.

    Tesseract is first called on filename as-is. If that raises
    errors.Tesser_General_Exception and graceful_errors is true, the file is
    opened with Image.open and passed through image_to_string instead; with
    graceful_errors false the exception is re-raised to the caller.

    If cleanup is true, util.perform_cleanup is called on the scratch files
    afterwards, whether or not an exception occurred.
    """
    try:
        call_tesseract(filename, scratch_text_name_root)
        return util.retrieve_text(scratch_text_name_root)
    except errors.Tesser_General_Exception:
        if not graceful_errors:
            raise
        # Fall back to the image-object path, which writes the scratch image
        # file before calling Tesseract.
        return image_to_string(Image.open(filename), cleanup)
    finally:
        if cleanup:
            util.perform_cleanup(scratch_image_name, scratch_text_name_root)
if __name__=='__main__':
im = Image.open('phototest.tif')
text = image_to_string(im)
print text
try:
text = image_file_to_string('fnord.tif', graceful_errors=False)
except errors.Tesser_General_Exception, value:
print "fnord.tif is incompatible filetype. Try graceful_errors=True"
print value
text = image_file_to_string('fnord.tif', graceful_errors=True)
print "fnord.tif contents:", text
text = image_file_to_string('fonts_test.png', graceful_errors=True)
print text
pytesser_v0.0.1.zip
4星 · 超过85%的资源 需积分: 24 162 浏览量
2015-11-02
20:35:32
上传
评论 3
收藏 2.16MB ZIP 举报
Evankaka
- 粉丝: 7701
- 资源: 129
最新资源
- MyBatis进阶技巧:探索动态SQL的无限可能.md
- HM2300C-VB一款N-Channel沟道SOT23的MOSFET晶体管参数介绍与应用说明
- HM2300B-VB一款N-Channel沟道SOT23的MOSFET晶体管参数介绍与应用说明
- 员工解除劳动合同申请表.pdf
- 物模块模型代码,前往设计物模块所属
- mybatis动态sql(使用<where>标签来处理多个查询条件)
- Java面试手册,助力大家面试过五关斩六将,面试成功
- HITK0303MP-VB一款P-Channel沟道SOT23的MOSFET晶体管参数介绍与应用说明
- mybatis动态sql之xml增删改查批量操作示例EmpMapper.xml
- C/C++内存检测工具Sanitizers
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
评论20