OCR 竟然还要收费
Posted on Sat 17 December 2022 in Journal
Abstract | Journal on 2022-12-17 |
---|---|
Authors | Walter Fan |
Category | learning note |
Status | v1.0 |
Updated | 2022-12-17 |
License | CC-BY-NC-ND 4.0 |
我曾经是 evernote(印象笔记)的重度用户,现在用得少了。主要原因是我建了自己的网站,有了自己的 wordpress 和自己用 moinmoin 搭的 wiki, 没有什么需求非要用印象笔记不可;何况它还要收不少钱,连 OCR 都要收费,而且有次数限制,对一个程序员来说,这钱花得不值。
当然,我常常需要把图片上的文本转成文字,例如在网上浏览 ppt, pdf, png 时,发现有不错的内容,我会写个小程序来提取图片中的文本,特别是其中的超文本链接。
1) 先安装相关的依赖
sudo apt install tesseract-ocr
sudo apt install libtesseract-dev
pip install pytesseract opencv-python
2) 相关的 Python 代码很简单
#!/usr/bin/env python3
from PIL import Image
import pytesseract
import cv2
import re
import os
import sys
# Case-insensitive pattern for URLs embedded in free text. It accepts
# "http://" / "https://", "www." and bare "domain.tld/" prefixes, and allows
# parentheses (nested up to two levels) inside the URL.
# Capture group 1 is the complete URL; groups 2-5 only hold parenthesised
# fragments, so re.findall() returns 5-tuples for this pattern.
URL_PATTERN = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
def extract_text(image_file):
    """Run Tesseract OCR on an image and return the recognised text.

    The image is loaded with OpenCV, converted to grayscale and handed to
    pytesseract in memory, so nothing is written next to the input file.

    Args:
        image_file: Path of the image to read.

    Returns:
        The text produced by ``pytesseract.image_to_string``.

    Raises:
        OSError: If OpenCV cannot load ``image_file``.
    """
    img = cv2.imread(image_file)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than inside cvtColor.
        raise OSError("cannot read image: {}".format(image_file))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    print("begin to extract urls from {}".format(image_file))
    # Wrap the grayscale array as a PIL image directly: no temporary file to
    # name, write, or clean up, and no need for a writable image directory.
    return pytesseract.image_to_string(Image.fromarray(gray))
def extract_urls(image_file):
    """Return every URL that OCR finds in an image.

    Each URL is also printed, which is the output of the command-line
    entry point.

    Args:
        image_file: Path of the image to scan.

    Returns:
        A list of URL strings in the order they occur in the OCR text.
    """
    text = extract_text(image_file)
    links = []
    # URL_PATTERN has nested capture groups; joining all of them would append
    # the parenthesised fragments to the URL again. Group 1 alone is the URL.
    for match in re.finditer(URL_PATTERN, text):
        url = match.group(1)
        print(url)
        links.append(url)
    return links
if __name__ == '__main__':
    # Command-line entry point: scan the image named by the first argument,
    # or "test.png" when no argument is given.
    cli_args = sys.argv[1:]
    image_file = cli_args[0] if cli_args else "test.png"
    extract_urls(image_file)
3) Web 版本的 OCR
基于这个小程序,我又做了一些扩展,通过 Web 页面来抽取文本和超链接
- views.py
from flask import render_template, redirect, url_for, flash, request, current_app
from portal.test import test_module
from portal.test.forms import OcrForm
from portal.test.ocr import *
from portal import logger
from portal import db
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
# Lower-case file extensions (without the dot) accepted as images.
ALLOWED_EXTENSIONS = {'gif', 'jpg', 'png', 'bmp'}


def allowed_file(filename):
    """Tell whether ``filename`` ends in one of ``ALLOWED_EXTENSIONS``.

    The text after the last dot is compared case-insensitively; a name
    without any dot is rejected.

    Args:
        filename: File name to check.

    Returns:
        True if the extension is allowed, otherwise False.
    """
    _, dot, extension = filename.rpartition('.')
    return dot == '.' and extension.lower() in ALLOWED_EXTENSIONS
@test_module.route('/test', methods=['GET', 'POST'])
def index():
    """Serve the OCR test page and run the selected OCR command on submit.

    On a valid submit the bundled reference image is processed: command 1
    fills the output box with the extracted URLs (one per line), command 2
    with the raw recognised text. A submit that fails validation only has
    its form errors logged.

    Returns:
        The rendered ``test.html`` template with the form and the list of
        extracted links (empty unless command 1 ran).
    """
    form = OcrForm()
    links = []
    if form.validate_on_submit():
        command = form.ocr_command.data
        logger.info("submit: {}".format(command))
        # OCR always runs on the fixed reference image under static/.
        image_file = dir_path + "/../static/image/reference.png"
        if command == 1:
            links = extract_urls(image_file)
            form.output_content.data = "\n".join(links)
        elif command == 2:
            form.output_content.data = extract_text(image_file)
    elif form.is_submitted():
        # Submitted but invalid: record why validation failed.
        logger.error(form.errors)
    return render_template('test.html', form=form, links=links)
- forms.py
from flask_wtf import FlaskForm
from flask_wtf.file import FileAllowed
from wtforms import StringField, SubmitField, TextAreaField, PasswordField
from wtforms import BooleanField, SelectField, FileField
from wtforms import HiddenField
from wtforms.validators import DataRequired, Length, Optional
import datetime
class OcrForm(FlaskForm):
    """Form for the OCR test page.

    Holds optional input/output text areas, a selector for the OCR command,
    an optional parameter string and the submit button.
    """

    # Optional free-text input area.
    input_content = TextAreaField(
        'input',
        validators=[Optional()],
        render_kw={"class": "form-control", "rows": 5},
    )

    # Optional text area that receives the OCR result.
    output_content = TextAreaField(
        'output',
        validators=[Optional()],
        render_kw={"class": "form-control", "rows": 5, "cols": 60},
    )

    # Command selector; coerce=int converts the submitted choice to an int.
    ocr_command = SelectField(
        'test_command',
        choices=[
            (1, 'Extract URLs'),
            (2, 'Extract Text'),
        ],
        render_kw={"class": "form-control"},
        coerce=int,
    )

    # Optional extra parameters for the command.
    ocr_params = StringField(
        'test_params',
        validators=[Optional()],
        render_kw={"class": "form-control"},
    )

    submit_button = SubmitField(
        'Submit',
        render_kw={"class": "btn btn-primary"},
    )
- ocr.py
#!/usr/bin/env python3
from PIL import Image
import pytesseract
import cv2
import re
import os
import sys
# Case-insensitive pattern for URLs embedded in free text. It accepts
# "http://" / "https://", "www." and bare "domain.tld/" prefixes, and allows
# parentheses (nested up to two levels) inside the URL.
# Capture group 1 is the complete URL; groups 2-5 only hold parenthesised
# fragments, so re.findall() returns 5-tuples for this pattern.
URL_PATTERN = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
def extract_text(image_file):
    """Run Tesseract OCR on an image and return the recognised text.

    The image is loaded with OpenCV, converted to grayscale and handed to
    pytesseract in memory, so nothing is written next to the input file.

    Args:
        image_file: Path of the image to read.

    Returns:
        The text produced by ``pytesseract.image_to_string``.

    Raises:
        OSError: If OpenCV cannot load ``image_file``.
    """
    img = cv2.imread(image_file)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than inside cvtColor.
        raise OSError("cannot read image: {}".format(image_file))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    print("begin to extract urls from {}".format(image_file))
    # Wrap the grayscale array as a PIL image directly: no temporary file to
    # name, write, or clean up, and no need for a writable image directory.
    return pytesseract.image_to_string(Image.fromarray(gray))
def extract_urls(image_file):
    """Return every URL that OCR finds in an image.

    Each URL is also printed, which is the output of the command-line
    entry point.

    Args:
        image_file: Path of the image to scan.

    Returns:
        A list of URL strings in the order they occur in the OCR text.
    """
    text = extract_text(image_file)
    links = []
    # URL_PATTERN has nested capture groups; joining all of them would append
    # the parenthesised fragments to the URL again. Group 1 alone is the URL.
    for match in re.finditer(URL_PATTERN, text):
        url = match.group(1)
        print(url)
        links.append(url)
    return links
if __name__ == '__main__':
    # Command-line entry point: scan the image named by the first argument,
    # or "test.png" when no argument is given.
    cli_args = sys.argv[1:]
    image_file = cli_args[0] if cli_args else "test.png"
    extract_urls(image_file)
完整代码参见 https://github.com/walterfan/webocr/tree/master
参考资料
- OCR 开源项目 https://github.com/tesseract-ocr/tesseract
- https://nanonets.com/blog/ocr-with-tesseract/
本作品采用知识共享署名-非商业性使用-禁止演绎 4.0 国际许可协议进行许可。