OCR 竟然还要收费

Posted on Sat 17 December 2022 in Journal

Abstract Journal on 2022-12-17
Authors Walter Fan
 Category    learning note  
Status v1.0
Updated 2022-12-17
License CC-BY-NC-ND 4.0

我曾经是 evernote(印象笔记)的重度用户,现在用得少了。主要原因是我建了自己的网站,有了自己的 wordpress 和用 moinmoin 搭的 wiki,没有什么需求非要用印象笔记。尤其它还要收不少钱,连 OCR 都得收钱,还有次数限制,对一个程序员来说,这钱花得不值。

当然,我常常需要把图片上的文本转成文字,例如在网上浏览 ppt, pdf, png 时,发现有不错的内容,我会写个小程序来提取图片中的文本,特别是其中的超文本链接。

1) 先安装相关的依赖

sudo apt install tesseract-ocr
sudo apt install libtesseract-dev
pip install pytesseract opencv-python pillow

2) 相关的 Python 代码很简单

#!/usr/bin/env python3

from PIL import Image
import pytesseract
import cv2
import re
import os
import sys

# Case-insensitive URL matcher: a candidate starts with http(s)://, "www." (optionally
# numbered, e.g. www2.) or a bare "domain.tld/" prefix, may contain parenthesised
# parts nested up to two levels, and must not end in common trailing punctuation.
# NOTE: the pattern has five capture groups; group 1 spans the whole URL, the other
# four only capture parenthesised fragments, so re.findall() yields 5-tuples.
URL_PATTERN = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

def extract_text(image_file):
    """Run Tesseract OCR on an image file and return the recognised text.

    The image is converted to grayscale, written to a temporary PNG next to
    the source file, passed to pytesseract, and the temporary file is removed
    again even when OCR fails.

    Args:
        image_file: Path of the image to read.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        OSError: If OpenCV cannot read ``image_file``.
    """
    img = cv2.imread(image_file)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None, which
        # would otherwise surface as an opaque error inside cvtColor.
        raise OSError("cannot read image file: {}".format(image_file))

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # The process id keeps the temporary name distinct between processes.
    gray_image = "{}_gray_{}.png".format(image_file, os.getpid())

    try:
        cv2.imwrite(gray_image, gray)
        print("begin to extract urls from {}".format(gray_image))
        # Close the PIL handle before the file is deleted.
        with Image.open(gray_image) as gray_img:
            return pytesseract.image_to_string(gray_img)
    finally:
        # Clean up even if writing or OCR raised; the file may not exist if
        # the write itself failed.
        if os.path.exists(gray_image):
            os.remove(gray_image)

def extract_urls(image_file):
    """Extract the URLs found in the OCR text of an image.

    Each URL is printed to stdout and the full list is returned.

    Args:
        image_file: Path of the image to read.

    Returns:
        A list of URL strings in the order they appear in the text.
    """
    text = extract_text(image_file)

    # URL_PATTERN contains nested capture groups, so joining every group of a
    # match would append the inner parenthesised fragments to the URL a second
    # time. Group 1 alone is the complete URL.
    links = [match.group(1) for match in re.finditer(URL_PATTERN, text)]
    for link in links:
        print(link)
    return links


if __name__ == '__main__':
    # Use the first command-line argument as the image path; fall back to
    # "test.png" when none is given.
    cli_args = sys.argv[1:]
    image_file = cli_args[0] if cli_args else "test.png"

    extract_urls(image_file)

3) Web 版本的 OCR

基于这个小程序,我又做了一些扩展,通过 Web 页面来抽取文本和超链接

web gui

  • views.py
from flask import render_template, redirect, url_for, flash, request, current_app
from portal.test import test_module
from portal.test.forms import OcrForm
from portal.test.ocr import *

from portal import logger
from portal import db
import os


# Absolute directory of this module (symlinks resolved).
dir_path = os.path.dirname(os.path.realpath(__file__))

# Lower-case file extensions that allowed_file() accepts.
ALLOWED_EXTENSIONS = {'gif', 'jpg', 'png', 'bmp'}


def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively;
    a name without any dot is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[-1]
    return extension.lower() in ALLOWED_EXTENSIONS


@test_module.route('/test', methods=['GET', 'POST'])
def index():
    """Render the OCR test page and run the selected OCR command on submit.

    Command 1 puts the extracted URLs (one per line) into the output box and
    also hands them to the template as ``links``; command 2 puts the raw OCR
    text into the output box. When a submitted form fails validation, its
    errors are logged.
    """
    form = OcrForm()
    found_links = []

    if form.validate_on_submit():
        command = form.ocr_command.data
        logger.info("submit: {}".format(command))

        # OCR always runs on this fixed image, located relative to the module.
        source_image = dir_path + "/../static/image/reference.png"
        if command == 1:
            found_links = extract_urls(source_image)
            form.output_content.data = "\n".join(found_links)
        elif command == 2:
            form.output_content.data = extract_text(source_image)
    elif form.is_submitted():
        logger.error(form.errors)

    return render_template('test.html', form=form, links=found_links)
  • forms.py
from flask_wtf import FlaskForm
from flask_wtf.file import FileAllowed
from wtforms import StringField, SubmitField, TextAreaField, PasswordField
from wtforms import BooleanField, SelectField, FileField
from wtforms import HiddenField
from wtforms.validators import DataRequired, Length, Optional
import datetime



class OcrForm(FlaskForm):
    """Form for the OCR test page.

    Holds an input and an output text area, a selector for the OCR command
    (1 = extract URLs, 2 = extract text), a free-form parameter field and the
    submit button.
    """

    # Optional free-text input area.
    input_content = TextAreaField(
        'input',
        validators=[Optional()],
        render_kw={"class": "form-control", "rows": 5},
    )

    # Optional text area for output.
    output_content = TextAreaField(
        'output',
        validators=[Optional()],
        render_kw={"class": "form-control", "rows": 5, "cols": 60},
    )

    # Submitted value is coerced to int.
    ocr_command = SelectField(
        'test_command',
        choices=[
            (1, 'Extract URLs'),
            (2, 'Extract Text'),
        ],
        render_kw={"class": "form-control"},
        coerce=int,
    )

    # Optional extra parameters as a single string.
    ocr_params = StringField(
        'test_params',
        validators=[Optional()],
        render_kw={"class": "form-control"},
    )

    submit_button = SubmitField(
        'Submit',
        render_kw={"class": "btn btn-primary"},
    )
  • ocr.py
#!/usr/bin/env python3

from PIL import Image
import pytesseract
import cv2
import re
import os
import sys



# Case-insensitive URL matcher: a candidate starts with http(s)://, "www." (optionally
# numbered, e.g. www2.) or a bare "domain.tld/" prefix, may contain parenthesised
# parts nested up to two levels, and must not end in common trailing punctuation.
# NOTE: the pattern has five capture groups; group 1 spans the whole URL, the other
# four only capture parenthesised fragments, so re.findall() yields 5-tuples.
URL_PATTERN = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"


def extract_text(image_file):
    """Run Tesseract OCR on an image file and return the recognised text.

    The image is converted to grayscale, written to a temporary PNG next to
    the source file, passed to pytesseract, and the temporary file is removed
    again even when OCR fails.

    Args:
        image_file: Path of the image to read.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        OSError: If OpenCV cannot read ``image_file``.
    """
    img = cv2.imread(image_file)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None, which
        # would otherwise surface as an opaque error inside cvtColor.
        raise OSError("cannot read image file: {}".format(image_file))

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # The process id keeps the temporary name distinct between processes.
    gray_image = "{}_gray_{}.png".format(image_file, os.getpid())

    try:
        cv2.imwrite(gray_image, gray)
        print("begin to extract urls from {}".format(gray_image))
        # Close the PIL handle before the file is deleted.
        with Image.open(gray_image) as gray_img:
            return pytesseract.image_to_string(gray_img)
    finally:
        # Clean up even if writing or OCR raised; the file may not exist if
        # the write itself failed.
        if os.path.exists(gray_image):
            os.remove(gray_image)

def extract_urls(image_file):
    """Extract the URLs found in the OCR text of an image.

    Each URL is printed to stdout and the full list is returned.

    Args:
        image_file: Path of the image to read.

    Returns:
        A list of URL strings in the order they appear in the text.
    """
    text = extract_text(image_file)

    # URL_PATTERN contains nested capture groups, so joining every group of a
    # match would append the inner parenthesised fragments to the URL a second
    # time. Group 1 alone is the complete URL.
    links = [match.group(1) for match in re.finditer(URL_PATTERN, text)]
    for link in links:
        print(link)
    return links


if __name__ == '__main__':
    # Use the first command-line argument as the image path; fall back to
    # "test.png" when none is given.
    cli_args = sys.argv[1:]
    image_file = cli_args[0] if cli_args else "test.png"

    extract_urls(image_file)

完整代码参见 https://github.com/walterfan/webocr/tree/master

参考资料

  • OCR 开源项目 https://github.com/tesseract-ocr/tesseract
  • https://nanonets.com/blog/ocr-with-tesseract/


本作品采用知识共享署名-非商业性使用-禁止演绎 4.0 国际许可协议进行许可。