从头构建PaddleOCR的docker镜像

因为有在内网部署 OCR 服务的需求，经过多方对比，感觉百度飞桨（PaddlePaddle）的 PaddleOCR 在各方面都还不错。

https://www.paddlepaddle.org.cn/hubdetail?name=ch_pp-ocrv3&en_category=TextRecognition

我的需求场景是内网的容器化部署，而且基础镜像也被限定为只能使用 CentOS（CentOS 8）系统。

所以尝试从头打包paddleocr的镜像。

先贴配置文件：

Dockerfile

# Base image: CentOS 8.3.2011.
FROM centos:8.3.2011


# Rewrite the yum repo files: comment out the mirrorlist entries and enable the
# baseurl entries, pointing them at vault.centos.org instead of mirror.centos.org.
RUN sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-*
RUN sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-*

# Update installed packages, then install the compiler toolchain and the -devel
# headers needed by the Python source build that follows; unzip is used by later
# steps. mesa-libGL is presumably required by OpenCV at runtime - TODO confirm.
RUN yum update -y
RUN yum install gcc make python3-devel openssl-devel mesa-libGL bzip2-devel libffi-devel zlib-devel xz-devel sqlite-devel unzip -y


# Build CPython 3.9.6 from the source tarball supplied in the build context.
COPY Python-3.9.6.tgz /Python-3.9.6.tgz
# Extract, configure, build and install inside one RUN. Every RUN starts a new
# shell, so a standalone `RUN cd ...` does not change the directory of later
# steps; chaining with && keeps the whole build inside /Python-3.9.6.
# `make altinstall` installs the versioned python3.9 / pip3.9 executables.
RUN tar -zxvf /Python-3.9.6.tgz -C / \
    && cd /Python-3.9.6 \
    && ./configure --enable-loadable-sqlite-extensions --enable-optimizations \
    && make altinstall

# Upgrade the pip that belongs to the freshly built interpreter.
RUN pip3.9 install --upgrade pip

# Copy the PaddleOCR release-2.7 source archive from the build context, unzip it
# under / and rename the extracted directory to /PaddleOCR.
COPY PaddleOCR-release-2.7.zip /PaddleOCR-release-2.7.zip
RUN unzip /PaddleOCR-release-2.7.zip -d /
RUN mv /PaddleOCR-release-2.7 /PaddleOCR
# Disabled alternative: place the ch_ppocr_mobile_v2.0 inference model archives
# under /PaddleOCR/inference and rename them to the ch_PP-OCRv3 directory names.
# RUN mkdir /PaddleOCR/inference
# ADD ch_ppocr_mobile_v2.0_cls_infer.tar /PaddleOCR/inference/
# ADD ch_ppocr_mobile_v2.0_det_infer.tar /PaddleOCR/inference/
# ADD ch_ppocr_mobile_v2.0_rec_infer.tar /PaddleOCR/inference/
# RUN mv /PaddleOCR/inference/ch_ppocr_mobile_v2.0_det_infer /PaddleOCR/inference/ch_PP-OCRv3_det_infer
# RUN mv /PaddleOCR/inference/ch_ppocr_mobile_v2.0_rec_infer /PaddleOCR/inference/ch_PP-OCRv3_rec_infer

# All following steps, and the container's default working directory, use /PaddleOCR.
WORKDIR /PaddleOCR

# Install backports.lzma and a pinned PyMuPDF before the requirements file.
# NOTE(review): presumably pinned so requirements.txt does not pull a newer
# PyMuPDF - confirm the reason for 1.18.0.
RUN pip install backports.lzma
RUN pip install PyMuPDF==1.18.0


# Install PaddleOCR's own requirements, then PaddlePaddle and PaddleHub.
RUN pip install -r /PaddleOCR/requirements.txt
RUN pip install paddlepaddle
RUN pip install paddlehub

# Disabled alternative: install the hubserving modules shipped with PaddleOCR.
# RUN hub install deploy/hubserving/ocr_system/
# RUN hub install deploy/hubserving/ocr_cls/
# RUN hub install deploy/hubserving/ocr_det/
# RUN hub install deploy/hubserving/ocr_rec/

# Install the ch_pp_ocrv3 module archives from the build context by unzipping
# them into /root/.paddlehub/modules (presumably the directory PaddleHub searches
# for locally installed modules - TODO confirm). ADD does not unpack .zip files,
# so they are extracted explicitly with unzip.
RUN mkdir -p /root/.paddlehub/modules
ADD ch_pp_ocrv3_1.2.0.zip /tmp/
ADD ch_pp_ocrv3_det_1_1_0.zip /tmp
RUN unzip /tmp/ch_pp_ocrv3_1.2.0.zip -d /root/.paddlehub/modules/
RUN unzip /tmp/ch_pp_ocrv3_det_1_1_0.zip -d /root/.paddlehub/modules/


# Install waitress and tornado; tornado is the HTTP framework imported by html/main.py.
RUN pip install waitress tornado
# Copy the html directory from the build context to /PaddleOCR/html.
ADD html /PaddleOCR/html
# Place html/app_compat.py inside the installed paddlehub package, replacing any
# file already present at that path.
ADD html/app_compat.py /usr/local/lib/python3.9/site-packages/paddlehub/serving/app_compat.py

# supervisor is installed from the EPEL repository; its config is copied to /supervisord.conf.
RUN yum install epel-release -y
RUN yum install supervisor -y
ADD supervisord.conf /supervisord.conf


# Port 9000 is the default port defined in html/main.py.
EXPOSE 9000

# Earlier start-up variants, kept for reference.
# CMD ["/bin/bash","-c","hub serving start --modules ocr_system ocr_cls ocr_det ocr_rec -p 9000"]
# CMD ["/bin/bash","-c","hub serving start --modules ch_pp-ocrv3 -p 9000"]
# CMD ["/bin/bash","-c","python3.9 html/main.py"]
# Start supervisord with /supervisord.conf as the container's main process.
# NOTE(review): the container only keeps running while this process stays in the
# foreground - check the nodaemon setting in supervisord.conf.
CMD ["supervisord","-c","/supervisord.conf"]

supervisord.conf

[supervisord]
logfile=/tmp/supervisord.log ; (main log file;default $CWD/supervisord.log)
logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB)
logfile_backups=10 ; (num of main logfile rotation backups;default 10)
loglevel=info ; (log level;default info; others: debug,warn,trace)
pidfile=/var/run/supervisord.pid ; (supervisord pidfile;default supervisord.pid)
; supervisord is started as the container's CMD, so it must stay in the
; foreground: if it daemonizes, the container's main process exits and the
; container stops.
nodaemon=true ; (start in foreground if true;default false)
minfds=1024 ; (min. avail startup file descriptors;default 1024)
minprocs=200 ; (min. avail process descriptors;default 200)

; Runs the Tornado OCR service (html/main.py) under supervisord.
[program:paddlehub]
autorestart=True      ; restart the program automatically when it exits
autostart=True        ; start the program automatically when supervisord starts
process_name=paddlehub
command=python3.9 /PaddleOCR/html/main.py
directory=/PaddleOCR
; NOTE(review): the Dockerfile in this article never creates an "admin" account
; and unpacks the PaddleHub modules under /root/.paddlehub - confirm that this
; user exists in the image and can read the modules.
user=admin

main.py

# encoding: utf-8
# author:alisen
# time: 2020/4/28 14:54
import os
import json
import base64
from tornado.options import define, options
import tornado.web
import tornado.gen
import tornado.template
import tornado.httpserver
import tornado.ioloop

from paddlehub.serving.model_service.base_model_service import cv_module_info
from paddlehub.serving.model_service.base_model_service import nlp_module_info
from paddlehub.serving.model_service.base_model_service import v2_module_info

from paddlehub.utils import log

import paddlehub as hub
import numpy as np
import cv2

# Load the PaddleHub "ch_pp-ocrv3" module once, at import time, with
# enable_mkldnn=True; the Predict handler calls its recognize_text() method.
ocr = hub.Module(name="ch_pp-ocrv3", enable_mkldnn=True)  

def package_result(status: str, msg: str, data: dict):
    '''
    Wrap the outcome of a request in the response envelope.

    Args:
        status(str): Status code of the response.
            '000'  Return results normally
            '101'  An error occurred in the predicting method
            '111'  Module is not available
            '112'  Use outdated and abandoned HTTP protocol format
        msg(str): Detailed info for an error; empty on success.
        data(dict): Result of the predict api, stored unchanged.

    Returns:
        dict: Envelope with the keys "status", "msg" and "results", in that order.

    Examples:
        .. code-block:: python

            package_result(status='000', msg='', data={'result': 0.002})
    '''
    envelope = dict(status=status, msg=msg, results=data)
    return envelope

class NpEncoder(json.JSONEncoder):
    '''JSON encoder that turns NumPy scalars and arrays into plain Python values.'''

    def default(self, obj):
        '''
        Convert a value the standard encoder cannot serialize.

        NumPy arrays become (nested) lists, NumPy integers become int and NumPy
        floating-point scalars become float. Anything else is passed to the base
        class, which raises TypeError.
        '''
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

class Index(tornado.web.RequestHandler):
    '''Request handler that serves the HTML page of the service.'''

    @tornado.gen.coroutine
    def get(self, *args, **kwargs):
        '''Render /PaddleOCR/html/index.html as the response body.'''
        page = '/PaddleOCR/html/index.html'
        self.render(page)
        

class Predict(tornado.web.RequestHandler):
    '''
    Request handler that runs OCR on one base64-encoded image.

    The request body must be JSON of the form ``{"images": ["<base64>", ...]}``;
    only the first entry of ``images`` is recognized.
    '''

    def _reject(self, status, message, http_status=400):
        '''Finish the request with a packaged error and the given HTTP status.'''
        self.set_status(http_status)
        self.finish(json.dumps(package_result(status, message, ""), cls=NpEncoder))

    @staticmethod
    def _decode_image(img_b64):
        '''Return the decoded image array, or None if the payload is not a decodable image.'''
        try:
            im_bytes = base64.b64decode(img_b64)
        except (ValueError, TypeError):
            # Invalid base64 text, or a value that is not str/bytes at all.
            return None
        im_arr = np.frombuffer(im_bytes, dtype=np.uint8)
        if im_arr.size == 0:
            # cv2.imdecode raises on an empty buffer; treat it as "no image".
            return None
        try:
            # imdecode returns None when the bytes are not a supported image.
            return cv2.imdecode(im_arr, flags=cv2.IMREAD_COLOR)
        except cv2.error:
            return None

    @tornado.gen.coroutine
    def post(self):
        '''
        Http api for predicting.

        Returns:
            Nothing; the packaged result is written to the response as JSON.
            Status '000' with HTTP 200 on success, '112' with HTTP 400 when the
            body is not the expected JSON document, '101' with HTTP 400 when the
            image cannot be decoded, and '101' with HTTP 500 when recognition fails.
        '''
        import logging

        self.set_header('content-type', 'application/json')

        try:
            img_b64 = json.loads(self.request.body)['images'][0]
        except (ValueError, KeyError, IndexError, TypeError):
            # Body is not JSON, or it has no usable "images" list.
            img_b64 = None

        if img_b64 is None:
            message = "This usage is out of date, please use 'application/json' as content-type to post to /predict"
            self._reject("112", message)
            return

        source = self._decode_image(img_b64)
        if source is None:
            self._reject("101", "The first entry of 'images' is not a valid base64-encoded image")
            return

        try:
            result = ocr.recognize_text(images=[source])
        except Exception as err:  # request boundary: report as JSON instead of a bare 500 page
            logging.getLogger(__name__).exception("OCR prediction failed")
            self._reject("101", str(err), http_status=500)
            return

        self.set_status(200)
        # NpEncoder keeps the response serializable if the result holds NumPy values.
        self.finish(json.dumps(package_result("000", "", result), cls=NpEncoder, ensure_ascii=False))
        return

def make_app():
    '''Create the tornado application and register its two URL routes.'''
    routes = [
        (r"/predict/ch_pp-ocrv3", Predict),
        (r"/", Index),
    ]
    return tornado.web.Application(routes)


if __name__ == "__main__":
    # Script entry point: parse the command line, bind the port and fork workers.
    define("port", default=9000, type=int, help='指定运行时端口号')
    # The worker count used to be hard-coded; the default keeps the previous value.
    define("processes", default=20, type=int, help='指定工作进程数（小于等于 0 时按 CPU 核数）')
    tornado.options.parse_command_line()
    port = options.port

    app = make_app()

    server = tornado.httpserver.HTTPServer(app)
    server.bind(port)
    # Announce once, before forking: start() returns in every worker process, so
    # printing after it repeats the line per worker. flush=True empties the
    # stdout buffer so it is not duplicated into the forked workers.
    print(f'Server is running: http://0.0.0.0:{port}', flush=True)
    server.start(options.processes)
    tornado.ioloop.IOLoop.current().start()

其中遇到不少的坑，好在最终都搞好了。

要用到的一些包：

https://bj.bcebos.com/paddlehub/paddlehub_dev/ch_pp_ocrv3_det_1_1_0.zip

https://bj.bcebos.com/paddlehub/paddlehub_dev/ch_pp_ocrv3_1.2.0.zip

https://www.python.org/ftp/python/3.9.6/Python-3.9.6.tgz

https://codeload.github.com/PaddlePaddle/PaddleOCR/zip/refs/heads/release/2.7

参考的一些资料：

A sample supervisor config file（gist.github.com）

https://stackoverflow.com/questions/70963985/error-failed-to-download-metadata-for-repo-appstream-cannot-prepare-internal