有一个需求需要在内网部署 OCR 服务,多方对比之后,感觉百度飞桨(PaddlePaddle)的 PaddleOCR 各方面都还不错。
https://www.paddlepaddle.org.cn/hubdetail?name=ch_pp-ocrv3&en_category=TextRecognition
我的需求场景是内网的容器化部署,而且构建用的基础镜像也被限定死了,只能使用 CentOS(CentOS 8)系统。
所以尝试从头打包paddleocr的镜像。
先贴配置文件:
Dockerfile
# Image that builds CPython 3.9.6 from source on CentOS 8 and serves
# PaddleOCR (ch_pp-ocrv3) through a Tornado app supervised by supervisord.
# Every archive referenced by COPY/ADD must be present in the build context.
FROM centos:8.3.2011

# Point yum at vault.centos.org instead of the mirror list.
RUN sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-*
RUN sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-*
RUN yum update -y

# Toolchain and headers used to compile Python, plus mesa-libGL and unzip.
RUN yum install gcc make python3-devel openssl-devel mesa-libGL bzip2-devel libffi-devel zlib-devel xz-devel sqlite-devel unzip -y

# Build Python 3.9.6 from the local source tarball.
COPY Python-3.9.6.tgz /Python-3.9.6.tgz
RUN tar -zxvf /Python-3.9.6.tgz
# A standalone `RUN cd ...` does not carry over to the next RUN (each RUN starts
# a fresh shell), so cd, configure and make are chained in a single instruction.
RUN cd /Python-3.9.6 \
    && ./configure --enable-loadable-sqlite-extensions --enable-optimizations \
    && make altinstall
RUN pip3.9 install --upgrade pip

# Unpack the PaddleOCR release-2.7 sources to /PaddleOCR.
COPY PaddleOCR-release-2.7.zip /PaddleOCR-release-2.7.zip
RUN unzip /PaddleOCR-release-2.7.zip -d /
RUN mv /PaddleOCR-release-2.7 /PaddleOCR

# Alternative (disabled): ship the v2.0 mobile inference models directly.
# RUN mkdir /PaddleOCR/inference
# ADD ch_ppocr_mobile_v2.0_cls_infer.tar /PaddleOCR/inference/
# ADD ch_ppocr_mobile_v2.0_det_infer.tar /PaddleOCR/inference/
# ADD ch_ppocr_mobile_v2.0_rec_infer.tar /PaddleOCR/inference/
# RUN mv /PaddleOCR/inference/ch_ppocr_mobile_v2.0_det_infer /PaddleOCR/inference/ch_PP-OCRv3_det_infer
# RUN mv /PaddleOCR/inference/ch_ppocr_mobile_v2.0_rec_infer /PaddleOCR/inference/ch_PP-OCRv3_rec_infer

WORKDIR /PaddleOCR

# Python dependencies.
RUN pip install backports.lzma
RUN pip install PyMuPDF==1.18.0
RUN pip install -r /PaddleOCR/requirements.txt
RUN pip install paddlepaddle
RUN pip install paddlehub

# Alternative (disabled): install the hubserving modules from the PaddleOCR tree.
# RUN hub install deploy/hubserving/ocr_system/
# RUN hub install deploy/hubserving/ocr_cls/
# RUN hub install deploy/hubserving/ocr_det/
# RUN hub install deploy/hubserving/ocr_rec/

# Unpack the PaddleHub module archives into root's PaddleHub module directory.
RUN mkdir -p /root/.paddlehub/modules
ADD ch_pp_ocrv3_1.2.0.zip /tmp/
ADD ch_pp_ocrv3_det_1_1_0.zip /tmp
RUN unzip /tmp/ch_pp_ocrv3_1.2.0.zip -d /root/.paddlehub/modules/
RUN unzip /tmp/ch_pp_ocrv3_det_1_1_0.zip -d /root/.paddlehub/modules/

# Web front end (html/main.py) and a replacement for PaddleHub's app_compat.py.
RUN pip install waitress tornado
ADD html /PaddleOCR/html
ADD html/app_compat.py /usr/local/lib/python3.9/site-packages/paddlehub/serving/app_compat.py

# supervisord (from EPEL) is the container's main process.
RUN yum install epel-release -y
RUN yum install supervisor -y
ADD supervisord.conf /supervisord.conf

EXPOSE 9000

# Alternative entry points (disabled):
# CMD ["/bin/bash","-c","hub serving start --modules ocr_system ocr_cls ocr_det ocr_rec -p 9000"]
# CMD ["/bin/bash","-c","hub serving start --modules ch_pp-ocrv3 -p 9000"]
# CMD ["/bin/bash","-c","python3.9 html/main.py"]
CMD ["supervisord","-c","/supervisord.conf"]
supervisord.conf
; supervisord configuration: runs the Tornado OCR server (html/main.py).
[supervisord]
logfile=/tmp/supervisord.log ; (main log file;default $CWD/supervisord.log)
logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB)
logfile_backups=10 ; (num of main logfile rotation backups;default 10)
loglevel=info ; (log level;default info; others: debug,warn,trace)
pidfile=/var/run/supervisord.pid ; (supervisord pidfile;default supervisord.pid)
; supervisord is started by the image's CMD, so it must stay in the foreground;
; if it daemonizes, the launching process exits and the container stops with it.
nodaemon=true ; (start in foreground if true;default false)
minfds=1024 ; (min. avail startup file descriptors;default 1024)
minprocs=200 ; (min. avail process descriptors;default 200)

[program:paddlehub]
autorestart=True ; restart the program automatically after it exits abnormally
autostart=True ; start the program when supervisord itself starts
process_name=paddlehub
command=python3.9 /PaddleOCR/html/main.py
directory=/PaddleOCR
; NOTE(review): the Dockerfile shown alongside this config never creates an
; `admin` account and unpacks the PaddleHub modules under /root/.paddlehub;
; confirm that this user exists in the image and can find the modules.
user=admin
main.py
# encoding: utf-8
# author:alisen
# time: 2020/4/28 14:54
"""Tornado HTTP front end for the PaddleHub ``ch_pp-ocrv3`` OCR module.

Routes:
    GET  /                     renders /PaddleOCR/html/index.html
    POST /predict/ch_pp-ocrv3  runs OCR on a base64-encoded image
"""
import os
import json
import base64
import logging

from tornado.options import define, options
import tornado.web
import tornado.gen
import tornado.template
import tornado.httpserver
import tornado.ioloop
from paddlehub.serving.model_service.base_model_service import cv_module_info
from paddlehub.serving.model_service.base_model_service import nlp_module_info
from paddlehub.serving.model_service.base_model_service import v2_module_info
from paddlehub.utils import log
import paddlehub as hub
import numpy as np
import cv2

logger = logging.getLogger(__name__)

# Created once at import time, i.e. before the HTTP server forks its workers.
ocr = hub.Module(name="ch_pp-ocrv3", enable_mkldnn=True)


def package_result(status: str, msg: str, data: dict):
    """Package a response message.

    Args:
        status(str): Error code

            ======  ===============================================
            Code    Meaning
            ------  -----------------------------------------------
            '000'   Return results normally
            '101'   An error occurred in the predicting method
            '111'   Module is not available
            '112'   Use outdated and abandoned HTTP protocol format
            ======  ===============================================

        msg(str): Detailed info for error
        data(dict): Result of predict api.

    Returns:
        dict: Message of response

    Examples:
        .. code-block:: python

            data = {'result': 0.002}
            package_result(status='000', msg='', data=data)
    """
    return {"status": status, "msg": msg, "results": data}


class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy scalars and arrays to plain Python values."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of *obj*.

        numpy integers become ``int``, numpy floats become ``float`` and
        arrays become nested lists; anything else is delegated to the base
        class, which raises ``TypeError``.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)


class Index(tornado.web.RequestHandler):
    """Serves the HTML page."""

    @tornado.gen.coroutine
    def get(self, *args, **kwargs):
        """Render the index page from its absolute path."""
        self.render('/PaddleOCR/html/index.html')


class Predict(tornado.web.RequestHandler):
    """HTTP API that runs OCR on one image."""

    def _send_json(self, http_status: int, status: str, msg: str, data):
        """Set the HTTP status and finish the request with a packaged JSON body.

        ``NpEncoder`` is used for every response so that numpy values inside
        the OCR result cannot break serialization.
        """
        self.set_status(http_status)
        body = json.dumps(package_result(status, msg, data),
                          cls=NpEncoder, ensure_ascii=False)
        self.finish(body)

    @tornado.gen.coroutine
    def post(self):
        """Run OCR on the first image of the request.

        The request body must be JSON of the form
        ``{"images": ["<base64-encoded image>"]}``.

        Responses:
            200 / '000': the value returned by ``ocr.recognize_text``.
            400 / '112': the body is not JSON or has no ``images[0]`` entry.
            400 / '101': ``images[0]`` is not a decodable base64 image.
            500 / '101': the OCR module raised while predicting.
        """
        self.set_header('content-type', 'application/json')

        # A malformed body is reported like a missing image instead of
        # surfacing as an unhandled exception.
        try:
            img_b64 = json.loads(self.request.body)['images'][0]
        except (ValueError, KeyError, IndexError, TypeError):
            img_b64 = None
        if img_b64 is None:
            message = "This usage is out of date, please use 'application/json' as content-type to post to /predict"
            self._send_json(400, "112", message, "")
            return

        # binascii.Error (invalid base64) is a ValueError subclass.
        try:
            im_bytes = base64.b64decode(img_b64)
        except (ValueError, TypeError):
            im_bytes = b""

        source = None
        if im_bytes:  # an empty buffer is never handed to OpenCV
            im_arr = np.frombuffer(im_bytes, dtype=np.uint8)
            source = cv2.imdecode(im_arr, flags=cv2.IMREAD_COLOR)
        if source is None:
            # cv2.imdecode yields None when the bytes are not a valid image.
            self._send_json(
                400, "101",
                "Cannot decode 'images[0]' as a base64-encoded image", "")
            return

        # Request boundary: report any prediction failure as status '101'.
        try:
            result = ocr.recognize_text(images=[source])
        except Exception as err:
            logger.exception("OCR prediction failed")
            self._send_json(500, "101", str(err), "")
            return

        self._send_json(200, "000", "", result)
        return


def make_app():
    """Build the Tornado application with the predict and index routes."""
    # from backend.webInterface import tr_run
    return tornado.web.Application([
        (r"/predict/ch_pp-ocrv3", Predict),
        (r"/", Index)
    ])


if __name__ == "__main__":
    define("port", default=9000, type=int, help='指定运行时端口号')
    # The worker count used to be hard-coded; the default keeps the old value.
    define("workers", default=20, type=int,
           help='number of server processes to fork')
    tornado.options.parse_command_line()
    port = options.port

    app = make_app()
    server = tornado.httpserver.HTTPServer(app)
    server.bind(port)
    server.start(options.workers)  # forks the worker processes
    print(f'Server is running: http://0.0.0.0:{port}')
    tornado.ioloop.IOLoop.current().start()
其中遇到不少的坑,好在最终都搞好了。
要用到的一些包:
https://bj.bcebos.com/paddlehub/paddlehub_dev/ch_pp_ocrv3_det_1_1_0.zip
https://bj.bcebos.com/paddlehub/paddlehub_dev/ch_pp_ocrv3_1.2.0.zip
https://www.python.org/ftp/python/3.9.6/Python-3.9.6.tgz
https://codeload.github.com/PaddlePaddle/PaddleOCR/zip/refs/heads/release/2.7
参考的一些资料:
A sample supervisor config file (gist.github.com)