import datetime
import hashlib
import json
import logging
import os
import random
import re
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from glob import glob
from multiprocessing import Process, Queue
from pprint import pprint
from time import sleep

import requests
from bottle import redirect
from bottle import request
from bottle import route, run
from bottle import static_file
from bottle import error
from func_timeout import func_set_timeout
from ufile import config as ufile_config
from ufile import logger as img_log
from ufile import filemanager

import config
from parse import Mathtest
from utils import get_dir_next_num

# ---------------------------------------------------------------------------
# Logging: everything at INFO and above is appended to ../logs/parse_log.txt.
# ---------------------------------------------------------------------------
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
log_dir = '../logs'
try:
    # exist_ok avoids the check-then-create race when several processes
    # import this module at the same time.
    os.makedirs(log_dir, exist_ok=True)
except OSError:
    # Best effort, as before: a missing log directory must not stop the
    # service from starting.
    pass
log_file = os.path.join(log_dir, 'parse_log.txt')
# log_file = os.path.join(log_dir, 'parse_time.txt')
handler = logging.FileHandler(log_file, mode='a', encoding='utf-8', delay=True)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.info("Start print log")

# Response convention used by the handlers in this file:
#   errcode=0 -> success
#   errcode=1 -> failure, errmsg carries the concrete failure message

# Directory that receives uploaded documents (one numbered sub-directory each).
save_path = '../upload'
os.makedirs(save_path, exist_ok=True)

# TODO: matches local image references so they can be rewritten to the local
# static route (used when flag=0).
src_pat = re.compile(r'src\s*=\s*"files')

# NOTE(review): credentials are hard-coded in source control; consider loading
# them from the environment or a secrets store and rotating these values.
public_key = 'ucloudyunkaopei@outlook.com13615403931104805307'
private_key = 'bcfd5bb66ca527c9be9fd7f3e784fbfc90c4bba5'

image_upload_log = '../logs/image_log.txt'  # image upload log
img_log.set_log_file(image_upload_log)

addr = '.cn-bj.ufileos.com'  # host suffix appended to the bucket name
ufile_config.set_default(uploadsuffix=addr)

# TODO: the production bucket is 'zxhx'
# public_bucket = 'zxhx'  # public bucket name
# TODO: the online test bucket is 'zxhx-test'
public_bucket = 'zxhx-test'
# private_bucket = ''  # private bucket name


@route('/')
def index():
    """Redirect the site root to the upload form."""
    return redirect("/upload")
# @route('/hello')
# def hello():
#     return "hello"


# User-facing message returned whenever a Word document cannot be read.
_WORD_READ_ERRMSG = (
    "word读取失败。请尝试:\n 1.请将word中图片改为嵌入式,然后上传。\n 2. "
    "请将word题目拆分为多个word上传。\n 3.请将word中内容复制到新word上传。"
)

# Lazily created pool shared by every step() call.
_step_executor = None


# HTML template for the upload page; kept inline for convenience.
@route('/upload')
def upload():
    """Serve the manual upload form.

    NOTE(review): the template markup was missing from the source as
    received. This minimal form is a reconstruction that only exposes the
    fields do_upload() reads (mydata, callback_url, flag, sid); replace it
    with the original markup if that is available.
    """
    return '''
<form action="/upload" method="post" enctype="multipart/form-data">
    <input type="file" name="mydata" />
    <input type="text" name="callback_url" placeholder="callback_url" />
    <input type="text" name="flag" placeholder="flag" />
    <input type="text" name="sid" placeholder="sid" />
    <input type="submit" value="Upload" />
</form>
'''


def call_c_shape(doc_file):
    """Ask the local word2html service (port 9001) to convert ``doc_file``.

    Returns the ``requests`` response; raises whatever ``requests.get``
    raises, including a timeout after ``config.call_c_shape_time_out``.
    """
    r = requests.get(r"http://localhost:9001/word/?name={}".format(doc_file),
                     timeout=config.call_c_shape_time_out)
    return r


def start_word2html_app(kill_mathtype=True):
    """Kill Word / the converter (optionally MathType) and restart the converter."""
    if kill_mathtype:
        os.system("taskkill /f /im MathType.exe")
    os.system("taskkill /f /im WINWORD.EXE")
    os.system("taskkill /f /im ConsoleApplication1.exe")
    os.system("start {}".format(config.word2html_exe))  # "start" opens a new window


def check_pid():
    """Restart the converter if a WINWORD.EXE process is still alive."""
    command = 'tasklist /fi "imagename eq WINWORD.EXE"'
    with os.popen(command) as pipe:  # close the pipe instead of leaking it
        info = pipe.read()
    # tasklist lists the image name only when a matching process exists, so
    # this works regardless of the console language (the previous check
    # compared against one localized "no tasks" message). Empty output means
    # tasklist itself failed; stay conservative and restart, as before.
    still_running = not info.strip() or "WINWORD.EXE" in info.upper()
    if still_running:
        print("++++再补一刀++++")
        start_word2html_app()
    else:
        print("-----office is killed------")


@func_set_timeout(90)
def parse_word(doc_file, filename_root, flag=0, sid=0):
    """Convert a Word file to HTML and parse it into the JSON result.

    Args:
        doc_file: absolute path of the uploaded .doc/.docx file.
        filename_root: directory holding the file; only used when flag is 1.
        flag: "1"/1 uploads the images to UCloud and rewrites their URLs;
            anything else points the images at the local /static route.
        sid: identifier forwarded to upload_replace_image() when flag is 1.

    Returns:
        ``Mathtest(html).tojson()`` on success, otherwise a dict with
        ``errcode`` 1 and an ``errmsg``.

    The decorator aborts the call after 90 seconds.
    """
    time4 = time.time()
    try:
        r = call_c_shape(doc_file)
    except Exception:
        # Not a bare ``except``: this body runs under func_timeout, and a
        # bare clause could also swallow its BaseException-based abort.
        logger.exception("word2html request failed for %s", doc_file)
        r = None
        start_word2html_app()
        print("+++++++++++++++time out of read word++++++++++++++++++")
    time44 = time.time() - time4
    logger.info("---word2html_time---==>" + str(time44))

    if r is None or str(r.text) == "4":
        # Make sure Office really died; kill it again if not.
        check_pid()
        return {"errcode": 1, "errmsg": _WORD_READ_ERRMSG}
    print("+++++++word to html, r={}, r.text={}++++++++".format(r, r.text))

    clean_html_file = os.path.splitext(doc_file)[0] + "_clean.html"
    if not os.path.isfile(clean_html_file):
        check_pid()
        return {"errcode": 1, "errmsg": _WORD_READ_ERRMSG}
    with open(clean_html_file, "r", encoding="utf-8") as f:
        html = f.read()

    # The upload directory name is the numeric id used by the static route.
    num = int(os.path.split(os.path.split(doc_file)[0])[1])
    ip = config.external_ip

    # Online image URL layout:
    # http://<bucket><addr>/teacher/uploadfiles/wording/<sid>/<Y>/<m>/<d>/<md5>.<ext>
    if str(flag) == "1":
        print("+++++++++开始上传图片到Ucloud,并替换成线上地址++++++++++")
        data = upload_replace_image(filename_root, sid, html)
        if isinstance(data, dict):  # error dict
            return data
        html, put_key_list, localfile_list = data
        if localfile_list:
            time3 = time.time()
            with ProcessPoolExecutor(5) as pool:
                futures = [pool.submit(upload_img_to_ucloud, pair)
                           for pair in zip(put_key_list, localfile_list)]
            # Previously the map() results were never consumed, so failed
            # uploads vanished silently; at least record them.
            for put_key, future in zip(put_key_list, futures):
                failure = future.exception()
                if failure is not None:
                    logger.error("image upload failed for %s: %r", put_key, failure)
            time33 = time.time() - time3
            logger.info("---img_upload_time==>:{}".format(str(time33)))
    else:
        html = src_pat.sub(
            r'src="http://{}:{}/{}/{}/files'.format(ip, config.file_port, "static", num),
            html)
    m = Mathtest(html)
    return m.tojson()


def get_md5(image):
    """Return a randomized md5-based file name that keeps ``image``'s extension.

    The digest covers the name stem plus the current time and a random
    number. Names with no dot or with several dots used to raise ValueError
    from ``split(".")``; ``os.path.splitext`` handles both.
    """
    stem, extension = os.path.splitext(str(image))
    seed = stem + str(time.time()) + str(random.random())
    md = hashlib.md5()
    md.update(seed.encode("utf-8"))  # hashlib needs bytes, not str
    return str(md.hexdigest()) + extension


# TODO: image upload worker
def upload_img_to_ucloud(param_ucloud):
    """Upload one ``(put_key, localfile)`` pair to the public bucket.

    Raises:
        RuntimeError: when the service does not answer with HTTP 200
            (previously an ``assert``, which is stripped under ``-O``).
    """
    put_key, localfile = param_ucloud
    putufile_handler = filemanager.FileManager(public_key, private_key)
    # Plain upload into the public bucket.
    ret, resp = putufile_handler.putfile(public_bucket, put_key, localfile, header=None)
    if resp.status_code != 200:
        raise RuntimeError("upload of {} failed with status {}".format(put_key, resp.status_code))


def upload_replace_image(filename_root, sid, html):
    """Rewrite local image references in ``html`` to their future online URLs.

    Returns ``(html, put_key_list, localfile_list)``, or an error dict when
    the rewrite fails.

    NOTE(review): most of this body (the two image regexes and everything
    between them) was missing from the source as received. It is
    reconstructed from the URL layout noted in parse_word() and from how
    parse_word() consumes the result. Verify against the original before
    deploying.
    """
    return_error = {"errcode": 1, "errmsg": "word图片上传失败。"}
    daytime = datetime.datetime.now().strftime('/%Y/%m/%d/')
    image_path = filename_root + "/files"
    put_key_list = []
    localfile_list = []
    # TODO: only documents that contain images need any work.
    if not os.path.isdir(image_path):
        return html, put_key_list, localfile_list

    local_images_path_list = sorted(glob(os.path.join(image_path, "*")))
    logger.info("---local_images_path_list==>{}".format(str(local_images_path_list)))
    try:
        for img in local_images_path_list:
            image_name = os.path.basename(img)  # image1.png, image2.png, ...
            put_key = "teacher/uploadfiles/wording/{}{}{}".format(
                sid, daytime, get_md5(image_name))
            online_src = 'src="http://{}{}/{}"'.format(public_bucket, addr, put_key)
            src_pat2 = re.compile(r'src\s*=\s*"files/{}"'.format(re.escape(image_name)))
            # A callable replacement keeps backslashes in the URL literal.
            html, replaced = src_pat2.subn(lambda _match, text=online_src: text, html)
            if replaced:
                put_key_list.append(put_key)
                localfile_list.append(img)
    except Exception:
        logger.exception("rewriting image urls failed for %s", image_path)
        return return_error
    return html, put_key_list, localfile_list


def multi_parse(param):
    """Parse one queued document and POST the result to its callback URL.

    ``param`` is ``(filename, filename_root, callback_url, flag, sid)``, the
    order step() and read() use.

    NOTE(review): the head of this function (signature, unpacking and the
    parse_word() call) was missing from the source as received and is
    reconstructed, as is the prefix of the parse-time log message.
    """
    from func_timeout import FunctionTimedOut  # BaseException subclass

    filename, filename_root, callback_url, flag, sid = param
    try:
        time2 = time.time()
        res = parse_word(filename, filename_root, flag, sid)
        time22 = time.time() - time2
        logger.info("---parse_time==>:{}".format(str(time22)))
        print("------------parse is successful---------------")
    except (Exception, FunctionTimedOut):
        logger.exception("parse_word failed for %s", filename)
        print("+++++++++++++++have callback_url,but time out of parse_word++++++++++++++++++")
        res = {"time_out": 90, "id": 0, "errcode": 1, "errmsg": _WORD_READ_ERRMSG}
        print("------------parse is fail---------------")

    headers = {'Content-Type': 'application/json', }
    try:
        # json=res would also work but escapes non-ASCII text.
        response = requests.post(callback_url,
                                 data=json.dumps(res, ensure_ascii=False).encode("utf-8"),
                                 headers=headers,
                                 timeout=10)
        print("callback response {}".format(response.status_code))
    except Exception:
        logger.exception("callback to %s failed", callback_url)
        print("error callback")


def step(filename, filename_root, callback_url, flag, sid):
    """Run multi_parse() for one document on a background thread.

    A single shared pool is reused; the previous code created a new
    10-thread executor per call and never shut it down.
    """
    global _step_executor
    if _step_executor is None:
        _step_executor = ThreadPoolExecutor(max_workers=10)
    _step_executor.submit(multi_parse, (filename, filename_root, callback_url, flag, sid))


def read(q):
    """Worker loop: take queued jobs from ``q`` and parse them one at a time."""
    while True:
        print("+++++++++", q.qsize())
        content = q.get(True)
        filename, filename_root, callback_url, flag, sid = content
        logger.info("\n\n*********parse is action***********\n")
        logger.info('++++Get filename={}, filename_root={}, callback_url={}, flag ={},sid ={} from queue\n\n'
                    .format(filename, filename_root, callback_url, flag, sid))
        try:
            # Single-threaded parsing.
            multi_parse(content)
        except Exception:
            # One bad job must not kill the only consumer of the queue.
            logger.exception("unexpected failure while parsing %s", filename)


# upload.save(..., overwrite=True) replaces an existing file; without it an
# existing file of the same name raises "IOError: File exists.".
@route('/upload', method='POST')
def do_upload():
    """Store the uploaded Word file, then parse it or queue it.

    With a ``callback_url`` the job is queued and a JSON acknowledgement is
    returned at once. Without one the document is parsed synchronously and
    the result is returned as JSON.
    """
    from func_timeout import FunctionTimedOut  # BaseException subclass

    form = request.POST.dict
    logger.info("==request.POST.dict==>{}".format(form))
    callback_url = form.get("callback_url", [""])[0]
    time1 = time.time()
    upload = request.files.get('mydata')
    flag = form.get('flag', [0])[0]
    sid = form.get('sid', [0])[0]

    if upload is None:
        # Previously this crashed with AttributeError (HTTP 500).
        return "missing file field 'mydata'"
    extension = os.path.splitext(upload.raw_filename)[1]
    if extension.lower() not in [".doc", ".docx"]:  # accept .DOCX as well
        return "only accept .doc .docx files"

    num_str = str(get_dir_next_num(save_path))
    cur_save_path = os.path.join(save_path, num_str)
    os.mkdir(cur_save_path)
    upload.raw_filename = num_str + extension
    upload.save(cur_save_path, overwrite=True)  # store under save_path/<num>/
    filename = os.path.abspath(os.path.join(cur_save_path, upload.raw_filename))
    filename_root = os.path.abspath(cur_save_path)
    logger.info("---filename_root==>:{}".format(str(filename_root)))
    logger.info("---filename==>:{}".format(str(filename)))
    time11 = time.time() - time1
    logger.info("---save_filename_time==>:{}".format(str(time11)))

    if callback_url:
        # Hand the file to the parser process. The order matches what read()
        # and step() use; the list previously put filename_root first.
        q_parse_file.put((filename, filename_root, callback_url, flag, sid))
        print("-----当前还有{}-----".format(q_parse_file.qsize() - 1))
        return json.dumps({
            "errcode": 0,
            "errmsg": "OK",
            "docsbefore": q_parse_file.qsize() - 1,
        }, ensure_ascii=False).encode("utf-8")

    try:
        # NOTE(review): flag/sid are not forwarded on the synchronous path,
        # so images always use the local static route here. Confirm intent.
        res = parse_word(filename, None)
        print("------------parse is successful---------------")
    except (Exception, FunctionTimedOut):
        logger.exception("synchronous parse failed for %s", filename)
        print("+++++++++++++++time out of parse_word++++++++++++++++++")
        res = {"time_out": 3, "errcode": 1, "errmsg": _WORD_READ_ERRMSG}
    return json.dumps(res, ensure_ascii=False, indent=4).encode("utf-8")


from PIL import Image


def resize_img(img1, img2):
    """Shrink ``img1`` to 1000 px wide, keeping aspect ratio, and save to ``img2``.

    Images that are 1000 px wide or narrower are left untouched.
    """
    with Image.open(img1) as im:  # release the file handle afterwards
        width, height = im.size
        if width <= 1000:
            return
        ratio = 1000 / width
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        thumb = im.resize((int(width * ratio), int(height * ratio)), Image.LANCZOS)
    thumb.save(img2)


# The wildcard was missing from the route as received; without it Bottle
# cannot supply the ``filepath`` argument.
@route('/static/<filepath:path>', method='GET')
def server_static(filepath):
    """Serve a file from the upload directory, e.g. ``5/files/image6.png``.

    Image files are downsized in place before being served.
    """
    local_path = os.path.join(save_path, filepath)
    if "image" in local_path and os.path.isfile(local_path):
        try:
            resize_img(local_path, local_path)
        except Exception:
            # A file Pillow cannot read should still be served as-is.
            logger.exception("could not resize %s", local_path)
    return static_file(filepath, root=save_path)


static_image_pat = re.compile(
    r'(static/\d+/files/image\d+\.(webp|bmp|pcx|tiff|gif|jpeg|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|png|hdri|raw|wmf|flic|emf|ico))')


@error(404)
def error404(error):
    """Render the 404 page; noisy favicon and known-image misses get a short body."""
    request_str = str(request)
    if 'favicon.ico' in request_str:
        return 'Nothing here, sorry'
    m = static_image_pat.search(request_str)
    if m:
        # e.g. static/83/files/image2.png -> upload/83/files/image2.png
        # NOTE(review): this path is relative to the working directory, not
        # to save_path ('../upload'). Confirm that is intended.
        filepath = m.group(1).replace("static", "upload")
        if os.path.isfile(filepath):
            return "Nothing here, sorry"
    print("error404: remote_addr={}, request={}".format(str(request.remote_addr), request_str))
    # The body is served as HTML, so lines are separated with <br>. The
    # separators were missing from the source as received.
    return ('your ip: {}<br>'
            'your request: ^{}$<br>'
            'Nothing here, sorry<br>').format(
        str(request.remote_addr), request_str.replace("<", "").replace(">", ""))


if __name__ == "__main__":
    # Queue of documents waiting to be parsed.
    prs = []
    q_parse_file = Queue()
    start_word2html_app(kill_mathtype=True)

    # Parser process: calls the word2html interface for every queued file.
    pr = Process(target=read, args=(q_parse_file,))
    pr.start()
    prs.append(pr)
    print("parse pid:", pr.pid)

    # Serve requests.
    print("main pid:", os.getpid())
    print("main port:", config.server_port)
    run(host=config.internal_ip, port=config.server_port, server="tornado")