#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import re
import json
import datetime
import time
import random
import hashlib
import requests
from retrying import retry
from PIL import Image
from flask import make_response
from func_timeout import func_set_timeout
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
# from photo_upload import upload_replace_image, upload_img_to_ucloud
from photo_upload_qcloud import upload_replace_image, upload_img_to_ucloud
from structure_mian import WordParseStructure
import ps_configs as config
from ps_configs import myLog

logger = myLog(__name__).getlog()


def pin(dirpath):
    """Rename the file at *dirpath* to a generated ASCII-only name.

    The base name is transliterated to pinyin, stripped of brackets, ``+``,
    ``-`` and whitespace, and then replaced by
    ``<md5 of stem + random salt>__<timestamp>.<extension>``.
    The file is renamed on disk inside its original directory.

    :param dirpath: path of the file to rename
    :return: the new path of the renamed file
    """
    from xpinyin import Pinyin

    folder, original_name = os.path.split(dirpath)

    cleaned = Pinyin().get_pinyin(original_name, '_')
    cleaned = re.sub(r"[(())+\-]", "", str(cleaned))
    cleaned = re.sub(r"\s", "", cleaned)

    # Split on the LAST dot only, so names such as "a.b.docx" or names
    # without any extension no longer raise ValueError on tuple unpacking.
    stem, dot, extension = cleaned.rpartition(".")
    if not dot:
        stem, extension = cleaned, ""

    time_str = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    # MD5 is only used to build a short unique name, not for security.
    salted = (stem + '__' + str(random.random())).encode("utf-8")
    aft_name = hashlib.md5(salted).hexdigest() + '__' + time_str
    if dot:
        aft_name += '.' + extension
    print("aft_name-----", aft_name)

    bef = os.path.join(folder, original_name)
    aft = os.path.join(folder, aft_name)
    os.rename(bef, aft)
    return aft


# HTML generation tool: wordbin
@retry(stop_max_attempt_number=2, wait_fixed=1)  # at most 2 attempts; the error is raised only if both fail
def call_c_shape(doc_file):
    """Ask the local wordbin HTTP service to convert *doc_file*.

    :param doc_file: path of the Word document, passed as the ``name`` query value
    :return: the ``requests`` response object
    """
    return requests.get(r"http://localhost:9001/word/?name={}".format(doc_file), timeout=70)


def start_word2html_app(kill_mathtype=True):
    """Kill Word and the converter process, then start the wordbin executable again.

    :param kill_mathtype: when true, MathType.exe is killed as well
    """
    # NOTE(review): presumably only the MathType kill is conditional -- confirm.
    if kill_mathtype:
        os.system("taskkill /f /im MathType.exe")
    os.system("taskkill /f /im WINWORD.EXE")
    os.system("taskkill /f /im ConsoleApplication1.exe")
    # os.system('"C:\Program Files (x86)\MathType\MathType.exe" -server')
    os.system("start {}".format(config.wordbin_exe))  # "start" opens it in a new window


def _tasklist_output(command):
    """Run a ``tasklist`` command and return its stripped output, closing the pipe."""
    with os.popen(command) as pipe:
        return str(pipe.read()).strip()


def check_fault_pid():
    """Detect a lingering Word / WerFault process and restart wordbin if one is found.

    :return: 1 when a process was found and wordbin was restarted, otherwise 0
    """
    # Text printed by tasklist when no process matches the filter.
    # NOTE(review): this is the Chinese-locale message; on another locale the
    # comparison would presumably always report a match -- confirm.
    no_match = "信息: 没有运行的任务匹配指定标准。"

    if _tasklist_output('tasklist /fi "imagename eq WINWORD.EXE"') != no_match:
        print("++++出现office word 宏提醒,开始kill ++++")
        start_word2html_app()
        return 1
    print("-----没有word问题报告弹窗------")

    if _tasklist_output('tasklist /fi "imagename eq WerFault.exe"') != no_match:
        print("++++出现《问题报告》弹窗,开始kill ++++")
        os.system("taskkill /f /im WerFault.exe")
        start_word2html_app()
        return 1
    print("-----没有wordbin问题报告弹窗------")
    return 0


def get_html(doc_file):  # doc_file: absolute path of the file
    """Call wordbin and read back the generated ``*_clean.html`` file.

    :param doc_file: absolute path of the ``.doc`` / ``.docx`` file
    :return: ``(html_text, seconds_spent_in_wordbin)`` on success, otherwise
        ``(error_message, elapsed)`` where the message is one of
        ``"html文件生成失败"`` / ``"试卷格式有问题"``
    """
    started = time.time()
    try:
        # wordbin does not support multithreading (original author's note),
        # so the request is made synchronously.
        r = call_c_shape(doc_file)
        # The original compared ``r.text == 4`` (str vs int), which can never
        # be true, so this branch was dead.
        # NOTE(review): assumes wordbin signals failure with the body "4" -- confirm.
        if str(r.text).strip() == "4":
            check_fault_pid()
            return "html文件生成失败", 0
    except Exception as exc:
        # ``except Exception`` instead of a bare ``except`` so that
        # BaseException-derived signals (e.g. the func_timeout stop exception)
        # are not swallowed here.
        logger.info("wordbin call failed for {}: {!r}".format(doc_file, exc))
        is_kill = check_fault_pid()
        if not is_kill:
            start_word2html_app(kill_mathtype=True)
        return "试卷格式有问题", 0

    elapsed = time.time() - started
    html_path = doc_file.replace(".docx", "_clean.html").replace(".doc", "_clean.html")
    try:
        with open(html_path, 'r', encoding="utf-8") as html_file:
            return html_file.read(), elapsed
    except Exception:
        return "试卷格式有问题", elapsed  # may also be caused by a timeout
@func_set_timeout(120)
def parse_word(doc_file, filename_root, flag=0, sid=0, upload_id='0000', consumer='phy'):
    """Convert a Word file to HTML via wordbin and run the structured parser on it.

    :param doc_file: path of the Word file; its parent folder name must be an integer
    :param filename_root: passed through to ``upload_replace_image``
    :param flag: when ``str(flag) == "1"`` images are replaced by online
        addresses and uploaded after a successful parse
    :param sid: passed through to ``upload_replace_image``
    :param upload_id: identifier used only in log messages
    :param consumer: currently unused (parser selection by consumer is disabled)
    :return: ``(result_dict, paper_type)``; on failure ``result_dict`` carries
        ``errcode`` 1 and ``paper_type`` is a diagnostic string
    """
    logger.info("----【upload_id:{}】开始解析文件: {} -------".format(upload_id, doc_file))
    html, wordbin_time = get_html(doc_file)
    logger.info('----【upload_id:{}】解析中wordbin服务时间:{}'.format(upload_id, wordbin_time))

    if html in ["html文件生成失败", "试卷格式有问题"]:
        res = {"errcode": 1,
               "errmsgs": "word读取失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n2.请尝试将内容粘贴复制到一份空白word再上传\n"
                          "3.若上传文档题目太多,请分两次上传\n4.重点检查图片(设为嵌入式)和换行"}
        logger.info('----【upload_id:{}】wordbin报错----'.format(upload_id))
        return res, "wordbin报错,未知"

    # Name of the folder that contains the Word file.
    num = int(os.path.split(os.path.split(doc_file)[0])[1])
    # Default: images are served from the local static route. When flag == 1
    # (sid is then expected to be set) they are replaced by online addresses.
    images_url = "http://{}:{}/{}/{}/files/".format(config.server_ip, config.server_port, "ser_static", num)
    put_key_list, localfile_list = [], []
    if str(flag) == "1":
        data = upload_replace_image(filename_root, sid, html)
        if isinstance(data, dict):  # a dict means the replacement failed
            logger.info('----【upload_id:{}】图片替换线上地址失败----'.format(upload_id))
            return data, "图片替换线上地址失败,未知"
        images_url, put_key_list, localfile_list = data

    try:
        # Structured parsing.
        res, paper_type = WordParseStructure(html, images_url).structure()
        # Upload images only after a successful parse, to save storage.
        if not res["errcode"] and str(flag) == "1":
            logger.info('----【upload_id:{}】开始上传图片到cloud,并替换成线上地址----'.format(upload_id))
            stime3 = time.time()
            if localfile_list:
                # TODO: one process for parsing, one for uploading
                # Leaving the ``with`` block waits for every worker to finish.
                with ProcessPoolExecutor(5) as executor:
                    uploads = [executor.submit(upload_img_to_ucloud, pair)
                               for pair in zip(put_key_list, localfile_list)]
                # The original discarded the ``map`` iterator, so upload errors
                # vanished silently. Log them; the parse result is unchanged.
                for upload in uploads:
                    upload_error = upload.exception()
                    if upload_error is not None:
                        logger.info('----【upload_id:{}】image upload failed: {!r}----'.format(
                            upload_id, upload_error))
            img_upload_time = time.time() - stime3
            logger.info("----【upload_id:{}】,图片上传时间img_upload_time:{}".format(upload_id, img_upload_time))
    except Exception:
        # ``except Exception`` (not bare) so the func_timeout stop signal,
        # which is not an ``Exception`` subclass, is not swallowed.
        import traceback
        logger.info('----【upload_id:{}】试题结构化或图片上传cloud报错----'.format(upload_id))
        logger.info(traceback.format_exc())
        res = {"errcode": 1,
               "errmsgs": "解析失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n"
                          "2.若上传文档题目太多,请分两次上传\n3.重点检查图片(设为嵌入式)和换行"}
        paper_type = "解析报错,未知"
    print("\n+++++++++解析结果结束++++++++++\n")
    return res, paper_type


def save_post_file(parse_res, fname, id):
    """Save the parse result sent back by the callback as a JSON file.

    The file is written into ``config.RES_FOLDER`` (created when missing) as
    ``<md5>__<timestamp>__<id>__<stem>.json``.

    :param parse_res: JSON-serialisable parse result
    :param fname: original file name or path; only its base name is used
    :param id: identifier embedded in the output file name
    :return: path of the written JSON file
    """
    base_name = str(os.path.basename(fname))
    # Split on the last dot only; the original two-value unpack of
    # ``split(".")`` raised ValueError for names with zero or several dots.
    stem, dot, _extension = base_name.rpartition(".")
    if not dot:
        stem = base_name

    time_str = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    salted = (stem + '__' + str(random.random())).encode("utf-8")
    aft_name = hashlib.md5(salted).hexdigest() + '__' + time_str + '__' + str(id) + '__' + stem + '.json'

    out_dir = config.RES_FOLDER
    os.makedirs(out_dir, exist_ok=True)
    new_fpath = os.path.join(out_dir, aft_name)
    # ``with`` guarantees the JSON is flushed and the handle closed.
    with open(new_fpath, 'w', encoding='utf-8') as out_f:
        json.dump(parse_res, out_f)
    return new_fpath


def Response_headers(content):
    """Wrap *content* in a Flask response carrying permissive CORS headers.

    :param content: value handed to ``flask.make_response``
    :return: the response object with the CORS headers set
    """
    resp = make_response(content)
    resp.headers['Access-Control-Allow-Headers'] = "x-requested-with,content-type,Authorization"
    resp.headers['Access-Control-Allow-Methods'] = "POST,GET,OPTIONS"
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp


# Change the image size
def resize_img(img1, img2):
    """
    Shrink an image wider than 4000 px to a width of about 4000 px.

    Both paths are normally the same absolute image path. According to the
    original note this function is no longer used (images are not compressed).
    Nothing is written when the image is 4000 px wide or less.

    :param img1: source image path, e.g. D://upload/23.png
    :param img2: destination image path, e.g. D://upload/23.png
    :return: None
    """
    with Image.open(img1) as im:
        if im.size[0] <= 4000:  # 2020/3/20+
            return
        ratio = 4000 / im.size[0]
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        thumb = im.resize((int(im.size[0] * ratio), int(im.size[1] * ratio)), Image.LANCZOS)
    # Save after the source handle is closed, since img2 may be the same file.
    thumb.save(img2)


def img_rep(cont):
    """
    Process the local images referenced in a piece of text.

    NOTE(review): the regex pattern and the replacement string below are empty
    in this copy of the file -- presumably HTML ``<img>`` markup that was lost.
    With an empty pattern ``re.findall`` yields empty strings, so the first
    iteration raises IndexError at ``split("ser_static/")[1]``. Restore the
    original literals before relying on this function.

    :param cont: text
    :return: the text after replacement
    """
    import base64
    img_stream = ''
    all_photo = re.findall("", cont)
    if all_photo:
        for src in all_photo:
            img_path = src.split(" ")[0]
            img_local_path = "F:/word_uploads/" + img_path.split("ser_static/")[1].replace("\"", "")
            print(img_local_path)
            with open(img_local_path, 'rb') as img_f:
                img_stream = img_f.read()
            # The encoded bytes are not used by the (currently empty) replacement.
            img_stream = base64.b64encode(img_stream)
            cont = cont.replace(src, "")
            requests.post(img_path)
    return cont


def return_img_stream(img_local_path):
    """
    Utility: read a local image and return it base64-encoded.

    :param img_local_path: absolute local path of a single image
    :return: base64 text (``str``) of the image bytes
    """
    import base64
    with open(img_local_path, 'rb') as img_f:
        return base64.b64encode(img_f.read()).decode()