#!/usr/bin/env python
# -*- coding:utf-8 -*-
- import os
- import re
- import json
- import datetime
- import time
- import random
- import hashlib
- # import traceback
- import requests
- from retrying import retry
- from PIL import Image
- from flask import make_response
- from func_timeout import func_set_timeout
- from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
- # from photo_upload import upload_replace_image, upload_img_to_ucloud
- from photo_upload_qcloud import upload_replace_image, upload_img_to_ucloud
- from structure.structure_mian import WordParseStructure
# Directory into which save_post_file() writes the JSON parse results.
RES_FOLDER = r"D:\zwj\word_non-temp_paper_structure\res_folder"

# Host and port used by parse_word() to build the static image URL when
# images are not uploaded to cloud storage.
server_ip = '49.233.23.58'
server_file_port = '11088'

# Executable launched by start_word2html_app().
wordbin_path = r'D:\word_bin\ConsoleApplication1.exe'
def pin(dirpath):
    """Rename the file at ``dirpath`` to a unique name and return the new path.

    The file name is transliterated with xpinyin, stripped of brackets,
    ``+``, ``-`` and whitespace, and the part before its last dot is hashed
    (MD5) together with a random salt.  The file is renamed, inside its
    original directory, to ``<md5>__<YYYY_mm_dd_HH_MM_SS>.<ext>``.

    :param dirpath: path of the file to rename
    :return: path of the renamed file
    :raises OSError: propagated from ``os.rename``
    """
    from xpinyin import Pinyin

    need_modify = os.path.basename(dirpath)
    res = os.path.dirname(dirpath)

    aft_modify = Pinyin().get_pinyin(need_modify, '_')
    aft_modify = re.sub(r"[(())+\-]", "", str(aft_modify))
    aft_modify = re.sub(r"\s", "", aft_modify)

    # Split on the LAST dot only.  Unpacking ``split(".")`` into two names
    # raised ValueError for names with several dots ("a.v2.docx") or none.
    stem, dot, ext = aft_modify.rpartition(".")
    if not dot:
        # rpartition() puts the whole string in the last slot when there is
        # no separator; treat that as "no extension".
        stem, ext = aft_modify, ""

    time_str = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    # The random salt makes the digest unique per call.
    salted = (stem + '__' + str(random.random())).encode("utf-8")
    aft_name = hashlib.md5(salted).hexdigest() + '__' + time_str + dot + ext
    print("aft_name-----", aft_name)

    bef = os.path.join(res, need_modify)
    aft = os.path.join(res, aft_name)
    os.rename(bef, aft)
    return aft
# HTML generation tool: wordbin
@retry(stop_max_attempt_number=2, wait_fixed=1)  # at most 2 attempts; the error is raised only if both fail
def call_c_shape(doc_file):
    """Send a GET request for ``doc_file`` to the local word service.

    :param doc_file: value placed in the ``name`` query parameter
    :return: the ``requests`` response object
    """
    endpoint = r"http://localhost:9001/word/?name={}".format(doc_file)
    return requests.get(endpoint, timeout=70)
def start_word2html_app(kill_mathtype=True):
    """Force-kill Word and the wordbin console app, then launch wordbin again.

    :param kill_mathtype: when true, MathType.exe is force-killed first
    """
    kill_commands = [
        "taskkill /f /im WINWORD.EXE",
        "taskkill /f /im ConsoleApplication1.exe",
    ]
    if kill_mathtype:
        kill_commands.insert(0, "taskkill /f /im MathType.exe")

    for command in kill_commands:
        os.system(command)

    # "start" opens the program in a new window
    os.system("start {}".format(wordbin_path))
def _tasklist_reports_process(image_name):
    """Return True unless ``tasklist`` prints its "no matching task" message."""
    command = 'tasklist /fi "imagename eq {}"'.format(image_name)
    # The context manager closes the pipe; previously it was never closed.
    with os.popen(command) as pipe:
        output = pipe.read()  # whole command output as one string
    # NOTE(review): the comparison relies on the Chinese-locale wording of
    # tasklist's message - confirm the server locale before changing it.
    return str(output).strip() != "信息: 没有运行的任务匹配指定标准。"


def check_fault_pid():
    """Restart wordbin when a WINWORD.EXE or WerFault.exe process is listed.

    WINWORD.EXE is checked first; only when it is not listed is WerFault.exe
    checked (and force-killed if present).

    :return: 1 if wordbin was restarted, otherwise 0
    """
    if _tasklist_reports_process("WINWORD.EXE"):
        print("++++出现office word 宏提醒,开始kill ++++")
        start_word2html_app()
        return 1
    print("-----没有word问题报告弹窗------")

    if _tasklist_reports_process("WerFault.exe"):
        print("++++出现《问题报告》弹窗,开始kill ++++")
        os.system("taskkill /f /im WerFault.exe")
        start_word2html_app()
        return 1
    print("-----没有wordbin问题报告弹窗------")
    return 0
def get_html(doc_file):  # doc_file: absolute path of the file
    """Call wordbin for ``doc_file`` and read the generated ``*_clean.html``.

    :param doc_file: absolute path of the .doc/.docx file
    :return: ``(html_text, elapsed_seconds)`` on success.  On failure the
        first element is ``"html文件生成失败"`` or ``"试卷格式有问题"``; the second is 0
        when the service call itself failed, otherwise the elapsed time.
    """
    stime2 = time.time()
    try:
        # wordbin does not support concurrent calls, so it is called inline.
        r = call_c_shape(doc_file)
        # The body is text, so it must be compared with "4"; the previous
        # ``r.text == 4`` (str vs int) could never be true.
        # NOTE(review): assumes the service answers a bare "4" on failure -
        # confirm against the wordbin service.
        if str(r.text).strip() == "4":
            check_fault_pid()
            return "html文件生成失败", 0
    except Exception:
        # ``Exception`` rather than a bare except, so BaseException-based
        # signals (KeyboardInterrupt, timeout stop signals) still propagate.
        is_kill = check_fault_pid()
        if not is_kill:
            start_word2html_app(kill_mathtype=True)
        return "试卷格式有问题", 0
    etime2 = time.time()

    # Swap only the real extension; chained ``.replace(".doc", ...)`` would
    # also rewrite a ".doc" occurring elsewhere in the path.
    html_path = os.path.splitext(doc_file)[0] + "_clean.html"
    try:
        with open(html_path, 'r', encoding="utf-8") as html:
            return html.read(), etime2 - stime2
    except Exception:
        return "试卷格式有问题", etime2 - stime2  # may also be caused by a timeout
@func_set_timeout(120)
def parse_word(doc_file, filename_root, flag=0, sid=0):
    """Convert a Word file to HTML via wordbin and run the structure parser.

    :param doc_file: absolute path of the Word file
    :param filename_root: passed through to ``upload_replace_image``
    :param flag: when ``str(flag) == "1"`` images are replaced by online
        addresses and uploaded after a successful parse; otherwise the
        static file server URL is used
    :param sid: passed through to ``upload_replace_image``
    :return: ``(res, wordbin_time, paper_type, img_upload_time)``
    """
    print("-------解析的文件名为: {} -------".format(doc_file))
    html, wordbin_time = get_html(doc_file)
    print('解析中wordbin服务时间:', wordbin_time)
    img_upload_time = 0
    if html in ["html文件生成失败", "试卷格式有问题"]:
        res = {"errcode": 1,
               "errmsgs": "word读取失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n2.请尝试将内容粘贴复制到一份空白word再上传\n"
                          "3.若上传文档题目太多,请分两次上传\n4.重点检查图片(设为嵌入式)和换行"}
        paper_type = "wordbin报错,未知"
    else:
        images_url = ""
        put_key_list, localfile_list = [], []
        if str(flag) == "1":
            # Replace the image references with online addresses.
            data = upload_replace_image(filename_root, sid, html)
            if isinstance(data, dict):  # a dict signals that the replacement failed
                print("图片替换线上地址失败")
                res, paper_type = data, "图片替换线上地址失败,未知"
                return res, wordbin_time, paper_type, img_upload_time
            images_url, put_key_list, localfile_list = data
        else:
            # Name of the folder containing the Word file.  It is only needed
            # for the static URL, so it is computed here and a non-numeric
            # folder name no longer breaks the upload path.
            num = int(os.path.split(os.path.split(doc_file)[0])[1])
            images_url = "http://{}:{}/{}/{}/files/".format(server_ip, server_file_port, "ser_static", num)
            print("不用上传图片到cloud")
        try:
            # Structured parsing
            res, paper_type = WordParseStructure(html, images_url).structure()
            # Upload only after a successful parse, to save storage space.
            print(res["errcode"], str(flag))
            if not res["errcode"] and str(flag) == "1":
                print("开始上传图片到Ucloud,并替换成线上地址")
                stime3 = time.time()
                if localfile_list:  # nothing to upload when the paper has no images
                    # The with-block waits for all workers and shuts the pool
                    # down even if submitting the work raises.  The map()
                    # results are not consumed, so worker exceptions are not
                    # re-raised here.
                    with ProcessPoolExecutor(5) as executor1:
                        executor1.map(upload_img_to_ucloud, zip(put_key_list, localfile_list))
                etime3 = time.time()
                img_upload_time = etime3 - stime3
                print("---img_upload_time==>:{}".format(etime3 - stime3))
        except Exception:
            # Not a bare ``except:``: that also catches BaseException, which
            # would swallow the signal the timeout decorator raises inside
            # this function (NOTE(review): per func_timeout docs - confirm).
            res = {"errcode": 1,
                   "errmsgs": "解析失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n"
                              "2.若上传文档题目太多,请分两次上传\n3.重点检查图片(设为嵌入式)和换行"}
            paper_type = "解析报错,未知"
    print("\n+++++++++解析结果结束++++++++++\n")
    return res, wordbin_time, paper_type, img_upload_time
def save_post_file(parse_res, fname, id):
    """Save the parse result of a callback as a JSON file in ``RES_FOLDER``.

    The file is named
    ``<md5>__<YYYY_mm_dd_HH_MM_SS>__<id>__<stem>.json`` where ``stem`` is the
    base name of ``fname`` without its last extension and the MD5 is taken
    over the stem plus a random salt.

    :param parse_res: JSON-serialisable parse result
    :param fname: name or path of the parsed file
    :param id: identifier embedded in the output file name
    :return: path of the written JSON file
    """
    base_name = str(os.path.basename(fname))
    # Split on the last dot only; unpacking ``split(".")`` into two names
    # raised ValueError for names with several dots or none.
    stem, dot, _ = base_name.rpartition(".")
    if not dot:
        stem = base_name

    time_str = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    salted = (stem + '__' + str(random.random())).encode("utf-8")
    aft_name = hashlib.md5(salted).hexdigest() + '__' + time_str + '__' + str(id) + '__' + stem + '.json'

    os.makedirs(RES_FOLDER, exist_ok=True)
    new_fpath = os.path.join(RES_FOLDER, aft_name)
    # The with-block closes (and therefore flushes) the file; previously the
    # handle was left open.
    with open(new_fpath, 'w', encoding='utf-8') as re_f:
        json.dump(parse_res, re_f)
    return new_fpath
def Response_headers(content):
    """Build a response from ``content`` and set three CORS headers on it.

    :param content: value handed to ``make_response``
    :return: the response object with the headers set
    """
    cors_headers = (
        ('Access-Control-Allow-Headers', "x-requested-with,content-type,Authorization"),
        ('Access-Control-Allow-Methods', "POST,GET,OPTIONS"),
        ('Access-Control-Allow-Origin', '*'),
    )
    response = make_response(content)
    for header_name, header_value in cors_headers:
        response.headers[header_name] = header_value
    return response
# Resize an image
def resize_img(img1, img2, max_width=4000):
    """Shrink an image that is wider than ``max_width`` pixels.

    The image is scaled proportionally to ``max_width`` and written to
    ``img2``.  Images that are not wider than the limit are left alone and
    nothing is written.  According to the original note this function is no
    longer used because images are not compressed any more.

    :param img1: absolute path of the source image, e.g. D://upload/23.png
    :param img2: absolute path of the output image (may equal ``img1``)
    :param max_width: width limit in pixels (added 2020/3/20 as 4000)
    :return: None
    """
    # Close the source file deterministically instead of leaving it open.
    with Image.open(img1) as im:
        width, height = im.size
        if width <= max_width:
            return
        ratio = max_width / width
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        thumb = im.resize((int(width * ratio), int(height * ratio)), Image.LANCZOS)
    # Save after the source is closed so writing back to the same path is safe.
    thumb.save(img2)
def img_rep(cont):
    """Blank out the ``src`` value of every self-closed ``<img>`` tag in ``cont``.

    For each matched ``src`` the corresponding local file under
    ``F:/word_uploads/`` is read, the ``src`` text is replaced by an empty
    string throughout ``cont``, and a POST request is sent to the image
    address.

    NOTE(review): the original indentation was lost; this assumes the
    replace and the POST both belong to the per-image loop - confirm.

    :param cont: HTML text
    :return: ``cont`` with the matched ``src`` values removed
    """
    import base64

    for src in re.findall("<img src=\"([^\"]+?)\".*?/>", cont):
        img_path = src.split(" ")[0]
        relative_part = img_path.split("ser_static/")[1].replace("\"", "")
        img_local_path = "F:/word_uploads/" + relative_part
        print(img_local_path)
        with open(img_local_path, 'rb') as img_f:
            raw_bytes = img_f.read()
        # The encoded stream is computed but not used afterwards.
        img_stream = base64.b64encode(raw_bytes)
        cont = cont.replace(src, "")
        requests.post(img_path)
    return cont
def return_img_stream(img_local_path):
    """Utility: read a local image and return it base64-encoded.

    :param img_local_path: absolute local path of a single image file
    :return: the file content as a base64 text string
    """
    import base64

    with open(img_local_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode()
|