#!/usr/bin/env python
# -*- coding:utf-8 -*-
# cdZWj / new_tiku_structure_2021

import os
import re
import json
import datetime
import time
import random
import hashlib
# import traceback
import requests
from retrying import retry
from PIL import Image
from flask import make_response
from func_timeout import func_set_timeout
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
# from photo_upload import upload_replace_image, upload_img_to_ucloud
from photo_upload_qcloud import upload_replace_image, upload_img_to_ucloud
from structure.structure_mian import WordParseStructure

# Directory in which save_post_file() stores the JSON parse results.
RES_FOLDER = r'D:\zwj\word_non-temp_paper_structure\res_folder'
# Host and port that parse_word() uses to build the static image URL prefix
# when images are not uploaded to the cloud.
server_ip = "49.233.23.58"
server_file_port = "11088"
# Executable started by start_word2html_app() (the "wordbin" converter).
wordbin_path = r"D:\word_bin\ConsoleApplication1.exe"


def pin(dirpath):
    """Rename a file on disk to a unique ASCII-only name and return the new path.

    The base name is transliterated with xpinyin, stripped of brackets,
    ``+``/``-`` and whitespace, and then replaced by
    ``<md5 of stem + random number>__<timestamp>[.<extension>]``.
    The file stays in its original directory.

    :param dirpath: path of the file to rename
    :return: path of the renamed file
    :raises OSError: if the rename itself fails
    """
    from xpinyin import Pinyin

    folder, original_name = os.path.split(dirpath)
    ascii_name = str(Pinyin().get_pinyin(original_name, '_'))
    ascii_name = re.sub(r"[(（)）+\-]", "", ascii_name)
    ascii_name = re.sub(r"\s", "", ascii_name)

    # Split on the LAST dot only: names such as "paper.v2.docx" (or names
    # without any dot) used to raise ValueError when unpacking split(".").
    stem, dot, extension = ascii_name.rpartition(".")
    if not dot:
        stem, extension = ascii_name, ""

    time_str = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    # The random salt makes the digest differ even for identical file names.
    salted = (stem + '__' + str(random.random())).encode("utf-8")
    digest = hashlib.md5(salted).hexdigest()
    aft_name = digest + '__' + time_str + ('.' + extension if dot else '')
    print("aft_name-----", aft_name)

    renamed_path = os.path.join(folder, aft_name)
    os.rename(os.path.join(folder, original_name), renamed_path)
    return renamed_path


# HTML generation tool: wordbin
@retry(stop_max_attempt_number=2, wait_fixed=1)  # at most 2 attempts; the error is raised only when both fail
def call_c_shape(doc_file):
    """Request conversion of ``doc_file`` from the wordbin HTTP service.

    Sends a GET request to the service on localhost:9001 with the file name
    in the ``name`` query parameter and a 70 second timeout.

    :param doc_file: value passed as the ``name`` query parameter
    :return: the ``requests`` response object
    """
    url = r"http://localhost:9001/word/?name={}".format(doc_file)
    return requests.get(url, timeout=70)


def start_word2html_app(kill_mathtype=True):
    """(Re)start the wordbin executable.

    :param kill_mathtype: when true, MathType, Word and any running wordbin
        process are force-killed before the executable is started again
    """
    if kill_mathtype:
        for image_name in ("MathType.exe", "WINWORD.EXE", "ConsoleApplication1.exe"):
            os.system("taskkill /f /im " + image_name)
    # "start" opens the program in a new window.
    launch_command = "start {}".format(wordbin_path)
    os.system(launch_command)


def check_fault_pid():
    """Detect blocking Word / WerFault processes and restart wordbin if found.

    ``tasklist`` is queried first for WINWORD.EXE and, only if that is not
    running, for WerFault.exe. When either is found, wordbin is restarted
    through start_word2html_app() (WerFault.exe is force-killed first).

    NOTE(review): the "no matching task" check compares against the message
    printed by a Chinese-locale Windows; on another locale every call would
    be treated as "process found" - confirm the deployment locale.

    :return: 1 if a restart was triggered, 0 otherwise
    """
    no_match_message = "信息: 没有运行的任务匹配指定标准。"

    def _tasklist_output(image_name):
        # The context manager closes the pipe; the original left it open.
        command = 'tasklist /fi  "imagename eq {}"'.format(image_name)
        with os.popen(command) as pipe:
            return str(pipe.read()).strip()

    if _tasklist_output("WINWORD.EXE") != no_match_message:
        print("++++出现office word 宏提醒，开始kill ++++")
        start_word2html_app()
        return 1
    print("-----没有word问题报告弹窗------")

    if _tasklist_output("WerFault.exe") != no_match_message:
        print("++++出现《问题报告》弹窗，开始kill ++++")
        os.system("taskkill /f /im WerFault.exe")
        start_word2html_app()
        return 1
    print("-----没有wordbin问题报告弹窗------")
    return 0


def get_html(doc_file):  # doc_file: absolute path of the Word file
    """Call wordbin to convert ``doc_file`` and read the generated HTML.

    :param doc_file: absolute path of the ``.doc``/``.docx`` file
    :return: ``(html_text, seconds_spent_in_wordbin)`` on success;
        ``("html文件生成失败", 0)`` when wordbin reports a failure;
        ``("试卷格式有问题", elapsed_or_0)`` when the call or the read fails
    """
    import traceback

    started = time.time()
    try:
        # The call is made synchronously; the original author noted that
        # wordbin does not support concurrent use.
        response = call_c_shape(doc_file)
        # The original compared the response text (a str) with the int 4,
        # which is never equal, so this failure branch was unreachable.
        # NOTE(review): assumes wordbin signals failure with the body "4" -
        # confirm against the service.
        if str(response.text).strip() == "4":
            check_fault_pid()
            return "html文件生成失败", 0
    except Exception:
        traceback.print_exc()  # keep the reason visible instead of swallowing it
        is_kill = check_fault_pid()
        if not is_kill:
            start_word2html_app(kill_mathtype=True)
        return "试卷格式有问题", 0

    elapsed = time.time() - started
    try:
        # Replace only the file extension. The chained str.replace() calls
        # also rewrote ".doc" occurring elsewhere in the path (e.g. in a
        # directory name), producing a path that does not exist.
        root, extension = os.path.splitext(doc_file)
        html_path = root + "_clean.html" if extension in (".docx", ".doc") else doc_file
        with open(html_path, 'r', encoding="utf-8") as html_file:
            return html_file.read(), elapsed
    except Exception:
        traceback.print_exc()
        return "试卷格式有问题", elapsed  # may also be caused by a timeout


@func_set_timeout(120)
def parse_word(doc_file, filename_root, flag=0, sid=0):
    """Convert a Word file to HTML and run the structure parser on it.

    :param doc_file: absolute path of the Word file; the name of its parent
        folder is converted with ``int()`` and used in the static image URL
    :param filename_root: passed through to upload_replace_image()
    :param flag: when ``str(flag) == "1"`` images are replaced by online
        addresses and uploaded after a successful parse; otherwise the
        static file server URL is used
    :param sid: passed through to upload_replace_image()
    :return: ``(res, wordbin_time, paper_type, img_upload_time)``; on failure
        ``res`` is a dict with ``errcode`` 1 and an ``errmsgs`` text
    :raises ValueError: if the parent folder name of ``doc_file`` is not an
        integer (only reached when the HTML was produced successfully)

    The function runs under ``func_set_timeout(120)``.
    """
    import traceback

    print("-------解析的文件名为: {} -------".format(doc_file))
    html, wordbin_time = get_html(doc_file)
    print('解析中wordbin服务时间:', wordbin_time)
    img_upload_time = 0
    if html in ["html文件生成失败", "试卷格式有问题"]:
        res = {"errcode": 1,
               "errmsgs": "word读取失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n2.请尝试将内容粘贴复制到一份空白word再上传\n"
                          "3.若上传文档题目太多，请分两次上传\n4.重点检查图片(设为嵌入式)和换行"}
        paper_type = "wordbin报错，未知"
    else:
        num = int(os.path.split(os.path.split(doc_file)[0])[1])  # folder that contains the Word file
        # todo: if flag == 1, upload the images and use their online address; otherwise serve them statically
        # When flag is set, sid must be set as well; images are uploaded online.
        images_url = ""
        put_key_list, localfile_list = [], []
        if str(flag) == "1":
            # Replace image sources.
            data = upload_replace_image(filename_root, sid, html)
            if isinstance(data, dict):  # a dict result means the replacement failed
                print("图片替换线上地址失败")
                res, paper_type = data, "图片替换线上地址失败，未知"
                return res, wordbin_time, paper_type, img_upload_time
            else:
                images_url, put_key_list, localfile_list = data
        else:
            images_url = "http://{}:{}/{}/{}/files/".format(server_ip, server_file_port, "ser_static", num)
            print("不用上传图片到cloud")

        try:
            # Start the structural parsing.
            res, paper_type = WordParseStructure(html, images_url).structure()
            # Upload images only after a successful parse, to save space.
            print(res["errcode"], str(flag))
            if not res["errcode"] and str(flag) == "1":
                print("开始上传图片到Ucloud，并替换成线上地址")
                stime3 = time.time()
                if localfile_list:  # nothing to upload when the document has no images
                    # TODO: one process for parsing, one for uploading.
                    # The context manager waits for all workers and always
                    # releases the pool, even if submitting the work raises.
                    with ProcessPoolExecutor(5) as executor1:
                        executor1.map(upload_img_to_ucloud, zip(put_key_list, localfile_list))
                    etime3 = time.time()
                    img_upload_time = etime3 - stime3
                    print("---img_upload_time==>:{}".format(etime3 - stime3))
        except Exception:
            # Print the real cause; the original bare except hid it completely.
            traceback.print_exc()
            res = {"errcode": 1,
                   "errmsgs": "解析失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n"
                              "2.若上传文档题目太多，请分两次上传\n3.重点检查图片(设为嵌入式)和换行"}
            paper_type = "解析报错，未知"

    print("\n+++++++++解析结果结束++++++++++\n")
    return res, wordbin_time, paper_type, img_upload_time


def save_post_file(parse_res, fname, id):
    """Save the parse result of a callback as a JSON file in RES_FOLDER.

    The file is named
    ``<md5 of stem + random number>__<timestamp>__<id>__<stem>.json`` where
    ``stem`` is the base name of ``fname`` without its last extension.

    :param parse_res: JSON-serialisable parse result
    :param fname: name or path of the source file
    :param id: identifier embedded in the file name (the parameter name
        shadows the builtin but is kept for existing callers)
    :return: path of the written JSON file
    """
    base_name = str(os.path.basename(fname))
    # Split on the last dot only; unpacking split(".") raised ValueError for
    # names with several dots or with none.
    stem, dot, _extension = base_name.rpartition(".")
    if not dot:
        stem = base_name

    time_str = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    salted = (stem + '__' + str(random.random())).encode("utf-8")
    aft_name = hashlib.md5(salted).hexdigest() + '__' + time_str + '__' + str(id) + '__' + stem + '.json'

    os.makedirs(RES_FOLDER, exist_ok=True)
    new_fpath = os.path.join(RES_FOLDER, aft_name)
    # "with" guarantees the data is flushed and the handle closed.
    with open(new_fpath, 'w', encoding='utf-8') as result_file:
        json.dump(parse_res, result_file)
    return new_fpath


def Response_headers(content):
    """Wrap ``content`` in a response carrying permissive CORS headers.

    :param content: value handed to ``make_response``
    :return: the response with the three Access-Control-Allow-* headers set
    """
    response = make_response(content)
    cors_headers = (
        ('Access-Control-Allow-Headers', "x-requested-with,content-type,Authorization"),
        ('Access-Control-Allow-Methods', "POST,GET,OPTIONS"),
        ('Access-Control-Allow-Origin', '*'),
    )
    for header_name, header_value in cors_headers:
        response.headers[header_name] = header_value
    return response


# Resize an image.
def resize_img(img1, img2, max_width=4000):
    """Shrink an image that is wider than ``max_width`` pixels.

    Images at or below the limit are left untouched and nothing is written.
    According to the original author this function is no longer used
    (images are not compressed any more).

    :param img1: path of the source image, e.g. D://upload/23.png
    :param img2: path the resized image is saved to (may equal ``img1``)
    :param max_width: width limit in pixels; wider images are scaled down
        to exactly this width with the aspect ratio preserved
    :return: None
    """
    with Image.open(img1) as im:  # close the source file handle when done
        width, height = im.size
        if width <= max_width:  # 2020/3/20+
            return
        ratio = max_width / width
        # The target width is set directly: int(width * ratio) could round
        # down to max_width - 1 because of floating point error.
        new_size = (max_width, max(1, int(height * ratio)))
        # Image.LANCZOS is the filter formerly aliased as Image.ANTIALIAS,
        # which was removed in Pillow 10.
        thumb = im.resize(new_size, Image.LANCZOS)
    # Save after the source is closed so that img2 may be the same file.
    thumb.save(img2)


def img_rep(cont):
    """
    Remove the ``src`` value of every ``<img src="..." ... />`` tag in a text.

    For each matched source the corresponding file below F:/word_uploads/
    is read and base64-encoded, the source string is removed from the text,
    and a POST request is sent to the source address.
    NOTE(review): the encoded bytes are never used or returned, and the
    purpose of the POST is not visible here - confirm the intent.

    :param cont: text to process
    :return: the text with the matched image sources removed
    """
    import base64
    img_stream = ''
    for src in re.findall("<img src=\"([^\"]+?)\".*?/>", cont):
        img_path = src.split(" ")[0]
        relative_part = img_path.split("ser_static/")[1].replace("\"", "")
        img_local_path = "F:/word_uploads/" + relative_part
        print(img_local_path)
        with open(img_local_path, 'rb') as img_f:
            img_stream = base64.b64encode(img_f.read())
        cont = cont.replace(src, "")
        requests.post(img_path)
    return cont


def return_img_stream(img_local_path):
    """
    Utility function:
    read a local image and return its content base64-encoded.
    :param img_local_path: absolute local path of a single image file
    :return: base64 text of the file content
    """
    import base64
    with open(img_local_path, 'rb') as img_f:
        raw_bytes = img_f.read()
    return base64.b64encode(raw_bytes).decode()