#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import re
import json
import datetime
import time
import random
import hashlib
# import traceback
import requests
from retrying import retry
from PIL import Image
from flask import make_response
from func_timeout import func_set_timeout
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
# from photo_upload import upload_replace_image, upload_img_to_ucloud
from photo_upload_qcloud import upload_replace_image, upload_img_to_ucloud
from structure.structure_mian import WordParseStructure
# Folder in which save_post_file stores the JSON parse results.
RES_FOLDER = r'D:\zwj\word_non-temp_paper_structure\res_folder'
# NOTE(review): server_ip / server_file_port are not referenced in the visible
# part of this file; presumably the address of a file server — confirm usage.
server_ip = "49.233.23.58"
server_file_port = "11088"
# Executable launched by start_word2html_app (the wordbin HTML generation tool).
wordbin_path = r"D:\word_bin\ConsoleApplication1.exe"
def pin(dirpath):
    """Rename the file at ``dirpath`` to a unique ASCII-only name.

    The base name is transliterated to pinyin, stripped of brackets, ``+``,
    ``-`` and whitespace, then split into stem and extension at the LAST dot.
    The stem plus a random number is MD5-hashed and the file is renamed inside
    its own directory to ``<md5hex>__<YYYY_mm_dd_HH_MM_SS>.<ext>``.

    :param dirpath: path of the file to rename
    :return: path of the renamed file
    :raises OSError: if the rename itself fails
    """
    from xpinyin import Pinyin

    folder, original_name = os.path.split(dirpath)

    cleaned = str(Pinyin().get_pinyin(original_name, '_'))
    cleaned = re.sub(r"[(())+\-]", "", cleaned)
    cleaned = re.sub(r"\s", "", cleaned)

    # Split at the last dot only. The previous `b, h = name.split(".")`
    # raised ValueError for names with no dot or with more than one dot
    # (e.g. "paper.v2.docx").
    stem, dot, extension = cleaned.rpartition(".")
    if not dot:
        stem, extension = cleaned, ""

    time_str = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    # The random suffix makes the digest unique even for identical file names.
    seed = (stem + '__' + str(random.random())).encode("utf-8")
    aft_name = hashlib.md5(seed).hexdigest() + '__' + time_str
    if dot:
        aft_name += '.' + extension
    print("aft_name-----", aft_name)

    renamed_path = os.path.join(folder, aft_name)
    os.rename(os.path.join(folder, original_name), renamed_path)
    return renamed_path
# HTML generation tool: wordbin
@retry(stop_max_attempt_number=2, wait_fixed=1)  # at most 2 attempts; the error is raised only if both fail
def call_c_shape(doc_file):
    """Ask the local wordbin HTTP service to process ``doc_file``.

    Issues a GET request to ``http://localhost:9001/word/`` with the path
    placed verbatim in the ``name`` query value and a 70 second timeout.

    :param doc_file: path of the document, inserted as-is into the URL
    :return: the ``requests`` response object
    """
    request_url = r"http://localhost:9001/word/?name={}".format(doc_file)
    return requests.get(request_url, timeout=70)
def start_word2html_app(kill_mathtype=True):
    """Force-kill Word and the wordbin executable, then launch wordbin again.

    :param kill_mathtype: when true, MathType.exe is force-killed first
    """
    shell_commands = []
    if kill_mathtype:
        shell_commands.append("taskkill /f /im MathType.exe")
    shell_commands.append("taskkill /f /im WINWORD.EXE")
    shell_commands.append("taskkill /f /im ConsoleApplication1.exe")
    # `start` opens the program in a new window.
    shell_commands.append("start {}".format(wordbin_path))

    for shell_command in shell_commands:
        os.system(shell_command)
def check_fault_pid():
    """Detect leftover Word / crash-report processes and restart wordbin.

    ``tasklist`` is queried for ``WINWORD.EXE`` first; if its output is not
    the "no matching task" message, wordbin is restarted through
    ``start_word2html_app``. Otherwise the same check is made for
    ``WerFault.exe``, which is additionally force-killed before the restart.

    :return: 1 if a restart was triggered, 0 otherwise
    """
    # Compared verbatim against the tasklist output; presumably the message a
    # Chinese-locale Windows prints when no task matches, so the check is
    # locale dependent.
    no_task_msg = "信息: 没有运行的任务匹配指定标准。"

    if _tasklist_output("WINWORD.EXE") != no_task_msg:
        print("++++出现office word 宏提醒,开始kill ++++")
        start_word2html_app()
        return 1
    print("-----没有word问题报告弹窗------")

    if _tasklist_output("WerFault.exe") != no_task_msg:
        print("++++出现《问题报告》弹窗,开始kill ++++")
        os.system("taskkill /f /im WerFault.exe")
        start_word2html_app()
        return 1
    print("-----没有wordbin问题报告弹窗------")
    return 0


def _tasklist_output(image_name):
    """Return the stripped text printed by ``tasklist`` filtered on ``image_name``."""
    # The context manager closes the pipe; previously both popen handles
    # were left open.
    with os.popen('tasklist /fi "imagename eq {}"'.format(image_name)) as pipe:
        return str(pipe.read()).strip()
def get_html(doc_file):  # doc_file: absolute path of the document
    """Call wordbin to convert ``doc_file`` and return the generated HTML.

    wordbin is called synchronously (the original notes say it does not
    support concurrent use). On success the ``*_clean.html`` file next to the
    document is read back.

    :param doc_file: absolute path of the ``.doc`` / ``.docx`` file
    :return: tuple ``(html_or_error_message, seconds)``; the first item is
        the HTML text, or ``"html文件生成失败"`` / ``"试卷格式有问题"`` on
        failure; the second is the time spent in the wordbin call (0 when
        the call itself failed)
    """
    started = time.time()
    try:
        r = call_c_shape(doc_file)
        # r.text is a str, so the previous `r.text == 4` was always False and
        # this failure branch could never run. Presumably "4" is wordbin's
        # failure code — confirm against the service.
        if str(r.text).strip() == "4":
            check_fault_pid()
            return "html文件生成失败", 0
    except Exception:
        # Request failed or retries were exhausted: clean up stray processes
        # and make sure wordbin is restarted before reporting the error.
        is_kill = check_fault_pid()
        if not is_kill:
            start_word2html_app(kill_mathtype=True)
        return "试卷格式有问题", 0
    elapsed = time.time() - started

    try:
        html_path = doc_file.replace(".docx", "_clean.html").replace(".doc", "_clean.html")
        # `with` closes the file; it used to be left open.
        with open(html_path, 'r', encoding="utf-8") as html_file:
            return html_file.read(), elapsed
    except Exception:
        # The HTML may also be missing because the conversion timed out.
        return "试卷格式有问题", elapsed
# Parse one uploaded Word document: obtain its HTML through get_html, optionally
# replace image references through upload_replace_image, and return the tuple
# (res, wordbin_time, paper_type, img_upload_time).
# Wrapped by func_set_timeout(120) — presumably a 120 second limit; confirm the unit.
# NOTE(review): the body is incomplete in this view. Between the inner `else:`
# and the `except:` near the end, the matching `try:` and the structure-parsing
# code are missing (the `else:` has no statements and the `except:` has no `try:`).
# Restore the lost lines from the original file before editing this function.
@func_set_timeout(120)
def parse_word(doc_file, filename_root, flag=0, sid=0):
print("-------解析的文件名为: {} -------".format(doc_file))
html, wordbin_time = get_html(doc_file)
print('解析中wordbin服务时间:', wordbin_time)
img_upload_time = 0
# get_html signals failure by returning one of these two messages instead of HTML.
if html in ["html文件生成失败", "试卷格式有问题"]:
res = {"errcode": 1,
"errmsgs": "word读取失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n2.请尝试将内容粘贴复制到一份空白word再上传\n"
"3.若上传文档题目太多,请分两次上传\n4.重点检查图片(设为嵌入式)和换行"}
paper_type = "wordbin报错,未知"
# log_f.write("\n----{}解析中wordbin服务时间:{}".format(file_name, wordbin_time))
else:
num = int(os.path.split(os.path.split(doc_file)[0])[1]) # name of the folder containing the word file, parsed as an int
# todo if flag==1 --> upload then upload image and replaced online address; else static
# http://zxhx-test + .cn-bj.ufileos.com + / + teacher/uploadfiles/wording + / 52 / 2020/04/21 + / + 5e9ea1ec2e28f.png"""
# When flag is set, sid must be present as well; images are uploaded online.
# print('flag:', flag)
images_url = ""
put_key_list, localfile_list = [], []
if str(flag) == "1":
# Replace the image references in the HTML.
data = upload_replace_image(filename_root, sid, html)
if isinstance(data, dict): # image replacement failed; the dict is returned as the result
print("图片替换线上地址失败")
res, paper_type = data, "图片替换线上地址失败,未知"
return res, wordbin_time, paper_type, img_upload_time
else:
images_url, put_key_list, localfile_list = data
else:
# NOTE(review): the next comment line is garbled and the code that followed it is missing.
# html = re.sub(r':{}".format(etime3 - stime3))
except:
# Fallback result returned when parsing raises.
res = {"errcode": 1,
"errmsgs": "解析失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n"
"2.若上传文档题目太多,请分两次上传\n3.重点检查图片(设为嵌入式)和换行"}
paper_type = "解析报错,未知"
print("\n+++++++++解析结果结束++++++++++\n")
return res, wordbin_time, paper_type, img_upload_time
def save_post_file(parse_res, fname, id, res_folder=None):
    """Save the parse result delivered by a callback as a JSON file.

    The file is named
    ``<md5hex>__<YYYY_mm_dd_HH_MM_SS>__<id>__<stem>.json`` where ``stem`` is
    the base name of ``fname`` without its last extension and the MD5 digest
    is taken over the stem plus a random number.

    :param parse_res: JSON-serialisable parse result
    :param fname: name or path of the source document
    :param id: identifier embedded in the output file name
    :param res_folder: target folder; defaults to the module-level
        ``RES_FOLDER`` and is created if missing
    :return: path of the written JSON file
    """
    base_name = os.path.basename(str(fname))
    # Split at the last dot only. The previous `b, h = name.split(".")`
    # raised ValueError for names with no dot or more than one dot.
    stem, dot, _extension = base_name.rpartition(".")
    if not dot:
        stem = base_name

    time_str = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    seed = (stem + '__' + str(random.random())).encode("utf-8")
    out_name = (hashlib.md5(seed).hexdigest() + '__' + time_str + '__'
                + str(id) + '__' + stem + '.json')

    folder = RES_FOLDER if res_folder is None else res_folder
    os.makedirs(folder, exist_ok=True)
    new_fpath = os.path.join(folder, out_name)
    # `with` flushes and closes the file; it used to be left open.
    with open(new_fpath, 'w', encoding='utf-8') as out_file:
        json.dump(parse_res, out_file)
    return new_fpath
def Response_headers(content):
    """Build a Flask response for ``content`` carrying the CORS headers.

    :param content: value handed to ``make_response``
    :return: the response with Allow-Headers, Allow-Methods and
        Allow-Origin (``*``) set
    """
    response = make_response(content)
    cors_headers = (
        ('Access-Control-Allow-Headers', "x-requested-with,content-type,Authorization"),
        ('Access-Control-Allow-Methods', "POST,GET,OPTIONS"),
        ('Access-Control-Allow-Origin', '*'),
    )
    for header_name, header_value in cors_headers:
        response.headers[header_name] = header_value
    return response
# Resize an image
def resize_img(img1, img2, max_width=4000):
    """Downscale an image wider than ``max_width`` pixels, keeping its aspect ratio.

    Images that are not wider than ``max_width`` are left untouched and
    nothing is written. According to the original note both paths are the
    same absolute image path and the function is currently unused (images
    are no longer compressed).

    :param img1: source image path, e.g. D://upload/23.png
    :param img2: destination image path, e.g. D://upload/23.png
    :param max_width: width limit in pixels (default 4000)
    :return: None
    """
    # `with` releases the source file handle; it used to stay open.
    with Image.open(img1) as im:
        width, height = im.size
        if width <= max_width:  # 2020/3/20+
            return
        ratio = max_width / width
        # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
        # filter under its current name.
        thumb = im.resize((max_width, max(1, int(height * ratio))), Image.LANCZOS)
    # Saved after the source is closed, since img2 may be the same file as img1.
    thumb.save(img2)
def img_rep(cont):
"""
Process the local images referenced in a text.

NOTE(review): the regex on the findall line is an empty string and the
replacement passed to ``cont.replace`` is empty too; both literals look like
they were lost (possibly an HTML tag stripped from the source). As written,
every match is an empty string. Restore the original literals before use.

:param cont: text to scan
:return: ``cont`` after the replacements (the code returns the text, not an image stream)
"""
import base64
img_stream = ''
# Empty pattern: see the NOTE(review) in the docstring.
all_photo = re.findall("", cont)
if all_photo:
for src in all_photo:
# First space-separated token of the match is taken as the image path/URL.
img_path = src.split(" ")[0]
# Map the part after "ser_static/" onto the local upload folder, dropping double quotes.
img_local_path = "F:/word_uploads/" + img_path.split("ser_static/")[1].replace("\"", "")
print(img_local_path)
with open(img_local_path, 'rb') as img_f:
img_stream = img_f.read()
# The base64 bytes are computed here but not used by any later line in this function.
img_stream = base64.b64encode(img_stream)
cont = cont.replace(src, "")
# POST to the image path with no payload; the response is ignored.
requests.post(img_path)
return cont
def return_img_stream(img_local_path):
    """
    Utility function:
    read a local image file and return its content as base64 text.

    :param img_local_path: absolute local path of a single image file
    :return: base64-encoded file content, decoded to ``str``
    """
    from base64 import b64encode

    with open(img_local_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    return b64encode(raw_bytes).decode()