#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Flask front end for the science-exam Word parsing service.

Uploaded ``.doc``/``.docx`` files are stored in numbered sub-directories of
``UPLOAD_FOLDER``.  When a request carries a ``callback_url`` the file is put
on a queue and parsed by a separate worker process, which POSTs the result to
that URL.  Without a callback URL the file is parsed inside the request and
the result is rendered as an HTML table.

Response convention: ``errcode`` 0 means success, ``errcode`` 1 means failure
and ``errmsg`` carries the failure message.
"""
# NOTE: the import order below is deliberate.  The wildcard imports may
# re-export some of these names, so the explicit stdlib imports come first
# (the wildcard wins, exactly as before) and the relative order of the
# original imports is unchanged.
import json
import mimetypes
import os
import re
import time
import logging
from flask import Flask, render_template
from flask import request, redirect, Response
from flask_cors import *
from multiprocessing import Process, Queue
from server_tools import *
import pandas as pd
import base64
from six.moves import urllib

logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
LOG_DIR = r'./logs'
# The file handler opens lazily (delay=True); without the directory the first
# log call would fail, so make sure it exists up front.
if not os.path.isdir(LOG_DIR):
    os.makedirs(LOG_DIR)
log_file = os.path.join(LOG_DIR, 'parse_log.txt')  # log file location
handler = logging.FileHandler(log_file, mode='a', encoding='utf-8', delay=True)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

# NOTE(review): debug mode enables the interactive debugger, and the server
# binds to 0.0.0.0 below.  The previous behaviour (debug on) is kept as the
# default; set WORD_PARSE_DEBUG=0 to switch it off for any exposed deployment.
DEBUG = os.environ.get("WORD_PARSE_DEBUG", "1") != "0"

app = Flask(__name__)
app.debug = DEBUG
CORS(app, supports_credentials=True)

# Upload location; the wordbin service writes its generated files here too.
UPLOAD_FOLDER = r'D:\zwj\word_uploads'
if not os.path.isdir(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)

ALLOWED_EXTENSIONS = (".doc", ".docx")

# One header per value collected for each parsed item in _items_to_html().
RESULT_COLUMNS = ["题号", "题型", "题干", "答案", "解析", "错误提示"]


class _UploadRejected(Exception):
    """Raised while storing an upload; ``str(exc)`` is the response body."""


# --------------------------------------------------------------
@app.route('/')
def index():
    """Redirect the site root to the upload page."""
    return redirect("/word_parse")


@app.route('/word_parse')
def upload():
    """Return the inline HTML page used for simple browser uploads.

    NOTE(review): the markup of this page was lost in the reviewed copy; only
    the heading text survived.  The form is a minimal reconstruction that
    posts the file under the ``mydata`` field read by ``do_upload`` — confirm
    against the deployed page.
    """
    return '''
<h3>欢迎使用理科试题word解析</h3>
<form action="/word_parse" method="post" enctype="multipart/form-data">
    <input type="file" name="mydata" accept=".doc,.docx">
    <input type="submit">
</form>
'''


def _post_callback(callback_url, res):
    """POST the parse result to ``callback_url`` as JSON; failures are logged, not raised."""
    try:
        headers = {'Content-Type': 'application/json', }
        # Encode explicitly: a str body containing non-ASCII text may be
        # encoded as latin-1 by the HTTP stack and fail.
        r = requests.post(callback_url,
                          data=json.dumps(res, ensure_ascii=False).encode("utf-8"),
                          headers=headers)
        print("\n------post 回调地址===> {} -------\n".format(r.status_code))
        print("\n------post 返回结果===> {} -------\n".format(r.text))
    except TimeoutError:
        print("回调超时")
    except Exception as e:
        logger.exception("callback to %s failed", callback_url)
        print(e)
        print("回调出错")


def _handle_queued_file(filename, filename_root, callback_url, flag, sid):
    """Parse one queued file, send the result to the callback and store it."""
    logger.info('Get filename={}, filename_root={}, callback_url={}, is_cloud ={},sid ={} from queue\n'
                .format(filename, filename_root, callback_url, flag, sid))
    stime1 = time.time()
    try:
        res, wordbin_time, paper_type, img_upload_time = parse_word(filename, filename_root, flag, sid)
        etime1 = time.time()
        logger.info("\n----wordbin服务所占时间:{}".format(wordbin_time))
        logger.info("\n----整个解析所占时间:{}".format(etime1 - stime1))
        logger.info("\n----该份试卷的格式类型:{}".format(paper_type))
        if img_upload_time:
            logger.info("\n----图片上传时间:{}".format(img_upload_time))
    except Exception:
        # Keep the traceback: every failure is reported to the caller as a
        # timeout, so the log is the only place the real cause shows up.
        logger.exception("parse_word failed for %s", filename)
        print("--------have callback_url,but time out of parse_word-------")
        res = {"time_out": 90, "id": 0, "errcode": 1, "errmsgs": "解析超时,请重试"}
        print("------------parse is fail---------------")

    _post_callback(callback_url, res)

    if "items" in res and res["items"]:
        # Store the result under the upload id embedded in the callback URL
        # ("wid/<digits>"), or under '0000' when there is none.
        wid_match = re.search(r"wid/(\d+)/?", str(callback_url))
        upload_id = wid_match.group(1) if wid_match else '0000'
        save_fname = save_post_file(res, filename, upload_id)
        logger.info("\n----{}解析完保存的数据文件名:\n{}".format(os.path.basename(filename), save_fname))


def read(cache):
    """Worker loop: take queued uploads one at a time and process them.

    Intended to run forever in its own process (see the ``__main__`` guard).

    Args:
        cache: queue of ``[filename, filename_root, callback_url, flag, sid]``
            entries, as put by ``do_upload``.
    """
    while True:
        job = cache.get(True)
        # qsize() after get() is already the number of entries still waiting.
        print("\n+++++++++++++++还有{}缓存解析+++++++++++++++\n".format(cache.qsize()))
        try:
            filename, filename_root, callback_url, flag, sid = job
            print("\n------callback_url===> {} -------\n".format(callback_url))
            _handle_queued_file(filename, filename_root, callback_url, flag, sid)
        except Exception:
            # One bad entry must not kill the only worker; log and carry on.
            logger.exception("unexpected error while handling queued job %r", job)


def _allocate_upload_dir():
    """Create the next numbered directory under ``UPLOAD_FOLDER``.

    Non-numeric entries are ignored.  The directory is claimed by creating it,
    so two concurrent requests cannot end up with the same index.

    Returns:
        tuple: ``(index, path)`` of the newly created directory.
    """
    taken = [int(name) for name in os.listdir(UPLOAD_FOLDER) if name.isdecimal()]
    index = max(taken) + 1 if taken else 0
    while True:
        path = os.path.join(UPLOAD_FOLDER, str(index))
        try:
            os.makedirs(path)
        except FileExistsError:
            index += 1
        else:
            return index, path


def _store_uploaded_file(upfile):
    """Save a multipart-uploaded Word file (old interface).

    Returns:
        tuple: ``(filename, file_root)`` — the saved file's path and its directory.

    Raises:
        _UploadRejected: if the extension is not ``.doc``/``.docx``.
    """
    original_name = upfile.filename or ""
    print("\n------接受文件名==> {}-------\n".format(os.path.basename(original_name)))
    extension = os.path.splitext(original_name)[1]
    if extension.lower() not in ALLOWED_EXTENSIONS:
        raise _UploadRejected("only accept .doc .docx files")
    index, file_root = _allocate_upload_dir()
    # Store under "<index><ext>" so the path does not depend on the client's file name.
    upfile.filename = str(index) + extension
    filename = os.path.join(file_root, upfile.filename)
    upfile.save(filename)
    return filename, file_root


def _store_remote_file(l_data):
    """Download the Word file named by a JSON ``mydata`` field (new interface).

    Args:
        l_data: JSON text of the form ``[{"file_url": "..."}]``.

    Returns:
        tuple: ``(filename, file_root)`` — the downloaded file's path and its directory.

    Raises:
        _UploadRejected: if the payload is malformed, the extension is not
            allowed, or the download fails.
    """
    print("l_data:", l_data)
    try:
        upfile_url = json.loads(l_data)[0]["file_url"]
    except (TypeError, ValueError, KeyError, IndexError):
        upfile_url = None
    if not isinstance(upfile_url, str):
        raise _UploadRejected(json.dumps(
            {"errcode": 1,
             "errmsg": "invalid mydata: expected a .doc/.docx file or JSON like [{\"file_url\": \"...\"}]"},
            ensure_ascii=False))
    print("文件地址:", upfile_url)
    print("\n------接受文件名==> {}-------\n".format(os.path.basename(upfile_url)))
    parsed_url = urllib.parse.urlparse(upfile_url)
    # Take the extension from the URL path so a query string does not hide it.
    extension = os.path.splitext(parsed_url.path)[1]
    if extension.lower() not in ALLOWED_EXTENSIONS:
        raise _UploadRejected("only accept .doc .docx files")
    # The URL is client-supplied: only fetch over HTTP(S), never file:// etc.
    if parsed_url.scheme not in ("http", "https"):
        raise _UploadRejected("word文件下载失败,请重新上传")
    index, file_root = _allocate_upload_dir()
    filename = os.path.join(file_root, str(index) + extension)
    try:
        urllib.request.urlretrieve(upfile_url, filename)
    except Exception:
        logger.exception("download of %s failed", upfile_url)
        raise _UploadRejected("word文件下载失败,请重新上传")
    print('------word Successfully downloaded-----')
    return filename, file_root


def _configure_pandas_display():
    """Lift pandas' display limits so ``to_html`` does not truncate cell content."""
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('expand_frame_repr', True)  # allow wrapping
    pd.set_option('display.width', -1)
    try:
        # Current pandas spells "no limit" as None and rejects -1 ...
        pd.set_option('display.max_colwidth', None)
    except ValueError:
        # ... while old releases only accept an int.
        pd.set_option('display.max_colwidth', -1)


def _items_to_html(items):
    """Render parsed items as an HTML table.

    Each item's ``options`` list (when present) is rewritten in place with
    "A、", "B、", ... prefixes, so JSON produced from the same items afterwards
    carries the lettered options as well.

    NOTE(review): the separators in the options text appeared as raw line
    breaks in the reviewed copy (an unterminated string literal).  ``<br>`` is
    used because the table is emitted with ``escape=False`` — confirm against
    the deployed file.
    """
    rows = []
    for item in items:
        row = [item["item_id"], item["type"], item["stem"], item["key"],
               item["analysis"], item["text_errmsgs"]]
        if "options" in item:
            item["options"] = [chr(65 + k) + '、' + opt for k, opt in enumerate(item["options"])]
            row[2] += "<br>===================<br>【选项】<br>" \
                      + ";<br>".join(item["options"]).replace("<br>;", ";")
        rows.append(row)
    table = pd.DataFrame(rows, columns=RESULT_COLUMNS)
    _configure_pandas_display()
    return table.to_html(header=True, index=False,
                         escape=False,  # cell content is already HTML
                         justify="center")


def _parse_inline(filename):
    """Parse ``filename`` inside the request (simple web upload) and build the response."""
    stime1 = time.time()
    try:
        res, wordbin_time, paper_type, img_upload_time = parse_word(filename, None)
        etime1 = time.time()
        print("------------简易接口上传,解析成功---------------")
        logger.info("\n------------简易接口上传,解析成功---------------")
        logger.info("\n开始解析文件:{}".format(filename))
        logger.info("\n----wordbin服务所占时间:{}".format(wordbin_time))
        logger.info("\n----整个解析所占时间:{}".format(etime1 - stime1))
        logger.info("\n----该份试卷的格式类型:{}".format(paper_type))
    except Exception:
        logger.exception("parse_word failed for %s", filename)
        print("------------简易接口上传,解析失败---------------")
        res = {"time_out": 3, "errcode": 1,
               "errmsg": "word读取失败。请尝试:\n 1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式;\n"
                         "2.请将word中内容复制到新word上传;\n 3. 请将word题目拆分为多个word上传。"
                         "\n 4.请检查图片格式,将word中图片改为嵌入式后再上传。"}
    save_fname = save_post_file(res, filename, '0011')
    logger.info("\n----{}解析完保存的数据文件名【简易上传】:\n{}\n".format(os.path.basename(filename), save_fname))

    if "items" in res and res["items"]:
        # Build the table first: it rewrites each item's options, and the JSON
        # below is expected to contain the rewritten form.
        table_html = _items_to_html(res["items"])
        return render_template('res.html',
                               items=[table_html],
                               res_json=json.dumps(res, ensure_ascii=False))
    return json.dumps(res, ensure_ascii=False)


# noinspection PyTypeChecker
@app.route('/word_parse', methods=["POST"])
def do_upload():
    """Receive a Word file plus parameters, store it and schedule or run parsing.

    Form fields: ``mydata``, ``callback_url``, ``is_cloud``, ``sid``.

    ``mydata`` is either the Word file itself (old interface, used by the
    cloud question bank) or JSON text ``[{"file_url": ...}]`` naming a file to
    download (new interface).

    Returns:
        With ``callback_url``: JSON ``{"errcode": 0, "errmsg": "OK",
        "docsbefore": n}`` once the file is queued.  Without it: the rendered
        result page, or the result JSON when no items were parsed.  A rejected
        upload returns a short message instead.
    """
    logger.info("==request.POST.dict==>{}\n".format(request.form.to_dict()))
    callback_url = request.form.get('callback_url', "")
    flag = request.form.get('is_cloud', "0")  # whether images in the document are uploaded to the cloud
    sid = request.form.get('sid', "0")  # parameter from the admin / teacher clients

    upfile = request.files.get('mydata')
    try:
        if upfile is not None:
            filename, file_root1 = _store_uploaded_file(upfile)
        else:
            filename, file_root1 = _store_remote_file(request.form.get('mydata'))
    except _UploadRejected as rejected:
        return str(rejected)

    if not callback_url:
        # No callback URL: simple web upload, parse right away.
        return _parse_inline(filename)

    # Hand the file to the parsing process.  ``cache_file`` is the
    # module-level queue created in the ``__main__`` guard.
    cache_file.put([filename, file_root1, callback_url, flag, sid])
    docs_before = int(cache_file.qsize()) - 1
    print("-----当前缓存还有{}-----".format(docs_before))
    return json.dumps({"errcode": 0, "errmsg": "OK", "docsbefore": docs_before},
                      ensure_ascii=False)


# NOTE(review): the URL variable was missing from the rule in the reviewed copy
# although the view takes ``file_path``; a ``path`` converter is assumed so
# nested paths match — confirm against the deployed route.
@app.route('/ser_static/<path:file_path>', methods=["GET"])
def ser_static(file_path):
    """Serve a locally stored file (images for the result page).

    Not used when images are uploaded to the cloud.

    Args:
        file_path: path of the file, resolved against ``UPLOAD_FOLDER``.

    Returns:
        The file content, or a 404 response when the path leaves
        ``UPLOAD_FOLDER`` or is not an existing file.
    """
    target = os.path.join(UPLOAD_FOLDER, file_path)
    # The path is client-supplied: refuse anything that resolves outside the
    # upload folder ("..", absolute paths, links).
    root = os.path.normcase(os.path.realpath(UPLOAD_FOLDER))
    resolved = os.path.normcase(os.path.realpath(target))
    if not resolved.startswith(root + os.sep) or not os.path.isfile(resolved):
        return Response("not found", status=404)
    if "image" in target:
        resize_img(target, target)
    with open(target, 'rb') as stream:
        payload = stream.read()
    # An unknown type yields None, which keeps Response's default mimetype.
    return Response(payload, mimetype=mimetypes.guess_type(target)[0])


if __name__ == "__main__":
    # Queue of files waiting to be parsed; do_upload() reads this global.
    cache_file = Queue()
    # Process that takes files off the queue and parses them.
    pr_parse = Process(target=read, args=(cache_file,))
    pr_parse.start()
    print("parse pid:", pr_parse.pid)
    app.run(host="0.0.0.0", port=11088, threaded=True, debug=DEBUG)