123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524 |
- import datetime
- import json
- import os
- import random
- import time
- import traceback
- from glob import glob
- from pprint import pprint
- from time import sleep
- from multiprocessing import Process, Queue
- from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
- import requests
- from bottle import redirect
- from bottle import request
- from bottle import route, run
- from bottle import static_file
- from bottle import error
- from func_timeout import func_set_timeout
- import config
- from parse import Mathtest
- from utils import get_dir_next_num
- import re
- import logging
- from ufile import config as ufile_config
- from ufile import logger as img_log
- from ufile import filemanager
- import hashlib
# Module-level logger: records at INFO and above are appended to
# ../logs/parse_log.txt.
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)

log_dir = '../logs'
try:
    # exist_ok makes the common "directory already exists" case a no-op
    # instead of an exception that has to be swallowed.
    os.makedirs(log_dir, exist_ok=True)
except OSError:
    # Still best effort: an uncreatable log directory must not prevent the
    # module from loading.
    pass

log_file = os.path.join(log_dir, 'parse_log.txt')
# log_file = os.path.join(log_dir, 'parse_time.txt')
# delay=True: the file is opened when the first record is emitted, not here.
handler = logging.FileHandler(log_file, mode='a', encoding='utf-8', delay=True)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.info("Start print log")
- # logger.debug("Do something")
- # logger.warning("Something maybe fail.")
- # logger.info("Finish")
# Response convention used by this service:
#   errcode=0  success
#   errcode=1  failure
#   errmsg     the specific failure message
# Directory that receives uploaded Word documents (one numbered sub-directory
# per upload, see do_upload).
save_path = '../upload'
# exist_ok closes the check-then-create race of isdir() + mkdir().
os.makedirs(save_path, exist_ok=True)

# TODO: local image URL replacement is used when flag=0
src_pat = re.compile(r'src\s*=\s*"files')

# NOTE(review): these UCloud credentials are committed in source. They can now
# be overridden through the environment; the literals remain only as the
# backward-compatible default and should be rotated and removed.
public_key = os.environ.get('UCLOUD_PUBLIC_KEY', 'ucloudyunkaopei@outlook.com13615403931104805307')
private_key = os.environ.get('UCLOUD_PRIVATE_KEY', 'bcfd5bb66ca527c9be9fd7f3e784fbfc90c4bba5')

image_upload_log = '../logs/image_log.txt'  # image upload log
img_log.set_log_file(image_upload_log)
addr = '.cn-bj.ufileos.com'  # suffix appended to the bucket name to form the host
ufile_config.set_default(uploadsuffix=addr)

# TODO: the production bucket is 'zxhx'
# public_bucket = 'zxhx'  # public bucket name
# TODO: the online test bucket is 'zxhx-test'
public_bucket = 'zxhx-test'
# private_bucket = ''  # private bucket name
@route('/')
def index():
    """Redirect requests for the site root to the upload form at ``/upload``."""
    target = "/upload"
    return redirect(target)
- #
- # @route('/hello')
- # def hello():
- # return "hello"
- # 文件上传的HTML模板,这里没有额外去写html模板了,直接写在这里,方便点吧
@route('/upload')
def upload():
    """Return the HTML upload form.

    The markup is kept inline instead of in a template file. The form posts a
    single file field named ``mydata`` to ``/upload`` as multipart form data.
    """
    form_page = '''
    <html>
        <head>
        </head>
        <body>
            <form action="/upload" method="post" enctype="multipart/form-data">
                <input type="file" name="mydata" />
                <input type="submit" value="Upload" />
            </form>
        </body>
    </html>
    '''
    return form_page
def call_c_shape(doc_file):
    """Request conversion of *doc_file* from the local service on port 9001.

    The path is passed in the ``name`` query parameter and the request uses
    ``config.call_c_shape_time_out`` as its timeout. Returns the ``requests``
    response object; any exception raised by ``requests.get`` propagates.
    """
    endpoint = r"http://localhost:9001/word/?name={}".format(doc_file)
    response = requests.get(endpoint, timeout=config.call_c_shape_time_out)
    return response
def start_word2html_app(kill_mathtype=True):
    """Force-kill the converter-related processes and relaunch the converter.

    WINWORD.EXE and ConsoleApplication1.exe are always killed; MathType.exe is
    killed first when *kill_mathtype* is true. Afterwards the executable named
    by ``config.word2html_exe`` is launched with the Windows ``start`` command.
    """
    # NOTE(review): the original indentation was lost; this assumes only the
    # MathType kill was guarded by kill_mathtype -- confirm.
    images_to_kill = ["WINWORD.EXE", "ConsoleApplication1.exe"]
    if kill_mathtype:
        images_to_kill.insert(0, "MathType.exe")

    for image_name in images_to_kill:
        os.system("taskkill /f /im {}".format(image_name))

    # "start" opens the program in a new window.
    os.system("start {}".format(config.word2html_exe))
def check_pid():
    """Check whether WINWORD.EXE is still running and restart the converter if so.

    Runs ``tasklist`` filtered on WINWORD.EXE. Unless the output is exactly the
    "no matching task" message, the converter stack is killed and relaunched
    through ``start_word2html_app()``.
    """
    command = 'tasklist /fi "imagename eq WINWORD.EXE"'
    # Close the pipe deterministically; the original leaked it on every call.
    with os.popen(command) as pipe:
        info = pipe.read()

    # NOTE(review): this compares against the Chinese-locale tasklist message,
    # so on any other Windows locale the restart branch is always taken.
    if str(info).strip() != "信息: 没有运行的任务匹配指定标准。":
        print("++++再补一刀++++")
        start_word2html_app()
    else:
        print("-----office is killed------")
@func_set_timeout(90)
def parse_word(doc_file, filename_root, flag=0, sid=0):
    """Convert a Word file to HTML and parse it into the JSON question structure.

    Args:
        doc_file: Path of the uploaded Word file. Its parent directory name
            must be an integer (the upload number).
        filename_root: Directory that holds the converter output; only used
            when *flag* is "1".
        flag: When ``str(flag) == "1"`` images are uploaded to UCloud and the
            HTML is rewritten to the online URLs; otherwise image sources are
            rewritten to this server's ``/static`` route.
        sid: Identifier embedded in the UCloud object key.

    Returns:
        ``Mathtest(html).tojson()`` on success, or an ``{"errcode": 1, ...}``
        dict when the conversion or the image upload fails.

    The whole call is limited to 90 seconds by ``func_set_timeout``.
    """
    read_failed = {"errcode": 1,
                   "errmsg": "word读取失败。请尝试:\n 1.请将word中图片改为嵌入式,然后上传。\n 2. 请将word题目拆分为多个word上传。\n 3.请将word中内容复制到新word上传。"}

    time4 = time.time()
    try:
        r = call_c_shape(doc_file)
    except Exception:
        # Only ordinary failures (connection error, request timeout) are
        # handled here; a bare except would also swallow non-Exception
        # signals such as the one func_timeout uses to stop this call.
        logger.exception("word2html request failed for %s", doc_file)
        r = None
        start_word2html_app()
        print("+++++++++++++++time out of read word++++++++++++++++++")
    time44 = time.time() - time4
    logger.info("---word2html_time---==>" + str(time44))

    if r is None or str(r.text) == "4":
        # Make sure Office really is dead; kill it again if not.
        check_pid()
        return read_failed
    print("+++++++word to html, r={}, r.text={}++++++++".format(r, r.text))

    clean_html_file = os.path.splitext(doc_file)[0] + "_clean.html"
    if not os.path.isfile(clean_html_file):
        # The converter answered but produced no output file.
        check_pid()
        return read_failed

    with open(clean_html_file, "r", encoding="utf-8") as f:
        html = f.read()

    num = int(os.path.split(os.path.split(doc_file)[0])[1])
    ip = config.external_ip

    # Online URL layout:
    # http://zxhx-test + .cn-bj.ufileos.com + / + teacher/uploadfiles/wording
    #   + / 52 / 2020/04/21 + / + 5e9ea1ec2e28f.png
    # When flag is set, sid is expected to be set as well.
    if str(flag) == "1":
        print("+++++++++开始上传图片到Ucloud,并替换成线上地址++++++++++")
        data = upload_replace_image(filename_root, sid, html)
        if isinstance(data, dict):
            return data
        html, put_key_list, localfile_list = data
        if localfile_list:
            time3 = time.time()
            try:
                with ProcessPoolExecutor(5) as pool:
                    # Consuming the iterator re-raises any worker exception;
                    # the original discarded it, so failed uploads went
                    # unnoticed and the HTML pointed at missing images.
                    list(pool.map(upload_img_to_ucloud, zip(put_key_list, localfile_list)))
            except Exception:
                logger.exception("image upload to UCloud failed")
                return {"errcode": 1,
                        "errmsg": "word图片上传失败。"}
            time33 = time.time() - time3
            logger.info("---img_upload_time==>:{}".format(str(time33)))
    else:
        html = src_pat.sub(r'src="http://{}:{}/{}/{}/files'.format(ip, config.file_port, "static", num), html)

    m = Mathtest(html)
    return m.tojson()
def get_md5(image):
    """Return a unique hashed file name for *image* that keeps its extension.

    The part of the name before the last dot is salted with the current time
    and a random number, MD5-hashed, and the original extension is appended.
    The hash is only used to make object names unique, not for security.

    Names with several dots keep only the last component as the extension,
    and names without a dot yield the bare hash; both used to raise
    ValueError.
    """
    stem, dot, extension = str(image).rpartition(".")
    if not dot:
        # No dot at all: rpartition puts the whole name in the last slot.
        stem, extension = extension, ""

    salted = str(stem) + str(time.time()) + str(random.random())
    # hashlib works on bytes, so the text is encoded first.
    digest = hashlib.md5(salted.encode("utf-8")).hexdigest()
    return digest + dot + extension
- # todo 上传图片进程
def upload_img_to_ucloud(param_ucloud):
    """Upload one local image to the public UCloud bucket.

    Args:
        param_ucloud: ``(put_key, localfile)`` pair, packed into one argument
            so the function can be used with ``executor.map``.

    Raises:
        AssertionError: if the response status code is not 200.
    """
    put_key, localfile = param_ucloud
    putufile_handler = filemanager.FileManager(public_key, private_key)
    # Plain upload into the public bucket.
    _ret, resp = putufile_handler.putfile(public_bucket, put_key, localfile, header=None)
    # Raised explicitly (same exception type as before) so the check is not
    # stripped when Python runs with -O.
    if resp.status_code != 200:
        raise AssertionError(
            "UCloud upload of {} as {} returned HTTP {}".format(localfile, put_key, resp.status_code))
def upload_replace_image(filename_root, sid, html):
    """Rewrite local ``files/imageN.ext`` references in *html* to UCloud URLs.

    Nothing is uploaded here; the function only plans the upload.

    Args:
        filename_root: Directory containing the converter's ``files`` folder.
        sid: Identifier placed in the object key.
        html: HTML produced by the converter.

    Returns:
        ``(html, put_key_list, localfile_list)`` where the two lists are
        index-aligned object keys and local paths. ``(html, [], [])`` is
        returned when there is no ``files`` directory. An
        ``{"errcode": 1, ...}`` dict is returned when the number of image
        tags and image files differ, when no image file is present, or when
        any step fails.
    """
    return_error = {"errcode": 1,
                    "errmsg": "word图片上传失败。"}
    daytime = datetime.datetime.now().strftime('/%Y/%m/%d/')
    image_path = filename_root + "/files"

    # No "files" directory means the document has no images to handle.
    if not os.path.isdir(image_path):
        return html, [], []

    html = str(html)
    image_tags = re.findall(r'<img\s*src\s*=\s*"files/image', html)
    local_images_path_list = [
        name for name in os.listdir(image_path)
        if str(name).endswith((".png", ".gif", ".jpeg", ".jpg"))
    ]
    if len(image_tags) != len(local_images_path_list):
        return return_error

    # Order the files by their image number, ascending.
    try:
        local_images_path_list.sort(
            key=lambda x: int(re.search(r"image(\d+)\.[pngifje]+", str(x)).group(1)))
    except (AttributeError, ValueError):
        # A file name without an "image<number>." part.
        return return_error
    logger.info("local_images_path_list==>{}".format(str(local_images_path_list)))

    if not local_images_path_list:
        return return_error

    put_key_list = []
    localfile_list = []
    try:
        for img in local_images_path_list:
            # Match the tag for this exact file. The original built the
            # pattern from the loop index, which points at the wrong tag
            # when numbering has gaps or is zero padded.
            pat = re.compile(r'<img\s*src\s*=\s*"files/{}"'.format(re.escape(str(img))))

            localfile = image_path + "/{}".format(img)
            localfile_list.append(localfile)

            # Object name inside the bucket.
            hash_img = get_md5(img)
            put_key = "teacher/uploadfiles/wording/" + str(sid) + str(daytime) + str(hash_img)
            put_key_list.append(put_key)

            online_image_url = "http://" + str(public_bucket) + str(addr) + "/" + str(put_key)
            # A function replacement inserts the URL literally, so a
            # backslash in sid cannot be misread as a regex escape.
            html = pat.sub(lambda _m, url=online_image_url: '<img src={}'.format(url), html)
        return html, put_key_list, localfile_list
    except Exception:
        logger.exception("failed to rewrite image references")
        return return_error
- # todo 改为多线程去执行
def multi_parse(parma):
    """Parse one queued document and POST the result to its callback URL.

    Args:
        parma: 5-item sequence unpacked as
            ``(filename, filename_root, callback_url, flag, sid)``. The first
            two items are forwarded to ``parse_word`` in swapped order, i.e.
            ``parse_word(filename_root, filename, flag, sid)``.

    Any failure of ``parse_word`` (including its timeout) is turned into an
    ``errcode: 1`` payload; callback delivery errors are logged and ignored.
    """
    filename, filename_root, callback_url, flag, sid = parma
    try:
        time2 = time.time()
        res = parse_word(filename_root, filename, flag, sid)
        time22 = time.time() - time2
        logger.info("---parse_word_time==>:{}".format(str(time22)))
        print("------------parse is successful---------------")
    except BaseException:
        # Deliberately as broad as the original bare except: presumably the
        # timeout raised by func_timeout is not an Exception subclass
        # (confirm for the installed version), so narrowing would let it
        # escape.
        # print_exc() prints by itself; wrapping it in print() added "None".
        traceback.print_exc()
        print("+++++++++++++++have callback_url,but time out of parse_word++++++++++++++++++")
        res = {"time_out": 90,
               "id": 0,
               "errcode": 1,
               "errmsg": "word读取失败。请尝试:\n 1.请将word中图片改为嵌入式,然后上传。\n 2. 请将word题目拆分为多个word上传。\n 3.请将word中内容复制到新word上传。"}
        print("------------parse is fail---------------")

    headers = {'Content-Type': 'application/json', }
    try:
        # json= would also work but escapes non-ASCII characters, so the
        # body is serialised and encoded by hand.
        response = requests.post(callback_url,
                                 data=json.dumps(res, ensure_ascii=False).encode("utf-8"),
                                 headers=headers,
                                 timeout=10
                                 )
        print("callback response {}".format(response.status_code))
    except Exception:
        logger.exception("callback to %s failed", callback_url)
        print("error callback")
def step(filename, filename_root, callback_url, flag, sid):
    """Run ``multi_parse`` for one document on a background thread.

    Returns immediately; the parse result is delivered via the callback URL.
    """
    executor = ThreadPoolExecutor(max_workers=10)
    try:
        executor.submit(multi_parse, (filename, filename_root, callback_url, flag, sid))
    finally:
        # wait=False keeps the call non-blocking and still lets the submitted
        # job finish, while releasing the pool afterwards. The original never
        # shut the pool down.
        executor.shutdown(wait=False)
def read(q):
    """Consume parse jobs from *q* forever, one at a time.

    Each job is a 5-item sequence. It is logged and then handed unchanged to
    ``multi_parse``. This function never returns.
    """
    log_template = ('++++Get filename={}, filename_root={}, callback_url={}, flag ={},sid ={} from queue\n\n')
    while True:
        print("+++++++++", q.qsize())
        job = q.get(True)  # blocks until a job is available
        filename, filename_root, callback_url, flag, sid = job
        logger.info("\n\n*********parse is action***********\n")
        logger.info(log_template.format(filename, filename_root, callback_url, flag, sid))
        # Jobs are parsed sequentially in this process.
        multi_parse(job)
- # 文件上传,overwrite=True为覆盖原有的文件,
- # 如果不加这参数,当服务器已存在同名文件时,将返回“IOError: File exists.”错误
@route('/upload', method='POST')
def do_upload():
    """Receive a Word file and parse it, queued or inline.

    Form fields read: ``mydata`` (the file), optional ``callback_url``,
    ``flag`` and ``sid``.

    * With ``callback_url``: the job is put on ``q_parse_file`` and a JSON
      acknowledgement (``errcode`` 0 and the number of documents ahead) is
      returned at once.
    * Without it: the file is parsed inline and the result, or an
      ``errcode`` 1 payload on failure, is returned as JSON.

    A missing file or an extension other than .doc/.docx (any letter case)
    yields the plain-text message "only accept .doc .docx files".
    """
    form = request.POST.dict
    logger.info("==request.POST.dict==>{}".format(form))
    callback_url = form["callback_url"][0] if "callback_url" in form else ""

    time1 = time.time()
    uploaded = request.files.get('mydata')
    flag = form.get('flag', [0])[0]
    sid = form.get('sid', [0])[0]

    # A request without the file part used to crash with AttributeError.
    if uploaded is None:
        return "only accept .doc .docx files"
    extension = os.path.splitext(uploaded.raw_filename)[1]
    # Compare case-insensitively so "REPORT.DOCX" is accepted as well.
    if extension.lower() not in (".doc", ".docx"):
        return "only accept .doc .docx files"

    num_str = str(get_dir_next_num(save_path))
    cur_save_path = os.path.join(save_path, num_str)
    os.mkdir(cur_save_path)
    # Store the file under its upload number instead of the client's name.
    uploaded.raw_filename = num_str + extension
    # overwrite=True: without it an existing file of the same name raises
    # "IOError: File exists.".
    uploaded.save(cur_save_path, overwrite=True)

    filename = os.path.abspath(os.path.join(cur_save_path, uploaded.raw_filename))
    filename_root = os.path.abspath(cur_save_path)
    logger.info("---filename_root==>:{}".format(str(filename_root)))
    logger.info("---filename==>:{}".format(str(filename)))
    time11 = time.time() - time1
    logger.info("---save_filename_time==>:{}".format(str(time11)))

    if callback_url:
        # Parsed asynchronously by the reader process.
        q_parse_file.put([filename_root, filename, callback_url, flag, sid])
        # Read the size once so the log line and the response agree.
        docs_before = q_parse_file.qsize() - 1
        print("-----当前还有{}-----".format(docs_before))
        return json.dumps({
            "errcode": 0,
            "errmsg": "OK",
            "docsbefore": docs_before,
        }, ensure_ascii=False).encode("utf-8")

    try:
        res = parse_word(filename, None)
        print("------------parse is successful---------------")
    except BaseException:
        # Kept as broad as the original bare except so that the parse_word
        # timeout is still converted into an error payload.
        print("+++++++++++++++time out of parse_word++++++++++++++++++")
        res = {"time_out": 3,
               "errcode": 1,
               "errmsg": "word读取失败。请尝试:\n 1.请将word中图片改为嵌入式,然后上传。\n 2. 请将word题目拆分为多个word上传。\n 3.请将word中内容复制到新word上传。"}
    return json.dumps(res, ensure_ascii=False, indent=4).encode("utf-8")
- from PIL import Image
def resize_img(img1, img2):
    """Shrink the image at *img1* to a width of 1000 px and save it as *img2*.

    Images whose width is 1000 px or less are left alone and nothing is
    written. The aspect ratio is preserved. *img1* and *img2* may be the
    same path.
    """
    with Image.open(img1) as im:
        width, height = im.size
        if width <= 1000:
            return
        ratio = 1000 / width
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
        # under its current name.
        thumb = im.resize((int(width * ratio), int(height * ratio)), Image.LANCZOS)
    # Save after the source handle is closed so overwriting img1 is safe.
    thumb.save(img2)
@route('/static/<filepath:path>', method='GET')
def server_static(filepath):
    """Serve a file from the upload directory, downsizing images on the fly.

    *filepath* looks like ``5/files/image6.png``. If the joined path contains
    "image", the file is resized in place by ``resize_img`` before it is
    served. The response comes from ``static_file``.
    """
    file = os.path.join(save_path, filepath)
    root = os.path.abspath(save_path)
    target = os.path.abspath(file)

    # Only touch real files inside the upload directory. The original resized
    # first, so a missing file caused a 500 and a "../" path could be opened
    # and rewritten outside save_path.
    inside_root = target.startswith(root + os.sep)
    if "image" in file and inside_root and os.path.isfile(target):
        try:
            resize_img(target, target)
        except Exception:
            # A file that cannot be resized is still served as-is.
            logger.exception("could not resize %s", target)

    return static_file(filepath, root=save_path)
# Finds a "static/<number>/files/image<number>.<ext>" fragment anywhere in a
# request string, even when the URL is mangled or percent-encoded around it.
static_image_pat = re.compile(
    r'(static/\d+/files/image\d+\.(webp|bmp|pcx|tiff|gif|jpeg|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|png|hdri|raw|wmf|flic|emf|ico))')


@error(404)
def error404(error):
    """Build the body for 404 responses.

    favicon requests, and requests embedding a static image path whose
    "upload" counterpart exists on disk, get a short fixed message without
    logging. Anything else is printed to stdout and echoed back together
    with the client address, with angle brackets stripped from the echo.
    """
    # Example: '<LocalRequest: GET http://10.19.150.196:18084/favicon.ico>'
    request_str = str(request)
    if 'favicon.ico' in request_str:
        return 'Nothing here, sorry'

    # Example of a mangled request:
    # '<LocalRequest: GET http://123.59.151.182:18082/%22http%3A/123.59.151.182%3A18083/static/83/files/image2.png/%22>'
    found = static_image_pat.search(request_str)
    if found:
        # static/83/files/image2.png -> upload/83/files/image2.png
        candidate = found.group(1).replace("static", "upload")
        if os.path.isfile(candidate):
            return "Nothing here, sorry"

    print("error404: remote_addr={}, request={}".format(str(request.remote_addr), request_str))
    visible_request = request_str.replace("<", "").replace(">", "")
    body = 'your ip: {}<br>your request: ^{}$<br>Nothing here, sorry<br>'
    return body.format(str(request.remote_addr), visible_request)
if __name__ == "__main__":
    # Queue of documents waiting to be parsed. do_upload() refers to this
    # module-level name, so it has to be created here.
    q_parse_file = Queue()

    # Start from a clean converter before accepting any work.
    start_word2html_app(kill_mathtype=True)

    # A single reader process pulls jobs off the queue and parses them.
    pr = Process(target=read, args=(q_parse_file,))
    pr.start()
    prs = [pr]
    print("parse pid:", pr.pid)

    # The web server runs in this (main) process.
    print("main pid:", os.getpid())
    print("main port:", config.server_port)
    run(host=config.internal_ip, port=config.server_port, server="tornado")
|