math_server.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524
  1. import datetime
  2. import json
  3. import os
  4. import random
  5. import time
  6. import traceback
  7. from glob import glob
  8. from pprint import pprint
  9. from time import sleep
  10. from multiprocessing import Process, Queue
  11. from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
  12. import requests
  13. from bottle import redirect
  14. from bottle import request
  15. from bottle import route, run
  16. from bottle import static_file
  17. from bottle import error
  18. from func_timeout import func_set_timeout
  19. import config
  20. from parse import Mathtest
  21. from utils import get_dir_next_num
  22. import re
  23. import logging
  24. from ufile import config as ufile_config
  25. from ufile import logger as img_log
  26. from ufile import filemanager
  27. import hashlib
  28. logger = logging.getLogger(__name__)
  29. logger.setLevel(level=logging.INFO)
  30. log_dir = '../logs'
  31. try:
  32. os.mkdir(log_dir)
  33. except:
  34. pass
  35. log_file = os.path.join(log_dir, 'parse_log.txt')
  36. # log_file = os.path.join(log_dir, 'parse_time.txt')
  37. handler = logging.FileHandler(log_file, mode='a', encoding='utf-8', delay=True)
  38. handler.setLevel(logging.INFO)
  39. formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  40. handler.setFormatter(formatter)
  41. logger.addHandler(handler)
  42. logger.info("Start print log")
  43. # logger.debug("Do something")
  44. # logger.warning("Something maybe fail.")
  45. # logger.info("Finish")
  46. '''
  47. errcode=0成功
  48. errcode=1失败
  49. errmsg是具体的失败消息
  50. '''
  51. # 定义上传路径
  52. save_path = '../upload'
  53. if not os.path.isdir(save_path):
  54. os.mkdir(save_path)
  55. # todo 本地替換圖片url,flag=0
  56. src_pat = re.compile(r'src\s*=\s*"files')
  57. public_key = 'ucloudyunkaopei@outlook.com13615403931104805307'
  58. private_key = 'bcfd5bb66ca527c9be9fd7f3e784fbfc90c4bba5'
  59. image_upload_log = '../logs/image_log.txt' # 图片上传日志
  60. img_log.set_log_file(image_upload_log)
  61. addr = '.cn-bj.ufileos.com' # 后缀拼接
  62. ufile_config.set_default(uploadsuffix=addr)
  63. # todo 綫上正式環境為'zxhx'
  64. # public_bucket = 'zxhx' # 公共空间名称
  65. # todo 綫上測試環境為'zxhx-test'
  66. public_bucket = 'zxhx-test'
  67. # private_bucket = '' # 私有空间名称
  68. @route('/')
  69. def index():
  70. return redirect("/upload")
  71. #
  72. # @route('/hello')
  73. # def hello():
  74. # return "hello"
  75. # 文件上传的HTML模板,这里没有额外去写html模板了,直接写在这里,方便点吧
  76. @route('/upload')
  77. def upload():
  78. return '''
  79. <html>
  80. <head>
  81. </head>
  82. <body>
  83. <form action="/upload" method="post" enctype="multipart/form-data">
  84. <input type="file" name="mydata" />
  85. <input type="submit" value="Upload" />
  86. </form>
  87. </body>
  88. </html>
  89. '''
  90. def call_c_shape(doc_file):
  91. r = requests.get(r"http://localhost:9001/word/?name={}".format(doc_file), timeout=config.call_c_shape_time_out)
  92. return r
  93. def start_word2html_app(kill_mathtype=True):
  94. if kill_mathtype:
  95. os.system("taskkill /f /im MathType.exe")
  96. os.system("taskkill /f /im WINWORD.EXE")
  97. os.system("taskkill /f /im ConsoleApplication1.exe")
  98. os.system("start {}".format(config.word2html_exe)) # start 在新窗口中打开
  99. def check_pid():
  100. command = 'tasklist /fi "imagename eq WINWORD.EXE"'
  101. r = os.popen(command)
  102. info = r.read() # 读取命令行的输出到一个list
  103. # print(info)
  104. if str(info).strip() != "信息: 没有运行的任务匹配指定标准。":
  105. print("++++再补一刀++++")
  106. start_word2html_app()
  107. else:
  108. print("-----office is killed------")
  109. @func_set_timeout(90)
  110. def parse_word(doc_file, filename_root, flag=0, sid=0):
  111. time4 = time.time()
  112. try:
  113. r = call_c_shape(doc_file)
  114. # executor1 = ProcessPoolExecutor(3)
  115. # word2html = executor1.submit(call_c_shape, doc_file)
  116. # word2html = word2html.result()
  117. # r = word2html.text
  118. # executor1.shutdown(wait=True)
  119. except:
  120. r = None
  121. start_word2html_app()
  122. print("+++++++++++++++time out of read word++++++++++++++++++")
  123. time44 = time.time() - time4
  124. logger.info("---word2html_time---==>" + str(time44))
  125. if r is None or str(r.text) == "4":
  126. # 再次确认office有没有杀死,没有则再补一刀
  127. check_pid()
  128. return {"errcode": 1,
  129. "errmsg": "word读取失败。请尝试:\n 1.请将word中图片改为嵌入式,然后上传。\n 2. 请将word题目拆分为多个word上传。\n 3.请将word中内容复制到新word上传。"}
  130. print("+++++++word to html, r={}, r.text={}++++++++".format(r, r.text))
  131. clean_html_file = os.path.splitext(doc_file)[0] + "_clean.html"
  132. if str(r.text) == "4" or not os.path.isfile(clean_html_file):
  133. # 再次确认office有没有杀死,没有则再补一刀
  134. check_pid()
  135. return {"errcode": 1,
  136. "errmsg": "word读取失败。请尝试:\n 1.请将word中图片改为嵌入式,然后上传。\n 2. 请将word题目拆分为多个word上传。\n 3.请将word中内容复制到新word上传。"}
  137. with open(clean_html_file, "r", encoding="utf-8") as f:
  138. html = f.read()
  139. num = int(os.path.split(os.path.split(doc_file)[0])[1])
  140. ip = config.external_ip
  141. # todo if flag==1 --> upload then upload image and replaced online address; else static
  142. # http://zxhx-test + .cn-bj.ufileos.com + / + teacher/uploadfiles/wording + / 52 / 2020/04/21 + / + 5e9ea1ec2e28f.png"""
  143. # flag存在時,sid必存在,上傳圖片到綫上
  144. if str(flag) == "1":
  145. print("+++++++++开始上传图片到Ucloud,并替换成线上地址++++++++++")
  146. data = upload_replace_image(filename_root, sid, html)
  147. if isinstance(data, dict):
  148. return data
  149. html, put_key_list, localfile_list = data
  150. # 直接解析没有图片上传
  151. if not localfile_list:
  152. pass
  153. else:
  154. # TODO 一个进程解析,一个进程上传
  155. time3 = time.time()
  156. executor1 = ProcessPoolExecutor(5)
  157. executor1.map(upload_img_to_ucloud, zip(put_key_list, localfile_list))
  158. executor1.shutdown(wait=True)
  159. time33 = time.time() - time3
  160. logger.info("---img_upload_time==>:{}".format(str(time33)))
  161. else:
  162. html = src_pat.sub(r'src="http://{}:{}/{}/{}/files'.format(ip, config.file_port, "static", num), html)
  163. m = Mathtest(html)
  164. return m.tojson()
  165. def get_md5(image):
  166. """
  167. 由于hash不处理unicode编码的字符串(python3默认字符串是unicode)
  168. 所以这里判断是否字符串,如果是则进行转码
  169. 初始化md5、将image_name进行加密、然后返回加密字串
  170. """
  171. image_name, image_type = str(image).split(".")
  172. image_name = str(image_name) + str(time.time()) + str(random.random())
  173. if isinstance(image_name, str):
  174. image_name = image_name.encode("utf-8")
  175. md = hashlib.md5()
  176. md.update(image_name)
  177. # a = time.time()
  178. # b = random.random()
  179. return str(md.hexdigest()) + "." + str(image_type)
  180. # todo 上传图片进程
  181. def upload_img_to_ucloud(param_ucloud):
  182. put_key, localfile = param_ucloud
  183. putufile_handler = filemanager.FileManager(public_key, private_key)
  184. # 普通上传文件至公共空间
  185. ret, resp = putufile_handler.putfile(public_bucket, put_key, localfile, header=None)
  186. assert resp.status_code == 200
  187. def upload_replace_image(filename_root, sid, html):
  188. return_error = {"errcode": 1,
  189. "errmsg": "word图片上传失败。"}
  190. daytime = datetime.datetime.now().strftime('/%Y/%m/%d/')
  191. image_path = filename_root + "/files"
  192. # todo 判断试卷是否含有图片,如果有就替换上传,没有就不处理
  193. judge_file = os.path.isdir(image_path)
  194. if judge_file:
  195. image_number = re.findall(r'<img\s*src\s*=\s*"files/image', str(html))
  196. local_images_path_list = os.listdir(image_path) # 本地图片文件名
  197. local_images_path_list = list(filter(
  198. lambda x: str(x).endswith(".png") or str(x).endswith(".gif") or str(x).endswith(".jpeg") or str(x).endswith(
  199. ".jpg"), local_images_path_list))
  200. if len(image_number) != len(local_images_path_list):
  201. return return_error
  202. else:
  203. # 從大到小把圖片进行排序
  204. try:
  205. local_images_path_list.sort(key=lambda x: int(re.search(r"image(\d+)\.[pngifje]+", str(x)).group(1)))
  206. except:
  207. return return_error
  208. logger.info("local_images_path_list==>{}".format(str(local_images_path_list)))
  209. put_key_list = []
  210. localfile_list = []
  211. if local_images_path_list:
  212. try:
  213. for i, img in enumerate(local_images_path_list, start=1): # 所有的图片【image1.png,image2.png,。。。】
  214. src_pat2 = re.compile(r'<img\s*src\s*=\s*"files/image{}\.png"'.format(i))
  215. src_pat3 = re.compile(r'<img\s*src\s*=\s*"files/image{}\.gif"'.format(i))
  216. src_pat4 = re.compile(r'<img\s*src\s*=\s*"files/image{}\.jpeg"'.format(i))
  217. src_pat5 = re.compile(r'<img\s*src\s*=\s*"files/image{}\.jpg"'.format(i))
  218. if str(img).endswith("png"):
  219. pat = src_pat2
  220. # re_list.append(src_pat2)
  221. elif str(img).endswith("gif"):
  222. pat = src_pat3
  223. # re_list.append(src_pat3)
  224. elif str(img).endswith("jpeg"):
  225. pat = src_pat4
  226. # re_list.append(src_pat4)
  227. elif str(img).endswith("jpg"):
  228. pat = src_pat5
  229. # re_list.append(src_pat5)
  230. # todo 上传线上,并替换线上图片
  231. localfile = image_path + "/{}".format(img)
  232. localfile_list.append(localfile)
  233. # todo 图片压缩
  234. # resize_img(localfile, localfile)
  235. hash_img = get_md5(img)
  236. # 上传文件在空间中的名称
  237. put_key = "teacher/uploadfiles/wording/" + str(sid) + str(daytime) + str(hash_img)
  238. put_key_list.append(put_key)
  239. # html替换为线上的地址
  240. online_image_url = "http://" + str(public_bucket) + str(addr) + "/" + str(put_key)
  241. html = pat.sub(r'<img src={}'.format(online_image_url), str(html))
  242. return html, put_key_list, localfile_list
  243. except:
  244. return return_error
  245. else:
  246. return return_error
  247. else:
  248. return html, [], []
  249. # todo 改为多线程去执行
  250. def multi_parse(parma):
  251. filename, filename_root, callback_url, flag, sid = parma
  252. # print("+=+++=", filename, callback_url, filename_root, flag, sid)
  253. try:
  254. time2 = time.time()
  255. res = parse_word(filename_root, filename, flag, sid)
  256. time22 = time.time() - time2
  257. logger.info("---parse_word_time==>:{}".format(str(time22)))
  258. # print("解析结果为")
  259. # pprint(res)
  260. print("------------parse is successful---------------")
  261. except:
  262. print(traceback.print_exc())
  263. print("+++++++++++++++have callback_url,but time out of parse_word++++++++++++++++++")
  264. res = {"time_out": 90,
  265. "id": 0,
  266. "errcode": 1,
  267. "errmsg": "word读取失败。请尝试:\n 1.请将word中图片改为嵌入式,然后上传。\n 2. 请将word题目拆分为多个word上传。\n 3.请将word中内容复制到新word上传。"}
  268. # print("解析结果为:\n")
  269. # pprint(res)
  270. print("------------parse is fail---------------")
  271. headers = {'Content-Type': 'application/json', }
  272. try:
  273. response = requests.post(callback_url,
  274. # json=res, # 可以,但是会进行转义
  275. data=json.dumps(res, ensure_ascii=False).encode("utf-8"),
  276. headers=headers,
  277. timeout=10
  278. )
  279. print("callback response {}".format(response.status_code))
  280. except:
  281. print("error callback")
  282. def step(filename, filename_root, callback_url, flag, sid):
  283. executor = ThreadPoolExecutor(max_workers=10)
  284. executor.submit(multi_parse, (filename, filename_root, callback_url, flag, sid))
  285. def read(q):
  286. while True:
  287. print("+++++++++", q.qsize())
  288. content = q.get(True) # todo huancun
  289. filename, filename_root, callback_url, flag, sid = content # todo huancun
  290. logger.info("\n\n*********parse is action***********\n")
  291. logger.info('++++Get filename={}, filename_root={}, callback_url={}, flag ={},sid ={} from queue\n\n'
  292. .format(filename, filename_root, callback_url, flag, sid))
  293. # 单线程解析
  294. multi_parse(content)
  295. # 文件上传,overwrite=True为覆盖原有的文件,
  296. # 如果不加这参数,当服务器已存在同名文件时,将返回“IOError: File exists.”错误
  297. @route('/upload', method='POST')
  298. def do_upload():
  299. logger.info("==request.POST.dict==>{}".format(request.POST.dict))
  300. if "callback_url" in request.POST.dict:
  301. callback_url = request.POST.dict["callback_url"][0]
  302. else:
  303. callback_url = ""
  304. time1 = time.time()
  305. upload = request.files.get('mydata')
  306. # todo add flag and sid
  307. flag = request.POST.dict.get('flag', [0])[0]
  308. sid = request.POST.dict.get('sid', [0])[0]
  309. # logger.info("------flag==>: {}".format(str(flag)))
  310. # logger.info("------sid==>: {}".format(str(sid)))
  311. # logger.info("------mydata==>: {}".format(str(upload)))
  312. # logger.info("------callback_url==>: {}".format(str(callback_url)))
  313. if os.path.splitext(upload.raw_filename)[1] not in [".doc", ".docx"]:
  314. return "only accept .doc .docx files"
  315. num_str = str(get_dir_next_num(save_path))
  316. cur_save_path = os.path.join(save_path, num_str)
  317. os.mkdir(cur_save_path)
  318. upload.raw_filename = num_str + os.path.splitext(upload.raw_filename)[1]
  319. upload.save(cur_save_path, overwrite=True) # 把文件保存到save_path路径下
  320. filename = os.path.join(cur_save_path, upload.raw_filename)
  321. filename = os.path.abspath(filename)
  322. filename_root = os.path.abspath(cur_save_path)
  323. logger.info("---filename_root==>:{}".format(str(filename_root)))
  324. logger.info("---filename==>:{}".format(str(filename)))
  325. time11 = time.time() - time1
  326. logger.info("---save_filename_time==>:{}".format(str(time11)))
  327. # 开一个进程对filename 进行解析
  328. if callback_url:
  329. q_parse_file.put([filename_root, filename, callback_url, flag, sid])
  330. print("-----当前还有{}-----".format(q_parse_file.qsize() - 1))
  331. return json.dumps({
  332. "errcode": 0,
  333. "errmsg": "OK",
  334. "docsbefore": q_parse_file.qsize() - 1,
  335. }, ensure_ascii=False).encode("utf-8")
  336. else:
  337. try:
  338. res = parse_word(filename, None)
  339. # print("\n解析结果===>\n{}\n\n".format(res))
  340. print("------------parse is successful---------------")
  341. except:
  342. print("+++++++++++++++time out of parse_word++++++++++++++++++")
  343. res = {"time_out": 3,
  344. "errcode": 1,
  345. "errmsg": "word读取失败。请尝试:\n 1.请将word中图片改为嵌入式,然后上传。\n 2. 请将word题目拆分为多个word上传。\n 3.请将word中内容复制到新word上传。"}
  346. return json.dumps(res, ensure_ascii=False, indent=4).encode("utf-8")
  347. from PIL import Image
  348. def resize_img(img1, img2):
  349. im = Image.open(img1)
  350. if im.size[0] > 1000:
  351. ratio = 1000 / im.size[0]
  352. thumb = im.resize((int(im.size[0] * ratio), int(im.size[1] * ratio)), Image.ANTIALIAS)
  353. thumb.save(img2)
  354. @route('/static/<filepath:path>', method='GET')
  355. def server_static(filepath):
  356. # 5/files/image6.png
  357. # print("static_file: ^{}$".format(filepath))
  358. # logger.info("static_file: ^{}$".format(filepath))
  359. file = os.path.join(save_path, filepath)
  360. if "image" in file:
  361. resize_img(file, file)
  362. return static_file(filepath, root=save_path)
  363. static_image_pat = re.compile(
  364. r'(static/\d+/files/image\d+\.(webp|bmp|pcx|tiff|gif|jpeg|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|png|hdri|raw|wmf|flic|emf|ico))')
  365. @error(404)
  366. def error404(error):
  367. request_str = str(request)
  368. # '<LocalRequest: GET http://10.19.150.196:18084/favicon.ico>'
  369. if 'favicon.ico' in request_str:
  370. return 'Nothing here, sorry'
  371. # request_str = '<LocalRequest: GET http://123.59.151.182:18082/%22http%3A/123.59.151.182%3A18083/static/83/files/image2.png/%22>'
  372. m = static_image_pat.search(request_str)
  373. if m:
  374. filepath = m.group(1) # static/83/files/image2.png
  375. filepath = filepath.replace("static", "upload")
  376. if os.path.isfile(filepath):
  377. return "Nothing here, sorry"
  378. print("error404: remote_addr={}, request={}".format(str(request.remote_addr), request_str))
  379. # logger.error("error404: remote_addr={}, request={}".format(str(request.remote_addr), request_str))
  380. return 'your ip: {}<br>' \
  381. 'your request: ^{}$<br>' \
  382. 'Nothing here, sorry<br>'.format(str(request.remote_addr), request_str.replace("<", "").replace(">", ""))
  383. if __name__ == "__main__":
  384. # 解析文件的队列
  385. prs = []
  386. q_parse_file = Queue()
  387. start_word2html_app(kill_mathtype=True)
  388. # 读取解析进程
  389. # for i in range(5): # will call c shape word2html interface
  390. pr = Process(target=read, args=(q_parse_file,))
  391. pr.start()
  392. prs.append(pr)
  393. print("parse pid:", pr.pid)
  394. # sleep(2)
  395. # # 图片供应进程
  396. # pr = Process(target=run, args=(),
  397. # kwargs={"host": config.internal_ip, "port": config.file_port, "server": "tornado"})
  398. # pr.start()
  399. # prs.append(pr)
  400. # print("image pid:", pr.pid)
  401. # sleep(2)
  402. # 响应请求
  403. print("main pid:", os.getpid())
  404. print("main port:", config.server_port)
  405. # delete files
  406. # print("\n--------delete file process is start--------\n")
  407. # dojob()
  408. run(host=config.internal_ip, port=config.server_port, server="tornado")