ceshi.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
#!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. from flask import Flask, render_template, request
  4. from datetime import timedelta
  5. import requests
  6. from utils.img2latex import get_ocrlatex_by_url
  7. headers = {'Pragma': 'no-cache', 'Cache-Control': 'no-cache',
  8. 'X-USER-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIyMCIsInVzZXJOYW1lIjoiSFgwMDEyMTM4IiwiZXhwIjoxNTY2Mzg3ODA1LCJ1c2VySWQiOiIyMCIsImlhdCI6MTU2NjM3MzQwNX0.9w__2RGncpdMvbiQAmk75ThgPwDaTXF1VyY1xKVs_zFduUEoJ3_6X2q3ZVYsFIpChKeLEWelhwlmMxEP4lU9QA',
  9. 'Origin': 'http://xx.cn',
  10. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
  11. 'Accept': '*/*',
  12. # 'Content-Type': 'multipart/form-data; boundary=----WebKitFormBoundarycDnnz7eKalCPv6GJ',
  13. 'Referer': '', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'}
  14. url = "http://192.168.1.140:18085/upload"
  15. import json
  16. data = {'callback_url': "http://zsytk.dev.xueping.com/wordapi/index/wid/9668",
  17. "mydata": json.dumps([{"file_type": "", "file_url": "http://zsytk.dev.xueping.com/data/word/2020/08/5f327c08f3576.doc"}],ensure_ascii=False).encode("utf-8")
  18. }
# print(requests.Request('POST', url, data=data, json=json).prepare().body.decode(
# 'ascii')) # print the field names and types
  21. # r = requests.post(url, data=data, headers=headers)
  22. # print(r.text)
  23. #
  24. # { %
  25. # for sub in item[2:5] %}
  26. # { % if "src" in sub %}
  27. # < img
  28. # src = "data:;base64,{{ img_stream }}" >
  29. # { % endif %}
  30. # { % endfor %}
  31. import re
  32. pattern_2 = re.compile(
  33. r"(([CDE]\s*[..、、]|\([CDE]\)).+?)(?<![::])\s\(([1-9]|1[0-9])\)(.+?([是为有]|等于)[((]\s*[))]\n)")
  34. w1 = "C.jfg (3)mvm等于()\n"
  35. # p1=re.search(r"每(小?题|空)[\u4e00-\u9fa5]{,5}(\d[\d.]{,2}\d{,2})分", "二、填空题(每空2分,共8分)")
  36. # print(p1)
  37. # p2 = re.findall("([1-9]|[1-4][0-9])[))]?\s*(?=题)", "33题")
  38. # print(p2)
  39. #
  40. # print(list(range(1,2)))
  41. # print("{0}{1}".format(3,5))
  42. # def fun1(nn, **kwargs):
  43. # print(kwargs)
  44. # print(kwargs['index'])
  45. # fun1(1, all_type=1, num=2, index=3)
  46. # print(2==2>3)
  47. # p1 = re.search("(?<![::..、、])\s+([1-9]|[1-4][0-9])\s*[..、、](?!png)", 'yy 49.png')
  48. # print(p1)
  49. # items_str = '\n1.yy22.pn'
  50. #
  51. # for no in re.findall(r'\n+\s*([1-9]|[1-4][0-9])\s*[..、、]|(?<![::..、、])\s*([1-9]|[1-4][0-9])\s*[..、、](?!png)', items_str):
  52. # print(no)
  53. #
# con_id_line = [] # line index of each question number (which line it is on)
# topicno = [] # sequence of question numbers
# topicno_line_dict = dict(zip([], [])) # dict mapping question number -> line index
  57. # print(len(topicno_line_dict))
  58. # print("$\\frac")
  59. #
  60. # w_info = re.search(' width="(\d+[.\d]*?)\s*([pxt]?)"', ' width="16"')
  61. # print(w_info.group(1),w_info.group(2))
  62. #
  63. # ee = [1,2,3]
  64. # ee=sorted(ee,reverse=True)
  65. # print(ee)
  66. # from PIL import Image
  67. # local_p = r"F:\zwj\Text_Structure\img_folder\5fc64a514994183dda7e74ea\new_image1624343877418770.png"
  68. # w = Image.open(local_p)
  69. # w.close()
  70. from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
  71. kk = open(r"F:\zwj\parse_online58\res_folder\76736f316fe30323c8ddc51cf2b3f392__2021_08_30_10_15_58__828__1870.json",
  72. 'r', encoding='utf-8')
  73. con=json.loads(kk.read())
  74. item_str = str(con['items'][4])
  75. item_str = re.sub('(<img src=".*?".*? data-latex=)"\$.+?\$"',r'\1""', item_str)
  76. print(eval(item_str))
  77. print('------------------------------------')
  78. all_imgs_no_latex = re.findall(r'(<img src="(.*?)".*? data-latex="\$?\s*\$?")', item_str)
  79. if all_imgs_no_latex:
  80. imgurl_no_latex = [k[1] for k in all_imgs_no_latex]
  81. imgurl_no_latex = list(set(imgurl_no_latex))
  82. print(imgurl_no_latex)
  83. print('--------------------------------')
  84. # mathpixs = []
  85. # if len(imgurl_no_latex) <= 20:
  86. # mathpixs.extend([get_ocrlatex_by_url(i) for i in imgurl_no_latex])
  87. # else:
  88. # if len(imgurl_no_latex) <= 50:
  89. # executor = ThreadPoolExecutor(max_workers=2) # 开2个线程会稍微快点
  90. # else:
  91. # executor = ThreadPoolExecutor(max_workers=4)
  92. # for data in executor.map(get_ocrlatex_by_url, imgurl_no_latex):
  93. # mathpixs.append(data)
  94. mathpixs = [' 1: 1 ', ' b ', ')', ' 1: 1 ', '', ' a^{2}: b^{2} ', '(', 'R _ { 1 } = \\frac { p h } { a ^ { 2 } }', 'R _ { 2 } = \\frac { p b } { b h } = \\frac { p } { h }', ' a: b ', ' h ', ' R_{1}: R_{2}=b^{2}: a^{2} ', ' R_{2}=\\frac{\\rho h}{b^{2}} ', ' a: b ', ' a ']
  95. def sub1(ss):
  96. new_ss = ss.group(1) + ' data-type="math-ocr" ocr-latex="$'+mathpixs[mi]+'$"'+ss.group(2)
  97. return new_ss.replace(' data-type="math"', "")
  98. # r'\1 ocr-latex="${}$"'.format(mathpixs[mi])
  99. for mi, m in enumerate(imgurl_no_latex):
  100. # item_str = re.sub(r'(<img src="' + m + r'".*?) data-latex="\$?\s*\$?"',
  101. # sub1, item_str)
  102. if mathpixs[mi]:
  103. item_str = re.sub(r'(<img src="{}".*?) data-latex="\$?\s*\$?"(.*?(?<=")\s*/?>)'.format(m),
  104. sub1, item_str)
  105. items_list = eval(item_str)
  106. print(items_list)