washutil.py 37 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# This file contains the following functions:
# table_label_cleal: remove the line breaks inside tables
# html_cleal: clean an HTML file
# huanhang_wash_after: remove surplus line breaks from the final result
  7. import datetime
  8. import random
  9. import re
  10. from operator import itemgetter
  11. from itertools import groupby
  12. from PIL import Image
  13. # import ps_configs as config
  14. from pprint import pprint
  15. import base64, os, random
  16. import time, hashlib
  17. # UPLOAD_FOLDER = config.UPLOAD_FOLDER
  18. import configs
  19. from utils.field_eq2latex import get_latex
  20. def table_label_cleal(con):
  21. """
  22. 去掉表格中的【换行符】
  23. """
  24. # print(con)
  25. # print('------------------------------------------')
  26. con = re.sub(r"\n(\s|\n|\t)+", "\n", con)
  27. count = 1
  28. while re.search(r"</?[a-z]+>\n(</?[a-z]+>|<td\s+\n*[a-z=\"\d]+>)", con, re.S) and count <= 10:
  29. con = re.sub("(</?t[dr]>|</?table>|</?tbody>|</?div>)\n(</?t[dr]>|</div>|</?table>|</?tbody>|<p>)",
  30. r"\1\2", con, flags=re.S)
  31. con = re.sub(r'(</?t[rd]>)\n(<td\s.+?>)', r'\1\2', con, flags=re.S)
  32. count += 1
  33. # if re.search(r"<table>(.|\n)+?</table>", con, re.S|re.M):
  34. # aa = re.search(r"(<table>(.|\n)+?</table>)", con, re.S|re.M)
  35. # con = con.replace(aa.group(1),aa.group(1).replace("\n",""))
  36. # 将空表格的情况去掉
  37. con = re.sub(r'<table>[\s\n\t]*?<tbody>[\s\n\t]*?(<tr>[\s\n\t]*?<td>[\s\n\t]*?<p>[\s\n\t]*?</p>'
  38. r'[\s\n\t]*?</td>[\s\n\t]*?</tr>[\s\n\t]*?)+</tbody>[\s\n\t]*?</table>[\s\n\t]*?<p>', "", con, flags=re.S)
  39. con = re.sub(r'(</table><p>)\s*([((]\s*\d\s*[))])', r'\1\n\2', con)
  40. return con
  41. # 标签清洗
def html_cleal(html, img_url, is_reparse):
    """
    Clean the HTML of a parsed exam paper and split it into paragraph strings.

    Steps, in order: when ``is_reparse`` is truthy, base64 ``<img>`` payloads are
    written to ``configs.IMG_FOLDER/<YYYY_MM_DD>/`` and their tags re-pointed at
    ``configs.new_img_ip``; the tags in ``sub_list`` are removed and the entities /
    look-alike characters in ``sub_dd`` are mapped; ``get_latex`` converts field
    formulas; section-heading, answer/analysis keyword, option and question-number
    lines are normalised by a long sequence of regex substitutions; every
    ``<img ...>`` tag is replaced by a compact ``<imgsrc.../>`` placeholder; finally
    the text is split on ``</p>`` / ``</hN>`` into a list.

    :param html: HTML text (``.replace`` is called on it directly when ``is_reparse``)
    :param img_url: when a dict, maps the matched ``<img src="...image<id>.<ext>``
        prefix to its replacement; any other type leaves the tag unchanged
    :param is_reparse: truthy to extract base64 images to disk first
    :return: ``(con_list, subs2src)`` - the cleaned paragraph strings and a dict
        mapping each ``<imgsrc.../>`` placeholder to its (possibly remapped) tag
    """
    # NOTE(review): the list ends with an empty string, which adds an empty
    # alternative to the joined pattern (it matches the empty string everywhere
    # and is replaced by ""). Possibly a character lost in transit - confirm.
    sub_list = ["</?div>", "</?b>", "</?caption>", "</?center>", "</?cite>", "</?code>", "</?colgroup>",
                "</?menu>", "</?dd>", "</?dir>", "</?li>", "</?em>", "</?article>", "</?header>", "</?ruby>",
                "</?summary>", "</?details>", "</?strong>", "</?strike>", "</?small>", "</?select>",
                "</?section>", "</?script>", "</?[su]>", "</?var>", "</?ul>", "</?tt>", "</?title>", "</?thead>",
                "</?tfoot>", "<hr />", "<hr>",""]
    # Entity / look-alike character map. Keys are joined unescaped into one regex.
    # NOTE(review): several keys are repeated or map to themselves ("A": "A");
    # presumably the originals were full-width variants - verify against upstream.
    sub_dd = {'&times;': '×',
              '&divide;': '÷',
              '&deg;': '°',
              '&middot;': '·',
              '&plusmn;': '±',
              '&ordm;': 'º',
              '&sup1;': '¹',
              '&sup2;': '²',
              '&sup3;': '³',
              '&frac12;': '1/2',
              '&frac14;': '¼',
              '&frac34;': '¾',
              '&yen;': '¥',
              'm&sup3;': 'm³',
              '&lt;': '<',
              '&pound;': '£',
              '∠&lt;': '<',
              '&gt;': '>',
              "A": "A",
              "А": "A",
              "Α": "A",
              "B": "B",
              "В": "B",
              "в": "B",
              "Β": "B",
              "C": "C",
              "С": "C",
              "c": "c",
              "с": "c",
              "D": "D",
              "Ε": "E",
              "E": "E",
              "F": "F",
              "G": "G",
              "g": "g",
              "m": "m",
              "N": "N",
              "s": "s",
              "t": "t",
              "/": "/",
              "=": "=",
              "-": "-",
              "2": "2",
              '&nbsp;&nbsp;': ' ',
              '&nbsp;': ' ',
              "〖": '【',
              "〗": '】',
              "題": '题',
              "单项选择": '单选',
              "多项选择": '多选',
              "不定项选择": '选择',
              "双项选择": '选择',
              }
    # On re-parse, turn inline base64 images into files stored in a per-day folder.
    if is_reparse:
        # One folder per day.
        time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d')
        file_path = os.path.join(configs.IMG_FOLDER, time_str)
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        # Collect every base64-encoded <img> tag.
        all_base64_image = re.findall('<img src="data:image[^>]+?"\s*>', str(html))
        for n, img in enumerate(all_base64_image):
            img1 = img.split(",")
            img_tape_info = re.search("data:image/(.+?);base64", img1[0])
            img_tape = img_tape_info.group(1) if img_tape_info else ""
            img_data = base64.b64decode(str(img1[-1]))
            # Tags without a recognisable image type are left as they are.
            if img_tape:
                img_name = "image" + get_md5(n) + "." + img_tape
                save_path = os.path.join(file_path, img_name)
                img_path = configs.new_img_ip + '/' + time_str + '/' + img_name
                with open(save_path, 'wb') as f:
                    f.write(img_data)
                new_img = '<img src="' + img_path + '" />'
                html = html.replace(img, new_img)
    # -------------------------------------------------------------------------------------
    # Special-symbol handling
    html2txt = re.sub(r"|".join(sub_list), "", str(html))  # ("", " ") #2020/4/7
    html2txt = re.sub("|".join(sub_dd.keys()), lambda x: sub_dd[x.group()], html2txt)  # 2020/4/1,4/7,4/20
    # NOTE(review): str.replace is literal, so the first five patterns only match
    # the exact text `\\[{\\text{V}}V\]` etc.; they look like regexes - confirm.
    # NOTE(review): `.replace('', "γ")` has an EMPTY search string, which inserts
    # "γ" between every character. Presumably a private-use character was lost
    # from the literal - restore it from the upstream file before running.
    html2txt = html2txt.replace(r"\\[{\\text{V}}V\]", "Ⓥ").replace(r"\\[{\\text{A}}A\]", "Ⓐ") \
        .replace(r"\\[{\\text{W}}W\]", "Ⓦ").replace(r"\\[{\\text{X}}X\]", "Ⓧ").replace(r"\\[{\\text{G}}G\]", "Ⓖ") \
        .replace("\uf067", "γ").replace('', "γ").replace('\uf020', "").replace("\u3000", " ").replace("\u2003", " ") \
        .replace("\x7f", " ").replace("\xa0", "")
    # Convert field formulas
    html2txt = get_latex(html2txt).replace("【域公式】", "")
    # <sub>/<sup> render in the front end and need no LaTeX conversion.
    # <br/> handling
    html2txt = re.sub("<br\s*/?>", "\n", html2txt)
    # Unify question-type (section heading) lines
    # ---->>>>> heading lines may be placed inside tables
    # NOTE(review): the nesting of this block is reconstructed; the tag-stripping
    # re.sub is assumed to belong to the `if` because later steps still look for
    # <table> tags - confirm against the upstream file.
    if len(re.findall("<table>", html2txt)) >= 6:  # this threshold is not very rigorous yet
        for tt in re.finditer('<tr>(((?!(</?tr>)).)*)</tr>', html2txt, re.S):
            tt_list = re.split(r'</p></td>|<td><p>', tt.group(1))
            tt_list = [col for col in tt_list if col.strip()]
            # Drop the "score / marker" header row; flatten any other row to a paragraph.
            if " ".join(tt_list).replace(" ", "") == '得分评卷人':
                html2txt = html2txt.replace(tt.group(0), "")
            else:
                html2txt = html2txt.replace(tt.group(0), "<p>" + " ".join(tt_list) + "</p>")
        html2txt = re.sub(r"</?tbody>|</?table>|</?div>", "", html2txt)
    html2txt = re.sub(r"(</table>)\s*([一二三四五六七八九十]\s*[、..、]?.{2,6}题)", r"\1</p>\2", html2txt)
    html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]\s*(论述|填空|探究)题?[与和、、,,\s]*?(计算题|实验题)', r"\1、\3", html2txt)
    html2txt = re.sub(r"<p>\s*([一二三四五六七八九十])\s*[、..、,,]?\s*(计算|[解简]答|实验|作图)题?[与和、、,,\s]*?(计算|[解简]答|实验|作图)", r"<p>\1、\2题", html2txt)
    html2txt = re.sub(r'<td><p>(([一二三四五六七八九十])\s*[、..、,,]\s*(.{2,4}题)\s*</p>)</td>[^p]*?<p>', r"\1", str(html2txt), flags=re.S)
    html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*必考题\s*(.?|.+?分\s*[.。.]?)\s*</p>', "", html2txt)
    html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*.?\s*.{,4}(?<!\d)(\d+分)\s*[,,。].{,50}</p>', r"<p>【选做题】:'\1'</p>", html2txt)
    html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*(.?|.+?分\s*[.。.]?)\s*</p>', "<p>【选做题】</p>", html2txt)
    html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、..、,,]?\s*(单项?选择?|选择|多项?选择?|填空|计算|[解简]答|实验|作图)题?\s*</p>',
                      r"<p>\1、\2题</p>", html2txt)
    html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]?\s*(单选|单项选择|选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)([((]\s*本题|.*?\d分)',
                      r"\1" + "、" + r'\2' + "题" + r"\3", html2txt)
    html2txt = re.sub(r'([一二三四五六])\s*[、..、,,]?\s*(单选|单项选择|选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题',
                      r"\1" + "、" + r'\2' + "题", html2txt)
    html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]?\s*[((]\s*本大题(.*?选项中)', r"\1" + "、" + "选择题", html2txt)  # + r"\2"
    html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、..、,,]?\s*[((本大题]*?(.*?选项中)', r"\1" + "、" + "选择题", html2txt)
    html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]?\s*([((]\s*(每小题|本大?题)((?!(选项)).)+?[))]|综合题)',
                      r"\1" + "、" + "解答题", html2txt)
    html2txt = re.sub(r'(?<!<p>)\s*([一二三四五六七八九十]\s*[、..、,,]?\s*(单项?选择?|选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)',
                      r'</p>\n<p>\1', html2txt)
    html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、..、,,]?\s*[((]?本?大?题((?!(选项)).)+?[))]?\s*</p>', r"<p>\1、本大题</p>", html2txt)
    # Unify the answer / analysis keywords
    html2txt = re.sub(r'【\s*(<img src=.+)*?([解答])\s*(<img src=.+)*?([析案])\s*(<img src=.+)*?】', r"【\2\4】",
                      str(html2txt))  # 2020/4/21
    html2txt = re.sub(r'<p>\s*(解\s*[::])', r"<p>【解答】", str(html2txt))
    html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】】', r"【\1】", str(html2txt))
    html2txt = re.sub(r'(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[::]', r"【\1】", str(html2txt))
    html2txt = re.sub(r'(\n|^)\s*(分析)\s*[::]', r"【\2】", str(html2txt))
    # When only 【解答】 is used (no 【解析】 and no 【分析】), rename it to 【解析】.
    if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt:
        html2txt = re.sub(r'【解答】', "【解析】", str(html2txt))
    # Other keywords
    html2txt = re.sub(r'<p>\s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*?</p>', "", str(html2txt))
    html2txt = re.sub(r'<p>\s*(选修[\d-]*?[::].{2,15})\s*</p>', r"<p>【章节】\1</p>", html2txt)
    html2txt = re.sub(r'<p>\s*([一二三四五六]\s*[、..、]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([((]\d+分[))])?\s*</p>',
                      r"<p>【章节】\2</p>", html2txt)
    html2txt = re.sub(r'<p>\s*(基础|中档|综合)题[^p题]*?</p>|<p>\s*【(考点|专题)】[^p]*?</p>', "", str(html2txt))
    html2txt = re.sub(r'<p>\s*(基础训练|提升训练|探究培优)</p>', "", str(html2txt))
    html2txt = re.sub(r'<p>注意事项[::]\s*</p>(\n+\s*<p>\s*\d\s*[、..、][^/]+?</p>){1,}', "", html2txt, flags=re.S)
    html2txt = re.sub(r'<p>注意事项[::]\s*\d\s*[、..、][^/]+?</p>(\n+\s*<p>\s*\d\s*[、..、][^/]+?</p>){1,}', "", html2txt, flags=re.S)
    # NOTE(review): in the next two calls `.replace(...)` runs on the replacement
    # TEMPLATE (the literal r"(\1)" / r'\1'), not on the matched text, so it has no
    # effect; spaces inside the brackets are kept and "(IV)" is not converted.
    html2txt = re.sub(r'[((]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[))]', r"(\1)".replace(" ", "").replace("(IV)", "Ⅳ"), html2txt)
    html2txt = re.sub(r'[((](\s*\d\s*\d?\s*分?\s*)[))]', "(" + r'\1'.replace(" ", "") + ")", html2txt)
    html2txt = re.sub(r'\[来源:.*?\]', "", html2txt)
    html2txt = re.sub('<p>欢迎访问.*?</p>', '', html2txt)
    html2txt = re.sub('w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?<!["“=])http:.*?\.(com|cn|org)', "", html2txt)  # ww w.gkstk.c om
    html2txt = re.sub(r'<(table|td|tr) [a-z]+="\d+">', r'<\1>', html2txt)
    html2txt = re.sub('<p>\s*第\s*[二三四ⅡⅢⅣ]\s*(卷|部分)\s*([((].*?[))]|非?选择题.{,8})?\s*</p>', "\n", html2txt)
    # Options: put option A on its own paragraph when it follows the stem's "( )".
    html2txt = re.sub(r'(<p>\s*([1-9]|[1-4][0-9])\s*[..、、].+?[((]\s*[))])\s*(A\s*[..、、][^/]*?</p>)',
                      r"\1</p>\n<p>\3", str(html2txt))
    # Delete hidden images, judged by abnormal width/height ----------------------

    def sub1(ss):
        """Drop an <img> whose width and height are both <= 2pt; keep any other."""
        if float(ss.group(1)) <= 2 and float(ss.group(2)) <= 2:
            return ""
        else:
            return ss.group(0)
    html2txt = re.sub(r'<img src=.*? width="([\d.]+)pt" height="([\d.]+)pt"\s*/?>',sub1, html2txt)
    # -------------------------------------------------------------
    # Remove the alt text carried by images
    html2txt = re.sub(r'(<img src=.*?)alt=".+?"', r"\1", html2txt)
    # Question numbers
    html2txt = re.sub(r'([ED]\s*[、..、].*?(\s|</su[pb]>\s*))(([1-9]|[1-4][0-9])\s*[、..、])', r"\1</p>\n<p>\3", html2txt)
    html2txt = re.sub(r'(</?p>\s*(<img src=.*?"\s*/?>\s*)?([1-9]|[1-4][0-9]))\s*([((]\s*(\d{1,2}[.\s\d]*?分|.{2,3}题?)\s*[))]|解析?\s*[::]|【解析】)',
                      r"</p>\1、\4", html2txt)
    html2txt = re.sub(r"<p>\s*([1-9]|[1-4][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"<p>\1、\2", html2txt)
    html2txt = re.sub(r"<p>\s*([1-9]|[1-4][0-9])\s*(【(解析?|答案?)】|(解析?|答案?)\s*[::])", r"<p>\1、\2", html2txt)
    html2txt = re.sub(r"<p>\s*(<img src=((?![/\"]>).)+?[/\"]>)\s*([1-9]|[1-4][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])",
                      r"<p>\1</p>"+"\n"+r"<p>\3、\4", html2txt)  # 【susp_img】
    html2txt = re.sub(r'</?p>((\s*<su[bp]>\s*)?<img src=.*? height="[\d.]+pt"\s*/?>(\s*</su[bp]>)?\s*)(([1-9]|[1-4][0-9])\s*[、..、])',
                      r"</p>\1</p>" + "\n" + r"\4", html2txt)
    html2txt = re.sub(r"(<p>((?!<p>).)+?(\s|[/\"]>))(([1-9]|[1-4][0-9])\s*[、..、].{,20}本[大小]?题\d+分)", r"\1</p>" + "\n<p>" + r"\4",
                      html2txt)
    html2txt = re.sub(r"</?p>((\s*<su[bp]>\s*)?<img src=.*?[/\"]>(\s*</su[bp]>)?((\s*<su[bp]>\s*)?<img src=.*?[/\"]>(\s*</su[bp]>)?)*?\s*)"
                      r"\s*(([1-9]|[1-4][0-9])\s*[、..、])", r"</p>\1</p>" + "\n<p>" + r"\7", html2txt)
    # NOTE(review): the following substitution appears twice, back to back, with
    # identical arguments - presumably a deliberate second pass; confirm.
    html2txt = re.sub(r'(<p>\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-4][0-9])\s*[、..、].*?)</p>',
                      r"\1</p>\n<p>\2</p>", html2txt)
    html2txt = re.sub(r'(<p>\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-4][0-9])\s*[、..、].*?)</p>',
                      r"\1</p>\n<p>\2</p>", html2txt)
    html2txt = re.sub(r'(<p>.*?[..]{6,}\s*\d+分)\s*(([1-9]|[1-4][0-9])\s*[、..、].*?)</p>', r"\1</p>\n<p>\2</p>", html2txt)
    html2txt = re.sub(r'([1-9]|[1-4][0-9])\s*([((]\s*\d{1,2}[.\s\d]*?分\s*[))])\s*[、..、]', r"\1" + "、" + r"\2", html2txt)
    # Build the image-id dictionaries and replace each original <img> tag once.
    all_image = re.findall(r'<img src=".*?[/\"]>', html2txt)
    src2subs = {}  # original <img> tag -> placeholder
    subs2src = {}  # placeholder -> original (or remapped) <img> tag
    for src in all_image:
        kk = re.search('(<img src=".*?image[\da-z]+\.(png|gif|jpg|jpeg))', src)
        # Images uploaded by the school item bank have random names, hence the mapping.
        # NOTE(review): raises KeyError when img_url is a dict lacking this key.
        new_src = src.replace(kk.group(1), img_url[kk.group(1)]) if type(img_url) == dict and kk else src
        # (A disabled image-resizing experiment used to live here.)
        # ----------------------------------------------------------------------------------
        # Simplified replacement of the image information
        src_info = re.search(r'<img src=".*?/image([\da-z]+)\..*?(data-latex=".*?")', src)
        mathpix = " "+src_info.group(2).replace("\n", "").strip().replace(" ", "") if src_info else ""
        # A data-latex attribute longer than 20 characters is not carried over.
        if mathpix and len(mathpix) > 20:
            mathpix = ""
        w_h_info = re.search('<img src=".*?/image([\da-z]+)\..*?width="([\d.]+)pt"\s*height="([\d.]+)pt"\s*/?>', src)
        w_h = " w_h=" + w_h_info.group(2).split('.')[0] + "*" + w_h_info.group(3).split('.')[0] \
            if w_h_info and not mathpix else ""  # only one of w_h and mathpix is present
        # NOTE(review): re.search returns None for a src without "/image<id>.",
        # in which case `.group(1)` raises AttributeError.
        src2subs[src] = '<imgsrc'+re.search(r'<img src=".*?/image([\da-z]+)\.', src).group(1)+w_h+mathpix+"/>"
        subs2src['<imgsrc'+re.search(r'<img src=".*?/image([\da-z]+)\.', src).group(1)+w_h+mathpix+"/>"] = new_src
    for k, v in src2subs.items():
        html2txt = html2txt.replace(k, v)
    # ------------------------------------------------------------------------
    # HTML -> list
    html2txt = re.sub(r'(</?div>|</table>|<?body>)(\n\s*)*?<p>', r"\1</p>"+"\n<p>", html2txt, flags=re.S)
    # Split on closing </p> / </hN>; a chunk holding several openers is split again.
    con_list = sum([re.split('<p>|<h[12345]>', i) if len(re.findall("<p>|<h[12345]>", i))>1 else [i] for i in
                    re.split(r"</p>(?!</td>)|</h[12345]>", html2txt)[:-1]], [])
    con_list = [re.sub(r"^\n*\s*(<p>|<h[12345]>)+", "", ii) for ii in con_list]
    con_list = [re.sub(r"^<([a-z]+)>[\s\t\n]*</\1>$", "", i.strip()) for i in con_list]  # 2020/4/7,14
    con_list = [re.sub(r"^<table>(.|\n)+?([一二三四五六七八九十])\s*[、..、]\s*(.{2,4}题)(.|\n)+?</table>", r"\2、\3", i.strip())
                for i in con_list]
    # Remove any remaining </?p> and the sealing-line / candidate-info residue.
    con_list = [re.sub("</?p>|[…O•.\s]*?密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?[….O•\s]*?$"
                       "|((学校|班级|姓名|座位号|准考号|[学考]号)[\s::_]*?){2,}$", "", i.strip()) for i in con_list]
    # Answer-heading lines: split leading image placeholders from the heading text.
    temp_list = [re.split("^((\s*<imgsrc.*?[/\"]>\s*)+)", v.strip(), maxsplit=1)
                 if re.match(r'(\s*<imgsrc.*?[/\"]>\s*)+?(参考|考试|试[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则))\s*$'
                             r'|(\s*<imgsrc.*?[/\"]>\s*)+?评分标准'
                             r'|(\s*<imgsrc.*?[/\"]>\s*)+?(参考|考试|试[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则))\s*(物理|理综|数学|化学|生物)?\s*$',
                             re.sub(r"[上下]?学[年期]|[\d—【】..、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]", "",
                                    v.strip())) else [v] for v in con_list]
    con_list = sum(temp_list, [])
    # Lines that are only a run of numbers such as "2、3、4、5、" get a 【fei】 prefix
    # so they are not taken for question numbers. (Original note: needs rework.)
    con_list = [re.sub(r"^\s*([1-9][0-9]?\s*[..、、])", r"【fei】\1", i.strip())
                if (len(re.findall(r"(^|\s*[..、、])\s*[1-9][0-9]?\s*[..、、]", i)) >= 3
                    and len(re.sub(r"[\d..、、\s]", "", i)) < 2) else i for i in con_list]
    # Drop the first paragraph when it contains no Chinese character.
    if con_list and re.search(r"[\u4e00-\u9fa5]", con_list[0]) is None:
        con_list = con_list[1:]
    # Drop trailing copyright-statement / account-info lines.
    while con_list and re.search(r"声明[::].*?著作权属.*?所有|(邮箱|用户|日期|QQ)\s*[::].+?", con_list[-1]):
        con_list = con_list[:-1]
    return con_list, subs2src
  304. def del_no(item, item_no_type=1):
  305. """去开头的题号"""
  306. if item_no_type==2:
  307. item = re.sub('^\n*\s*[((]\s*([1-9]|[1-4][0-9])\s*[))]\s*[..、、::]?', "", item)
  308. return item
  309. item = re.sub('^\n*\s*([1-9]|[1-4][0-9])\s*[..、、::]', "", item)
  310. return item
  311. def html_cleal_test(htmlf): # 不用
  312. html2txt = re.sub(r"&nbsp;", "", htmlf.read()) # ("", " ")
  313. # html2txt.replace("①", "(1).").replace("②", "(2).").replace("③", "(3).")
  314. con_list = [re.sub(r"^\n+\s+<p>", "", ii) for ii in html2txt.split("</p>")[:-1]]
  315. # pprint(con_list)
  316. if re.search(r"[\u4e00-\u9fa5]", con_list[0]) is None:
  317. con_list = con_list[1:]
  318. return con_list
  319. def get_md5(image_id):
  320. """
  321. 由于hash不处理unicode编码的字符串(python3默认字符串是unicode)
  322. 所以这里判断是否字符串,如果是则进行转码
  323. 初始化md5、将image_name进行加密、然后返回加密字串
  324. """
  325. image_name = str(image_id) + str(time.time()) + str(random.random())
  326. image_name = image_name.encode("utf-8")
  327. md = hashlib.md5()
  328. md.update(image_name)
  329. return str(md.hexdigest())
def huanhang_wash_after(res_dict):
    """
    Post-process the parsed question dicts in place and return the same list.

    1. removes surplus line breaks (remaining newlines become ``<br/>``);
    2. for choice and fill-in-the-blank questions whose stem already shows the
       answer, blanks it in the stem and stores it in ``answer`` if that is empty;
    3. normalises the question-type names and records mismatches in ``errmsgs``.

    :param res_dict: list of question dicts (despite the name). Keys read
        directly include ``content``, ``parse``, ``answer``, ``errmsgs`` (a list),
        ``item_topic_name``, ``item_id`` and ``score``; ``options``, ``slave``,
        ``analy``, ``chapter`` and ``option_st`` are optional.
    :return: ``res_dict``; each dict gains ``stem``, ``type``, ``topic_num``,
        ``key``, ``analysis``, ``option_str``, ``slave_img``, ``parse_img`` and
        ``stem_img``, has ``errmsgs`` joined into one string, and loses
        ``content``, ``answer``, ``item_topic_name``, ``score`` and ``item_id``.
    """
    # NOTE(review): block nesting in this function is reconstructed - confirm
    # against the upstream file before relying on it.
    # Answers typed into a blank, e.g. "是__5__m。"
    pattern1 = re.compile(r"([是为点]|等于|=|=|有|存在)\s*_+((<img src=.*?[/\"]>|[^_;;。?!,])+?)(?<![==_])_+([cdkm上]?m?\s*.?[。.?]?\s*($|<br/>|<img src|……))")
    # Purely numeric answers typed into a blank, e.g. "有__3__个"
    pattern2 = re.compile(r"((有|存在|[是为])[\u4e00-\u9fa5]{0,2})\s*_+(\d+)_+\s*([\u4e00-\u9fa5,,;;。..])")
    chapter_no = {}  # filled below but not read again in this function
    option_st = 0
    is_optional = False
    option_score = 0
    select_type_id = []
    for num, sr in enumerate(res_dict):
        # Strip sealing-line and candidate-info residue, then section headers.
        sr["content"] = re.sub(r"\n[_\-\s]*密[…O•.\s]*封[….O•\s]*装?[…O•.\s]*订?[….O•\s]*线?"
                               r"|\n\s*((学校|班级|姓名|座位?号|准考号|学号)[\s::_]*){2,}", "", sr["content"])
        sr["content"] = re.sub('\n\s*(第\s*[^\s]\s*卷|第[一二三四]部分)\s*([((].*?[))]|非?选择题.{,8})?\s*\n', "\n", sr["content"])
        sr["content"] = sr.get("content", "").strip().replace("\n\n", "\n").replace("\n", "<br/>")  # 2020/4/10 changed
        # Last question only: warn when it looks like it swallowed trailing material.
        if num == len(res_dict)-1:
            end_con = sr["content"] + sr["parse"]
            if len(re.findall(r"[\u4e00-\u9fa5]", end_con))>1000 and (len(re.findall("\n\s*([1-9]|1[0-9])\s*[..、、].+?",
                                                                                      end_con))>4 or len(re.findall("[((]\s*[))]|_{2,}", end_con))>6):
                sr['errmsgs'].append("原试卷格式有问题,导致本题可能包含了很多非本题的题文")
        if not re.sub("[(())\n\s]", "", sr["content"]):
            sr['errmsgs'].append("本题没有题干,请检查题干格式是否正确")
        # Leading/trailing line breaks are removed because the PHP receiver joins with line breaks.
        # Choice / fill-in questions whose stem already shows the answer: remove it.
        kuo_con1 = re.search('([是为]|等于|[==有]|表示)[((]([A-Z][A-Z;;和与、、]*?)[))](.?($|<br/>|<img))', sr["content"].replace(" ", ""))
        kuo_con2 = re.search("[((]([A-Z][A-Z;;和与、、]*?)[))](.?($|<br/>))", sr["content"].replace(" ", ""))
        if sr['item_topic_name'].replace("题", "") in ["单选", "多选", "选择", "单项选择", "多项选择"]:
            # Choice question with the answer given in the stem.
            if kuo_con1:
                sr["content"] = sr["content"].replace(kuo_con1.group(0), kuo_con1.group(1)+"( )" + kuo_con1.group(3))
                sr["answer"] = kuo_con1.group(2) if not sr["answer"] else sr["answer"]
            elif kuo_con2:
                sr["content"] = sr["content"].replace(kuo_con2.group(0), "( )" + kuo_con2.group(2))
                sr["answer"] = kuo_con2.group(1) if not sr["answer"] else sr["answer"]
            if "options" in sr:  # normalise the line breaks of the options
                for i in range(len(sr['options'])):
                    sr['options'][i] = sr['options'][i].lstrip().replace("\n\n", "\n").replace("\n", "<br/>")
        elif sr['item_topic_name'] == '填空题':
            ans_list = []
            # Fill-in question with the answer given in the stem.
            while re.search(pattern1, sr["content"]):
                blank_con1 = re.search(pattern1, sr["content"])
                sr["content"] = sr["content"].replace(blank_con1.group(0), blank_con1.group(1)+"____" + blank_con1.group(4))
                ans_list.append(blank_con1.group(2))
            while re.search(pattern2, sr["content"]):
                blank_con2 = re.search(pattern2, sr["content"])
                # (Original note: this condition is error-prone and may need another check.)
                sr["content"] = sr["content"].replace(blank_con2.group(0), blank_con2.group(1) + "____" + blank_con2.group(4))
                # NOTE(review): group(2) of pattern2 is the verb (有|存在|是|为);
                # the digits are group(3) - looks like the wrong group is stored.
                ans_list.append(blank_con2.group(2))
            if re.findall(r"_{2,}", sr["content"]):
                sr["blank_num"] = len(re.findall(r"_{2,}", sr["content"]))
            if not sr["answer"] and ans_list:
                sr["answer"] = "; ".join(ans_list)
            # The given type is wrong, e.g. a free-response question filed under fill-in.
            if 'blank_num' not in sr and re.search("_+([^_]*?)_+", sr['content']) is None:
                sr['errmsgs'].append("填空题题干中没有下划线(__),与题型(填空题)不符")
        else:  # no range check on the large-question types for now
            if sr['item_topic_name'] and sr['item_topic_name'].replace("题", "") not in ["解答", "计算", "实验","作图"]:
                sr["type1"] = "解答"
            else:
                sr["type1"] = sr['item_topic_name'].replace("题", "")
        sr["option_str"] = ""
        if "slave" in sr and sr["slave"]:
            # Large question with sub-questions: normalise each sub-question.
            # (Original note: senior-high maths does not use this feature.)
            for s in sr["slave"]:
                s["content"] = s.get("content", "").strip().replace("\n\n", "\n").replace("\n", "<br/>")
                # Sub-questions already split out carry no sub-number, so none is stripped here.
                s["parse"] = s.get("parse", "").strip().replace("\n\n", "\n").replace("\n", "<br/>")\
                    .replace("解答:解:", "解答:").replace("解答:解:", "解答:")
                s["answer"] = s.get("answer", "").strip().replace("\n\n", "\n").replace("\n", "<br/>")
        else:
            sr["parse"] = sr.get("parse", "").lstrip().replace("\n\n", "\n").replace("\n", "<br/>")
            sr["parse"] = re.sub("^【解[答析]】\s*", "", sr["parse"])
            sr["answer"] = sr.get("answer", "").lstrip().replace("\n\n", "\n").replace("\n", "<br/>")
            if not sr["parse"] and not sr["answer"]:  # neither answer nor analysis
                sr["parse"] = "略"
                sr["answer"] = "略"
                sr['errmsgs'].append("本题缺少答案和解析")
            elif not sr["answer"] and sr["parse"]:
                sr["answer"] = "见解析"
            elif sr["answer"] and not sr["parse"]:
                sr["parse"] = "略"
                sr['errmsgs'].append("本题缺少解析")
        # Auxiliary labels
        sr["analysis"] = ""
        if "analy" in sr:  # a question analysis exists: move it to "analysis"
            sr["analysis"] = sr.get("analy", "").strip().replace("\n\n", "\n").replace("\n", "<br/>")
            del sr["analy"]
        if "chapter" in sr:  # e.g. "选修4-5:不等式选讲"
            if sr['item_id'] + 1 <= len(res_dict):
                chapter_no[sr['item_id']] = sr["chapter"]
            del sr["chapter"]
        # Optional-question flag "is_optional"; the two forms do not occur together.
        if "option_st" in sr:  # every question from this one on is optional
            option_st = sr['item_id']
            is_optional = True
            if "," in sr["option_st"]:
                option_score = int(sr["option_st"].split(",")[-1])
            del sr["option_st"]
        elif sr['item_topic_name'] == '选做题':  # the type itself is "optional", e.g. "五、选做题"
            select_type_id.append(sr['item_id'])
            sr['is_optional'] = 'true'
            sr['score'] = option_score
        elif "type1" in sr and sr["type1"] == "解答" and "is_optional" not in sr:
            sr["is_optional"] = is_optional
            if is_optional:
                sr['score'] = option_score
        if "type1" in sr:
            del sr["type1"]
        # Rename choice questions to single-choice / multiple-choice.
        sr['item_topic_name'] = re.sub("([单多])项选择题?", r"\1选题", sr['item_topic_name'])
        sr['item_topic_name'] = sr['item_topic_name'].replace("简答", "解答")
        if sr['item_topic_name'] == '多选题':
            if len(re.findall("[A-Z]", sr["answer"])) == 1:
                sr['errmsgs'].append("本题答案个数与题型(多选题)不符")
        elif sr['item_topic_name'] == '单选题':
            if "options" in sr and len(sr["options"]) > 4:
                sr['errmsgs'].append("选项个数多于4个,与题型(单选题)不符")
            if len(re.findall("[A-Z]", sr["answer"])) > 1:
                sr['errmsgs'].append("本题答案个数与题型(单选题)不符")
        # Arrange the output in the format formerly used by the senior-high maths parser.
        sr["stem"] = sr["content"]
        sr["type"] = sr['item_topic_name'].replace("非选择", "解答")
        sr["topic_num"] = sr['item_id']
        # From here on 'errmsgs' is a single ";"-joined string, no longer a list.
        sr['errmsgs'] = ";".join(sr['errmsgs'])
        sr["parse"] = re.sub(r"试题【([分解]析)】", r"试题\1:", sr["parse"])  # analysis
        sr["key"] = re.sub("([;;]|<br/>)\s*$", "", sr["answer"])
        sr["slave_img"] = ""
        sr["parse_img"] = ""
        sr["stem_img"] = ""
        if 'susp_pic' in sr:
            del sr['susp_pic']
        if 'is_optional' in sr:
            del sr['is_optional']
        if 'spliterr_point' in sr:
            del sr['spliterr_point']
        del sr["content"], sr["answer"], sr['item_topic_name'], sr['score'],sr['item_id']
    # ------------------------------------------------------------------------
    # "option_str" for optional questions
    # NOTE(review): 'text_errmsgs' is not set anywhere in this function, so the
    # `+=` branches below raise KeyError unless the caller provided that key.
    if select_type_id:
        for s in select_type_id:
            if len(select_type_id) == 2:
                res_dict[s-1]['option_str'] = "2选1"
            elif len(select_type_id) == 4:
                res_dict[s - 1]['option_str'] = "4选2"
            else:
                res_dict[s-1]['text_errmsgs'] += ";<br/>选做题不是“2选1”和“4选2”类型"
    if option_st:
        print("option_st:", option_st)  # debug output
        for s in range(option_st, len(res_dict)):
            if (len(res_dict) - option_st) == 2:
                res_dict[s]['option_str'] = "2选1"
            elif (len(res_dict) - option_st) == 4:
                res_dict[s]['option_str'] = "4选2"
            else:
                res_dict[s]['text_errmsgs'] += ";<br/>选做题不是“2选1”和“4选2”类型"
    return res_dict
  516. def insert_sort2get_idx(item_list, num):
  517. """
  518. :param item_list: 拍好序的列表
  519. :param num: 插入的数值
  520. :return: 插入的位置
  521. """
  522. add_n = 0
  523. for i in range(len(item_list)):
  524. if num > item_list[i]:
  525. add_n += 1
  526. else:
  527. break
  528. return add_n
  529. # def find_seq_num(num_list):
  530. # """
  531. # 针对切分题号时切错的序号进行纠正,考虑序号是连续且正常的情况下
  532. # 将连续的数字进行分组
  533. # :param num_list:输入[3, 4, 8, 9, 12, 13, 14]
  534. # :return: [[3, 4],[8, 9],[12, 13, 14]]
  535. # """
  536. # seq_ranges = []
  537. # for k, g in groupby(enumerate(num_list), lambda x: x[0] - x[1]):
  538. # group = (map(itemgetter(1), g))
  539. # group = list(map(int, group))
  540. # seq_ranges.append(group)
  541. # return seq_ranges
  542. # def del_exception_value(item_list):
  543. # """
  544. # 去列表中的异常值,题目越多,越容易突出异常值
  545. # :return:
  546. # """
  547. # import numpy as np
  548. # max_v = max(item_list)
  549. # arr_mean = np.mean(item_list) # 均值
  550. # arr_var = np.var(item_list) # 方差
  551. # while max_v > len(item_list)+4:
  552. # item_list.remove(max_v)
  553. # print(item_list)
  554. # arr_mean = np.mean(item_list) # 去最大值后的均值
  555. # arr_var = np.var(item_list) # 去最大值后的方差
  556. # max_v = max(item_list)
  557. # # print("均值与方差:",arr_mean,arr_var)
  558. # if abs((item_list[-1] - item_list[0] + 1) - len(item_list)) <= 3:
  559. # return item_list
  560. # else:
  561. # exception_value = []
  562. # for i in item_list:
  563. # # print(abs((i - arr_mean) / arr_var), i)
  564. # if(abs((i - arr_mean)/arr_var)) > 0.3:
  565. # exception_value.append(i)
  566. # right_seq = [i for i in item_list if i not in exception_value]
  567. # return right_seq
  568. def pic_transfer(con_list):
  569. aft_opt = [] # 针对选项后是题目图片的情况,进行移位
  570. if "\n" in con_list[-1]:
  571. ccon = re.split("\n+", con_list[-1])
  572. while re.match("<img src=", ccon[-1]) and len(ccon) > 1:
  573. aft_opt.insert(0, ccon[-1])
  574. ccon = ccon[:-1]
  575. if aft_opt:
  576. con_list[0] += "\n" + "\n".join(aft_opt)
  577. con_list[-1] = "\n".join(ccon)
  578. con_list[0] = re.sub(r"\(\d+分\)", "", con_list[0][:9]) + con_list[0][9:]
  579. return con_list
  580. def judge_split_error(item_list):
  581. """
  582. 转对试卷切分后的小题判断是否存在切分错误的情况,能纠错就纠错,不能则删除
  583. :return:
  584. """
  585. # for k, v in enumerate(item_list):
  586. # if k>0 and v['item_id'] - item_list[k-1]['item_id']>1:
  587. # if
  588. if __name__ == '__main__':
  589. # -------------生成requirements.txt---------------
  590. # pip freeze > requirements.txt
  591. # import os, sys
  592. #
  593. # project_root = os.path.dirname(os.path.realpath(__file__)) # 找到当前目录
  594. # print(project_root)
  595. #
  596. # # 找到解释器,虚拟环境目录
  597. # python_root = sys.exec_prefix
  598. # print(python_root)
  599. #
  600. # # 拼接生成requirements命令
  601. # command = python_root + '\Scripts\pip freeze > ' + project_root + '\\requirements.txt'
  602. # print(command)
  603. #
  604. # # 执行命令。
  605. # os.system(command)
  606. # ----------------一键安装 requirements.txt------------
  607. # pip install -r requirement.txt
  608. # python_root + '\Scripts\' + pip install -r requirements.txt
  609. ans_no0=[16, 17, 18, 19, 20]
  610. print(ans_no0[ans_no0.index(1):])
  611. # # b = del_exception_value(a)
  612. # print(b)
  613. # import os
  614. # rrr=os.path.basename(r"http:/pstatic.dev.xueping.com/data/word/2020/08/12/5f338d18e2cce.docx")
  615. # print(rrr)