Explorar o código

结构化局部修改,换行符清洗修改

cdZWj hai 5 meses
pai
achega
953fa7a509

+ 11 - 9
configs.py

@@ -6,6 +6,7 @@ import time
 import os
 import sys
 import datetime
+import logging.handlers
 
 
 class myLog(object):
@@ -32,15 +33,16 @@ class myLog(object):
         # self.log_name = self.log_path + "/" + log_cate + "." + self.log_time + '.log'
         # self.log_name = os.path.join(log_dir, 'parse_log.log')  # 日志地址
         self.log_name = os.path.join(parse_log_dir, '{}.log'.format(log_cate))  # 日志地址
-        if os.path.exists(self.log_name):  # 设置日志定长自动新建
-            logsize = os.path.getsize(self.log_name)
-            if logsize > 180000000:  # 180M
-                os.rename(self.log_name, os.path.join(parse_log_dir, '{}_{}.log'.format(log_cate,
-                          datetime.datetime.now().strftime('%m_%d'))))
-
-        # fh = logging.FileHandler(self.log_name, 'a')  # 追加模式  这个是python2的
-        fh = logging.FileHandler(self.log_name, mode='a', encoding='utf-8', delay=True)
-        # fh = logging.FileHandler(self.log_name, 'a', encoding='utf-8')  # 这个是python3的
+        # if os.path.exists(self.log_name):  # 设置日志定长自动新建
+        #     logsize = os.path.getsize(self.log_name)
+        #     if logsize > 180000000:  # 180M
+        #         os.rename(self.log_name, os.path.join(parse_log_dir, '{}_{}.log'.format(log_cate,
+        #                   datetime.datetime.now().strftime('%m_%d'))))
+
+        # fh = logging.FileHandler(self.log_name, mode='a', encoding='utf-8', delay=True)
+        fh = logging.handlers.RotatingFileHandler(self.log_name, maxBytes=150000000, backupCount=3,
+                                                  mode='a', encoding='utf-8', delay=True)
+        
         fh.setLevel(logging.INFO)
 
         # 再创建一个handler,用于输出到控制台

+ 18 - 2
structure/danti_structure.py

@@ -3,6 +3,9 @@
 """
 单题再解析、结构化
 """
+import sys
+sys.path.append(r"F:\zwj\Text_Structure\new_tiku_structure_v3_art")
+
 import re
 
 from structure.option import option_structure
@@ -38,6 +41,7 @@ def single_parse(one_item, item_type, wordid, subject):
     one_item = re.sub(r"\\\(\s*{\s*\}\s*\\\)", "", one_item)
     one_item = re.sub(r"\\\(\s*\\\)", "", one_item)
     one_item = re.sub(r"\\\(|\\\)", "$", one_item)
+    one_item = re.sub(r"<br\s*/?>", "\n", one_item)  # 20240621
     one_item = re.sub(r"【\d{1,2}题详解】", "【详解】", one_item)
 
     res_list = re.split(r"(\n+【答案】|\n+【解析】)", one_item)
@@ -369,8 +373,20 @@ if __name__ == '__main__':
 <p>【3题详解】</p>
 <p>本题考查学生分析概括作者的观点态度的能力。A.“其中认识是最重要的”说法过于绝对,原文为“第四,也许是最重要的,上述知识、见识和胆识加在一起,形成最后一个‘识’:认识”,说的是也许是最重要的。B.“今天的大学,缺少胆识教育,学生缺乏‘卒然临之而不惊,无故加之而不怒’的大勇”于文无据,原文只说“今天的大学,还要在一定程度上加强学生的‘胆识’教育”,并没有说今天的大学,缺少胆识教育,也没有说学生缺乏“卒然临之而不惊,无故加之而不怒”的大勇。C.“其根本原因就是要实现孟子所倡导的仁爱”说法错误,原文为“实际上,这恰恰是现代教育特别重视培养孩子的社交能力的原因所在”,并没有说是根本原因。故选D。</p>
        """
-
-    aa = single_parse(str(hml4), "小题多问类", "62b269e76c6aff227934605e", "语文")
+    html5 = """
+<div class="stem-wraper" data-v-4baef543=""><span class="stem" data-v-4baef543="">下列各组的句子中,加点词语的意思不同的一项是( )</span></div>
+<ul class="stem-options" data-v-4baef543="">
+<li data-v-4baef543=""><span class="analysis-prefix" data-v-4baef543="">A.</span><span data-v-4baef543="">忆往昔<span class="dots">峥</span><span class="dots">嵘</span><span class="dots">岁</span><span class="dots">月</span>稠。<br>这位老人回忆起长征时的<span class="dots">峥</span><span class="dots">嵘</span><span class="dots">岁</span><span class="dots">月</span>,总是有讲不完的故事。</span></li>
+<li data-v-4baef543=""><span class="analysis-prefix" data-v-4baef543="">B.</span><span data-v-4baef543="">恰同学少年,<span class="dots">风</span><span class="dots">华</span><span class="dots">正</span><span class="dots">茂</span>。<br><span class="dots">风</span><span class="dots">华</span><span class="dots">正</span><span class="dots">茂</span>时当努力奋斗,莫待迟暮时后悔不及。</span></li>
+<li data-v-4baef543=""><span class="analysis-prefix" data-v-4baef543="">C.</span><span data-v-4baef543=""><span class="dots">书</span><span class="dots">生</span><span class="dots">意</span><span class="dots">气</span>,挥斥方道。<br>我刚毕业的时候,没有实践经验,不通世事人情,办事教条化,难免<span class="dots">书</span><span class="dots">生</span><span class="dots">意</span><span class="dots">气</span>。</span></li>
+<li data-v-4baef543=""><span class="analysis-prefix" data-v-4baef543="">D.</span><span data-v-4baef543="">指点江山,<span class="dots">激</span><span class="dots">扬</span>文字。<br>有关部门应监督、管理这些网络平台,<span class="dots">激</span><span class="dots">浊</span><span class="dots">扬</span><span class="dots">清</span>,营造向上向善的网络氛围。</span></li>
+</ul>
+<div class="topic-analysis" data-v-4baef543="">
+<div class="topic-analysis-content" data-v-4baef543=""><span class="analysis-prefix" data-v-4baef543="">【答案】</span><span data-v-4baef543="">C</span></div>
+<div class="topic-analysis-content" data-v-4baef543=""><span class="analysis-prefix" data-v-4baef543="">【解析】</span><span data-v-4baef543="">青年学生踌躇满志,意气奔放;书呆子性情,不知变通。</span></div>
+</div>
+"""
+    aa = single_parse(str(html5), "选择类", "6673eeb2c3c4da9e7009b739", "语文")
     pprint(aa)
     #
 

+ 1 - 0
structure/option.py

@@ -36,6 +36,7 @@ def option2block(option_con, item_no_type):
     if item_no_type == 2:
         con = re.sub(r"\n\s*\(([A-Hc])\)\s*[、、..]?(.+?)", r"\n【【\1、】】\2", option_con)
 
+    con = con.replace("</table>【【", "</table>\n【【")
     if item_no_type == 1:
         if len(re.findall(r'【【[A-H]\s*[..、、]】】', con)) <= 3:
             while re.search(r"\n\s*[A-H]\s*<img\s*src=.+?", con.replace(" ", "")):  # 2020/7/15

+ 4 - 1
structure/structure_main.py

@@ -1,6 +1,8 @@
 #!/usr/bin/env/python
 # -*- coding:utf-8 -*-
 
+import sys
+sys.path.append(r"F:\zwj\Text_Structure\new_tiku_structure_v3_art")
 
 from pprint import pprint
 # from utils.exam_type import get_exam_type
@@ -219,7 +221,7 @@ if __name__ == '__main__':
     # 627b64b0814132f0d7b12589    627b622981b582c0470d020e
     # 6294326cf84c0e279ac6484e.html   62903acaf84c0e279ac647fb
     path2 = r"C:\Users\Administrator\Desktop\666a67fec3c4da9e7009b531.html"
-    path2 = r"C:\Users\Administrator\Desktop\66459c62c3c4da9e7009ae9d.html"
+    path2 = r"C:\Users\Administrator\Desktop\6667f5a8c3c4da9e7009b44b.html"
     # path2 = r"F:\zwj\Text_Structure\accept_files\62aae86a765759d85567a475.html"
     html = open(path2, "r", encoding="utf-8").read()
     # html = json.loads(html)  621845626ca622396925f55c
@@ -244,6 +246,7 @@ if __name__ == '__main__':
     # for i in res1[0]["items"]:
     #     re_f.write(str(i))
     pprint(res1[0]["items"])
+    pprint(res1[0]["html"])
     print('题目数量:', len(res1[0]["items"]))
 
     # new_fpath = r"F:\zwj\Text_Structure\new_tiku_structure_2021\res_folder\10-28.json"

+ 37 - 37
structure/three_parse_structure.py

@@ -385,43 +385,43 @@ def split_by_topicno(con_list, subject, is_dati=0):
     res = resplit(res)
     # pprint(res)
     # 对最后一个题后面带个别答案(无答案页)
-    if res:
-        pattern1 = re.search('\n\s*([1-9]|[1-9][0-9])\s*[..、、]\s*(解\s*[::]|【解析|【答案)', res[-1]["stem"])
-        if pattern1:
-            breakp = pattern1.start()
-            ans_str = res[-1]["stem"][breakp:]
-            ans_no_info = pre_get_item_no(ans_str, item_no_type)
-            ans_no, ans_no_idx = get_right_no(ans_no_info)
-            all_ans = [del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])]
-            res[-1]["stem"] = res[-1]["stem"][:breakp]
-            res = get_ans_match(res, all_ans, ans_no)
-        else:
-            ans_str = res[-1]["stem"] + res[-1]["parse"]
-            ans_no_info = pre_get_item_no(ans_str, item_no_type)
-            ans_no, ans_no_idx = get_right_no(ans_no_info)
-            if len(ans_no) == len(res):
-                all_ans = [del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])]
-                res[-1]["stem"] = res[-1]["stem"][:ans_no_idx[0]]
-                res = get_ans_match(res, all_ans, ans_no)
-            elif ans_no_idx:
-                try:
-                    ans_no1, table_ans, st = get_table_ans(res[-1]["stem"][:ans_no_idx[0]], [], flag=1)
-                    if table_ans and 0 < ans_no[0] - ans_no1[-1] < 3:
-                        all_ans = table_ans
-                        all_ans.extend([del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])])
-                        new_ans_no = ans_no1
-                        new_ans_no.extend(ans_no)
-                        if st >= 0:
-                            res[-1]["stem"] = res[-1]["stem"][:st]
-                        else:
-                            res[-1]["stem"] = res[-1]["stem"][:ans_no_idx[0]]
-                        res = get_ans_match(res, all_ans, new_ans_no)
-                except:
-                    if len(ans_no) > 4 and all([True if not one_res["key"] and not one_res["parse"]
-                                                    else False for one_res in res[:-1]]):
-                        all_ans = [del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])]
-                        res[-1]["stem"] = res[-1]["stem"][:ans_no_idx[0]]
-                        res = get_ans_match(res, all_ans, ans_no)
+    # if res:
+    #     pattern1 = re.search('\n\s*([1-9]|[1-9][0-9])\s*[..、、]\s*(解\s*[::]|【解析|【答案)', res[-1]["stem"])
+    #     if pattern1:
+    #         breakp = pattern1.start()
+    #         ans_str = res[-1]["stem"][breakp:]
+    #         ans_no_info = pre_get_item_no(ans_str, item_no_type)
+    #         ans_no, ans_no_idx = get_right_no(ans_no_info)
+    #         all_ans = [del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])]
+    #         res[-1]["stem"] = res[-1]["stem"][:breakp]
+    #         res = get_ans_match(res, all_ans, ans_no)
+    #     else:
+    #         ans_str = res[-1]["stem"] + res[-1]["parse"]
+    #         ans_no_info = pre_get_item_no(ans_str, item_no_type)
+    #         ans_no, ans_no_idx = get_right_no(ans_no_info)
+    #         if len(ans_no) == len(res):
+    #             all_ans = [del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])]
+    #             res[-1]["stem"] = res[-1]["stem"][:ans_no_idx[0]]
+    #             res = get_ans_match(res, all_ans, ans_no)
+    #         elif ans_no_idx:
+    #             try:
+    #                 ans_no1, table_ans, st = get_table_ans(res[-1]["stem"][:ans_no_idx[0]], [], flag=1)
+    #                 if table_ans and 0 < ans_no[0] - ans_no1[-1] < 3:
+    #                     all_ans = table_ans
+    #                     all_ans.extend([del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])])
+    #                     new_ans_no = ans_no1
+    #                     new_ans_no.extend(ans_no)
+    #                     if st >= 0:
+    #                         res[-1]["stem"] = res[-1]["stem"][:st]
+    #                     else:
+    #                         res[-1]["stem"] = res[-1]["stem"][:ans_no_idx[0]]
+    #                     res = get_ans_match(res, all_ans, new_ans_no)
+    #             except:
+    #                 if len(ans_no) > 4 and all([True if not one_res["key"] and not one_res["parse"]
+    #                                                 else False for one_res in res[:-1]]):
+    #                     all_ans = [del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])]
+    #                     res[-1]["stem"] = res[-1]["stem"][:ans_no_idx[0]]
+    #                     res = get_ans_match(res, all_ans, ans_no)
 
     # 没有识别出答案切分点的情况,很可能答案里的部分也当成题文进行拆分,所以先判断下是否有相同的id
     all_no = [one_res['item_id'] for one_res in res]

+ 4 - 2
utils/html_again_parse.py

@@ -89,7 +89,7 @@ def css_label_wash(content):
         if html.children():
             # temph = [str(i) for i in html.children().items()]
             for line in html.children().items():  # <p>.*?</p>里面的内容可能会被过滤掉
-                # test = str(line)  # line.text()
+                test = str(line)  # line.text()
                 # 保留下划线及着重符标签   <span style="text-decoration: underline;">
                 # 波浪线:<span style="text-decoration: underline wavy;">
                 # pq会将多个空格换成一个
@@ -180,7 +180,9 @@ def css_label_wash(content):
         if subs2img:
             new_a = re.sub("|".join(subs2img.keys()), lambda x: subs2img[x.group()], new_a)
         # new_a = "<p>" + new_a.replace("\n\n", "\n").replace("\n", "</p>\n<p>") + "</p>"
-        new_a = "<p>" + new_a.replace("\n\n", "\n") + "</p>"
+        new_a = re.sub("(?<!</p>)\s*\n", "<br/>",  new_a.replace("\n\n", "\n"))  # 2024.6.13
+        new_a = re.sub("<br/>(\n|<br/>)+", "<br/>", new_a)
+        new_a = "<p>" + new_a + "</p>"
         new_a = re.sub(r'<p>(<p (class|style)=.+?)</p>$', r"\1", new_a, flags=re.S)
         # for sb, img in subs2img.items():  # 2021
         #     new_a = new_a.replace(sb, img)