Sfoglia il codice sorgente

add html中<br>的清洗

cdZWj 5 mesi fa
parent
commit
7483661600
2 file modificati con 11 aggiunte e 5 eliminazioni
  1. 7 4
      structure/structure_main.py
  2. 4 1
      utils/html_again_parse.py

+ 7 - 4
structure/structure_main.py

@@ -1,6 +1,7 @@
 #!/usr/bin/env/python
 # -*- coding:utf-8 -*-
 
+
 from pprint import pprint
 # from utils.exam_type import get_exam_type
 from structure.final_structure import one_item_structure
@@ -13,6 +14,7 @@ from func_timeout import func_set_timeout
 
 from utils.xuanzuoti2slave import toslave_bef, toslave_aft
 
+
 paper_types = ["第三种试卷格式:题目与答案分开",
                "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字",
                "第一种试卷格式:教师用卷,含答案和解析关键字"]
@@ -216,11 +218,12 @@ if __name__ == '__main__':
     #    6239991e6ca622396925f66b     624cf82d12cd45a7836f3430  626b4b1f81b582c0470d01b0
     # 627b64b0814132f0d7b12589    627b622981b582c0470d020e
     # 6294326cf84c0e279ac6484e.html   62903acaf84c0e279ac647fb
-    path2 = r"C:\Users\Python\Desktop\62d8eaaa6c6aff2279346c1e.html"
+    path2 = r"C:\Users\Administrator\Desktop\666a67fec3c4da9e7009b531.html"
+    path2 = r"C:\Users\Administrator\Desktop\66459c62c3c4da9e7009ae9d.html"
     # path2 = r"F:\zwj\Text_Structure\accept_files\62aae86a765759d85567a475.html"
-    # html = open(path2, "r", encoding="utf-8").read()
+    html = open(path2, "r", encoding="utf-8").read()
     # html = json.loads(html)  621845626ca622396925f55c
-    html = """
+    html2 = """
 1. I’m anxious___________ your injury.Are you feeling any better now?
 2. After he was back on his feet, he was anxious___________ (return) to school as soon as possible.
 3. Helen was ___________ to death when she saw the ___________scene.She hid herself in the corner, shaking with___________(fright).
@@ -234,7 +237,7 @@ if __name__ == '__main__':
 11. He has worked for nearly 20 years, so he is senior ___________ most of his workmates.
 12. Although he is three years junior ___________ me, he has more work experience.
     """
-    res1 = StructureExporter(html, "", "语文", 1).export()
+    res1 = StructureExporter(html, "202406131725", "语文", 1).export()
     # new_fpath = os.path.join(r"G:\zwj\WL\Text_Structure\fail_files", "res_政治.json")
     # re_f = open(new_fpath, 'w', encoding='utf-8')
     # json.dump(res1[0]["items"], re_f, ensure_ascii=False)

+ 4 - 1
utils/html_again_parse.py

@@ -87,8 +87,9 @@ def css_label_wash(content):
         a = []
 
         if html.children():
+            # temph = [str(i) for i in html.children().items()]
             for line in html.children().items():  # <p>.*?</p>里面的内容可能会被过滤掉
-                test = line.text()
+                # test = str(line)  # line.text()
                 # 保留下划线及着重符标签   <span style="text-decoration: underline;">
                 # 波浪线:<span style="text-decoration: underline wavy;">
                 # pq会将多个空格换成一个
@@ -160,6 +161,8 @@ def css_label_wash(content):
                 elif str(line).startswith("<ol"):
                     for i, ss in enumerate(line.children().items()):
                         a.append(str(i + 1) + "." + ss.text())
+                elif str(line).startswith("<br>") or str(line).startswith("<br/>"):
+                    a.append(str(line))
                 else:
                     # print('test:',line.text())  # 自动去掉了图片
                     if line.text().strip():