# split_topic_en.py
  1. inf_words_dict = dict()
  2. with open("./segment/ocr/type_config.txt", "r", encoding="utf-8") as f:
  3. for i, line in enumerate(f):
  4. if line.startswith("#"):
  5. continue
  6. line = line.strip().replace(":", ":").replace(",", ",")
  7. key, val = line.split(":")
  8. key = key.strip()
  9. val = val.split(",")
  10. val = tuple(v.strip() for v in val)
  11. inf_words_dict[val] = key
# Answer-line prefix: "答案:"
# Explanation-line prefix: "解析:"
  14. def could_skip_line(line):
  15. '''对于答案和解析行,不进行type_inf'''
  16. return line.startswith("答案:") or line.startswith("解析:")
  17. def contains_all(s, words):
  18. return all([w in s for w in words])
  19. def topic_type_line(line):
  20. if could_skip_line(line):
  21. return False
  22. for key, val in inf_words_dict.items():
  23. if contains_all(line, key):
  24. return True
  25. return False