school_dup_logic.py 3.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. import time
  2. from config import mongo_coll_school, log_msg
  3. from mysql_operate import mysql_operate
  4. class School_Dup_Logic():
  5. def __init__(self, logger):
  6. self.logger = logger
  7. # 校本题库查重逻辑判断
  8. def __call__(self, hnsw_process, retrieve_list):
  9. # 先进行云题库查重, 再进行校本题库查重
  10. cloud_list, school_list = hnsw_process(retrieve_list, hnsw_index=1)
  11. # 返回字典
  12. res_dict = dict()
  13. # 遍历retrieve_list
  14. for i,data in enumerate(retrieve_list):
  15. topic_id = data["topic_id"]
  16. res_dict[topic_id] = {'school_id': data['school_id']}
  17. # 先进行云题库查重结果标注判断,若不满足条件,则再进行校本题库查重结果标注判断
  18. # 云题库查重结果标注判断
  19. ele_list = self.math_mark_judge(cloud_list[i])
  20. if len(ele_list) > 0:
  21. res_dict[topic_id]["similar_topic_id"] = ele_list[0]
  22. res_dict[topic_id]["similar_score"] = ele_list[1]
  23. # 日志采集
  24. self.logger.info(log_msg.format(id=topic_id,
  25. type="chc查重",
  26. message="云题库{}已标注".format(topic_id)))
  27. else:
  28. # 校本题库标注预处理
  29. school_list1, school_list2 = [], []
  30. for ele in school_list[i]:
  31. school_data = mongo_coll_school.find_one({"topic_id": ele[0]})
  32. if "save_time" in school_data:
  33. if school_data["save_time"] <= (time.time() - 7 * 24 * 3600):
  34. school_list1.append(ele)
  35. elif school_data["save_time"] > (time.time() - 7 * 24 * 3600):
  36. school_list2.append(ele)
  37. else:
  38. school_list1, school_list2 = school_list[i], school_list[i]
  39. break
  40. # 校本题库查重结果标注判断
  41. ele_list = self.math_mark_judge(school_list1)
  42. if len(ele_list) > 0:
  43. res_dict[topic_id]["similar_topic_id"] = ele_list[0]
  44. res_dict[topic_id]["similar_score"] = ele_list[1]
  45. # 日志采集
  46. self.logger.info(log_msg.format(id=topic_id,
  47. type="chc查重",
  48. message="校本题库{}已标注".format(topic_id)))
  49. # 若无标注结果,则返回相似度最高的查重结果
  50. elif len(school_list2) > 0:
  51. first_dup_list = school_list2[0]
  52. res_dict[topic_id]["similar_topic_id"] = first_dup_list[0]
  53. res_dict[topic_id]["similar_score"] = first_dup_list[1]
  54. # 若查重无结果,则返回空字符串
  55. elif len(school_list2) == 0:
  56. res_dict[topic_id]["similar_topic_id"], res_dict[topic_id]["similar_score"] = '', ''
  57. return res_dict
  58. # mysql数据查询函数
  59. def mysql_fetch(self, topic_id):
  60. try:
  61. # sql数据查询语句
  62. fetch_sql = "select is_owned,is_mark from task_teacher_topic where topic_id=%d" % (int(topic_id))
  63. fetch_dict = mysql_operate(fetch_sql)
  64. return fetch_dict
  65. except Exception as e:
  66. return None
  67. # 判断校本题目是否标注
  68. def math_mark_judge(self, sim_list):
  69. # 遍历查重返回列表
  70. for ele_list in sim_list:
  71. # 数据查询
  72. fetch_dict = self.mysql_fetch(ele_list[0])
  73. # 判断数据返回状态
  74. if fetch_dict is not None and len(fetch_dict) > 0:
  75. if fetch_dict.get("is_mark", 0) == 1 or fetch_dict.get("is_owned", 0) == 1:
  76. return ele_list
  77. return []