# formula_process.py
import json
import pickle
import re

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

import config
  7. # 公式生成
  8. def formula_generate(fstr, formula_list):
  9. # 对'l'和'1'进行特殊处理
  10. fstr = fstr.replace('l0', '10') if 'l0' in fstr else fstr
  11. # 排除"1)"、"10安"
  12. if re.search(r"[+\-·⋅\*/=<>]", fstr) is None or \
  13. re.search(r"[不的为是由选在约到和或与时连接于做天表日其]", fstr) or \
  14. re.search(r"^[\d\-].*?[\u4e00-\u9fff]", fstr) or \
  15. re.search(r"^\d+\)", fstr) or re.search(r"^[a-z]\)", fstr) or fstr[-1] == '-':
  16. return '', formula_list
  17. # "()"内容特殊处理
  18. bstr_list = re.findall(r"\(.*?\)", fstr)
  19. for bstr in bstr_list:
  20. if len(re.findall(r"[\u4e00-\u9fff]", bstr[1:-1])) > 2:
  21. fstr = fstr.replace(bstr, '')
  22. # 公式判断处理
  23. # '.'单独处理
  24. fstr = fstr[:-1] if fstr[-1] == '.' else fstr
  25. r_fstr = re.sub(r"[\.·⋅x\*/\)\'\^]", '', fstr)
  26. if len(r_fstr) > 2:
  27. # 排除10N/kg, 10km/h之类公式
  28. if re.sub(r"^\d+[A-Za-z°%μΩθδεπαφ∅∮βωλγνυηωс∁]+\d*$", '', r_fstr) == '':
  29. pass
  30. # 排除数字公式
  31. elif r_fstr.isdigit() or re.sub(r"^-", '', fstr).isdigit():
  32. pass
  33. else:
  34. # 判断左右括号数量进行特殊处理
  35. lb, rb = 0, 0
  36. for c in fstr:
  37. if c == '(': lb += 1
  38. elif c == ')': rb += 1
  39. if rb > lb and fstr[-1] == ')':
  40. fstr = fstr[:-1]
  41. elif rb > lb and fstr[-1] != ')':
  42. fstr = '(' + fstr
  43. formula_list.append(fstr.lower())
  44. return '', formula_list
  45. # 公式识别
  46. def formula_recognize(formula_string):
  47. formula_list = []
  48. fstr = ''
  49. brackets_flag = False
  50. formula_length = len(formula_string)
  51. for i,char in enumerate(formula_string):
  52. # 获取公式首字符
  53. if fstr == '' and re.search(r"[A-Za-z\dρμθδεαφ∅∮βωλγυνΩηωс∁]", char):
  54. fstr += char
  55. elif fstr != '':
  56. # 对括号进行特殊处理
  57. if brackets_flag == False and char == '(':
  58. bracket_char = fstr[-1]
  59. brackets_flag = True
  60. elif brackets_flag == True and char == ')':
  61. brackets_flag = False
  62. # "()"内容特殊处理
  63. bstr_list = re.findall(r"\(.*?\)", fstr+')')
  64. if len(bstr_list) > 0 and bstr_list[-1][1:-1].isdigit() and \
  65. bracket_char.isalpha() is False:
  66. fstr = fstr.replace(bstr_list[-1][:-1], '')
  67. fstr, formula_list = formula_generate(fstr, formula_list)
  68. continue
  69. # 括号要特殊处理
  70. if brackets_flag == True:
  71. fstr += char
  72. continue
  73. # 对公式尾字符进行判断
  74. if i == formula_length - 1:
  75. if re.search(r"[^\.,;、!?:#\'\"_]", char):
  76. fstr += char
  77. fstr, formula_list = formula_generate(fstr, formula_list)
  78. # 若当前字符为标点符号, 则判定公式识别结束
  79. elif re.search(r"[,;、!?:#]", char):
  80. fstr, formula_list = formula_generate(fstr, formula_list)
  81. # 对'.'进行特殊处理, 若'.'之前或之后非数字, 则判定公式识别结束
  82. elif char == '.' and re.sub(r"^\d\.\d$", '', formula_string[i-1:i+2]) != '':
  83. fstr, formula_list = formula_generate(fstr, formula_list)
  84. # 若当前字符为汉字后续字符为数字或当前字符为数字后续字符为汉字, 则判定公式识别结束
  85. elif re.search(r"^([\u4e00-\u9fff]\d|\d[\u4e00-\u9fff])", formula_string[i:i+2]):
  86. fstr, formula_list = formula_generate(fstr, formula_list)
  87. # 若当前字符为加减乘除等于符号, 后一个字符为汉字或标点符号, 则判定公式识别结束
  88. elif re.search(r"[\+\-·⋅x\*/=<>≈∽∼]", char) and \
  89. re.search(r"[\u4e00-\u9fff\.,;、!?:#\'\"]", formula_string[i+1:i+2]):
  90. fstr, formula_list = formula_generate(fstr, formula_list)
  91. # 若当前字符为汉字后续两个字符为非加减乘除等于符号, 则判定公式识别结束
  92. # re.sub(r"^[\u4e00-\u9fff][\u4e00-\u9fff\.,;、!?:#\'\"]*", '', formula_string[i:i+3]) == ''
  93. elif re.sub(r"^[\u4e00-\u9fff][^A-Za-z\+\-·⋅x\*/=<>≈∽∼]*", '', formula_string[i:i+3]) == '':
  94. if re.search(r"[甲乙丙丁有总额放吸电动浮排液物实]", char):
  95. fstr += char
  96. fstr, formula_list = formula_generate(fstr, formula_list)
  97. else:
  98. fstr += char
  99. return list(set(formula_list))
  100. # mongodb公式处理初始化
  101. def formula_init():
  102. # 获取mongodb数据
  103. origin_dataset = config.mongo_coll.find(no_cursor_timeout=True, batch_size=5)
  104. # 将公式组合成字典形式{"f=ma": [123, 456]}
  105. formula_dict = dict()
  106. for data in origin_dataset:
  107. if "content_clear" not in data:
  108. continue
  109. formula_list = formula_recognize(data["content_clear"])
  110. if len(formula_list) == 0:
  111. continue
  112. for formula in formula_list:
  113. if formula not in formula_dict:
  114. formula_dict[formula] = [data["id"]]
  115. else:
  116. formula_dict[formula].append(data["id"])
  117. # 将公式字典转化为列表形式用于通过索引获取元素
  118. formula_id_list = [[formula, id_list] for formula,id_list in formula_dict.items()]
  119. # 词袋模型初始化
  120. bow_model = CountVectorizer(token_pattern=r"[^ ]")
  121. all_formula_list = [ele[0] for ele in formula_id_list]
  122. bow_vector = bow_model.fit_transform(all_formula_list)
  123. # 保存词袋模型数据
  124. with open(config.bow_model_path, "wb") as bm:
  125. pickle.dump(bow_model, bm)
  126. np.save(config.bow_vector_path, bow_vector.toarray().astype("float32"))
  127. with open(config.formula_data_path, 'w', encoding='utf8', errors='ignore') as f:
  128. json.dump(formula_id_list, f, ensure_ascii=False)
  129. if __name__ == "__main__":
  130. # text = '已知c水=4.2×103J/(kg·℃),求-10'
  131. # text = "水的密度:ρ=1.0×10kg/m3是"
  132. # text = "'金宜高速'是经过河池市政府所在地的第一条高速公路,王华一家从金城江到刘三姐故里宜州自驾旅行,单程共消耗汽油5*kg.小车总重为1.5*10^4*N,静止在水平地面上时轮子与地面接触的总面积为0.15*m^2,(汽油的热值q=4.6*10^7J/kg).求:(1)小车静止时对水平地面的压强;(2)5*kg汽油完全燃烧放出的热量,"
  133. # text = "p蜡=0.9*10^3Kq/m^3"
  134. # text = "在一个案件中,公安人员在海滩案发现场发现了罪犯留下的清晰的双脚站立脚印,立即用蜡浇灌了一只鞋模.测量鞋模的平均厚度为3*cm,质量675*g,又经测试达到脚印同样深度的压强为1.5*10^4*Pa,请你帮助公安人员计算出罪犯的体重为多少?(ρ蜡=0.9*10^3*kg/m^3)"
  135. # for i in range(1):
  136. # print(formula_recognize(text))
  137. # mongodb公式处理初始化
  138. formula_init()