# -*- coding: utf-8 -*-
"""Extract maple-style math formulas embedded in Chinese problem text.

Entry point: :func:`formula_extract_lqy`. The helpers scan the text for the
next contiguous run of "formula" characters (:func:`find_serial_sm`), reject
runs that cannot be formulas (:func:`sm_filter`) and split a trailing
parenthesised condition off a formula (:func:`sm_correct`).
"""
import re

import regex as rex

from math_reg_sever.is_double_kuohao import is_double_kuohao

# Sentinel returned in place of a run that was found but rejected as "not a
# formula". It must be non-empty: an empty string tells the caller that the
# text is exhausted, which would stop the search for later formulas.
_REJECTED = "犇"

# Lines at least this long are skipped entirely (they are presumably
# uncleaned HTML rather than cleaned problem text).
_MAX_LINE_LEN = 1000

# Upper bound on the number of runs scanned per line (the original loop ran
# while a counter starting at 0 was <= 50, i.e. 51 iterations).
_MAX_SCANS = 51

# NOTE(review): several character classes below list ASCII punctuation twice
# (",,", ";;", "::", "..", "??"). That looks like full-width variants that
# were normalised away at some point; the classes are kept exactly as found.

# Any character that may appear inside a formula: everything except '#',
# sentence punctuation, enumeration markers and CJK characters.
_FORMULA_CHAR = re.compile("[^#::??。∵∴..,,;;、ⅠⅡⅢⅣⅤ①②③④⑤\u4e00-\u9fa5]")
# Separators and CJK characters; only accepted inside a run while
# is_double_kuohao() reports the run collected so far as falsy.
_SEPARATOR_OR_CJK = re.compile(r"[,,;;::\u4e00-\u9fa5]")
# A dot is accepted only directly after a digit (decimal point).
_DOT = re.compile(r"[..]")
_DIGIT = re.compile(r"\d")
# Characters a formula may start with.
# NOTE(review): "∠-△" is parsed as a character *range* (U+2220..U+25B3), not
# as the three literals "∠", "-", "△" — confirm that this is intended.
_FORMULA_START = re.compile(r"[({[<\da-zA-Z∠-△▵Δ∁ρθαβγφδεξλη|]")
# Punctuation trimmed from either end of an extracted run.
_EDGE_PUNCT = re.compile(r"[,,;;::.。.??]")
# Sub-question label "(1)".."(9)" at the very start of a run, followed by a
# character that is not an arithmetic operator.
_LEADING_LABEL = re.compile(r"(^\([1-9]\))[^+\-*/^].*?")
# Sub-question label at the very end of a run, right after a dot.
_TRAILING_LABEL = re.compile(r"[..](\([1-9]\))$")

# A final "(...)" group whose preceding character is neither an operator nor
# one of the letters s, g, t, f, F, G, h, H, T (so that calls such as cos(x),
# log[x](a), sqrt(x) or f(x) are not mistaken for a trailing condition).
_TRAILING_GROUP = re.compile(r".+[^\+\-\*/sgtfFGhHT=∈∉](\(.+?\))$")
# Content that marks a parenthesised group as a condition, not an argument.
_CONDITION_MARK = re.compile(r"[\u4e00-\u9fa5><=∈∉,]")

# Single-character normalisation applied before extraction: parenthesised
# digits become circled digits, full-width parentheses become ASCII ones.
_CHAR_NORMALIZATION = str.maketrans({
    "⑴": "①",
    "⑵": "②",
    "⑶": "③",
    "⑷": "④",
    "\uff08": "(",
    "\uff09": ")",
})
# ">=" is rewritten to ">" unless the ">" closes a "<...>" vector token such
# as "<AB>=3", where "=" is a real equals sign.
# NOTE(review): the pattern found in the file was the unterminated r"(?=",
# which raises on every call; the body of the lost negative lookbehind is
# reconstructed here — confirm against the vector notation actually used.
_GE_OUTSIDE_VECTOR = rex.compile(r"(?<!<[A-Za-z0-9]+)>=")
# A relation symbol followed by the Chinese word for "line" or "plane".
_LINE_OR_PLANE = re.compile(r"(//|⊂|⊥)(直线|平?面|底?面)")
_LINE_OR_PLANE_TAG = {
    "直线": "(line)",
    "平面": "(plane)",
    "底面": "(plane)",
    "面": "(plane)",
    "截面": "(plane)",
}


def _accepts_char(c, serial_sm):
    """Return True if character *c* may belong to the run *serial_sm*."""
    if _FORMULA_CHAR.match(c):
        return True
    if _SEPARATOR_OR_CJK.match(c) and not is_double_kuohao(serial_sm):
        return True
    return bool(_DOT.match(c) and serial_sm and _DIGIT.match(serial_sm[-1]))


def find_serial_sm(one_mp_item):
    """Find the first contiguous run of formula characters in a string.

    :param one_mp_item: str, cleaned problem text whose formulas are written
        in maple syntax, with whitespace already removed.
    :return: ``(serial_sm, serial_sm_start, serial_sm_end)``. ``serial_sm`` is
        the extracted run, the sentinel "犇" when a run was found but
        rejected by :func:`sm_filter`, or "" when the text contains no run
        at all (the indices are then both -1). The indices are inclusive
        positions in *one_mp_item*.
    """
    serial_sm = ""
    serial_sm_start = -1
    serial_sm_end = -1
    for i, c in enumerate(one_mp_item):
        if not _accepts_char(c, serial_sm):
            if serial_sm_start != -1:
                break  # the run has ended
            continue
        if serial_sm_start == -1:
            # Characters before the run are skipped until a valid start
            # character (letter, digit, opening bracket, ...) shows up.
            if _FORMULA_START.match(c):
                serial_sm_start = serial_sm_end = i
                serial_sm += c
        else:
            serial_sm_end = i
            serial_sm += c

    if serial_sm_start == -1:
        # Nothing found. Returning the sentinel here (sm_filter("") is
        # truthy) would make the caller rescan the same text until its
        # iteration cap is hit.
        return "", -1, -1

    if sm_filter(serial_sm):
        return _REJECTED, serial_sm_start, serial_sm_end

    # Strip a sub-question label such as "(1)" from the start, or from the
    # end when it follows a dot. Only that one occurrence is removed, so a
    # call such as f(1) inside the formula keeps its argument and the
    # indices stay consistent with the text.
    label = _LEADING_LABEL.search(serial_sm)
    if label:
        width = len(label.group(1))
        serial_sm = serial_sm[width:]
        serial_sm_start += width
    else:
        label = _TRAILING_LABEL.search(serial_sm)
        if label:
            width = len(label.group(1))
            serial_sm = serial_sm[:-width]
            serial_sm_end -= width

    trimmed = False
    if serial_sm and _EDGE_PUNCT.match(serial_sm[-1]):
        serial_sm = serial_sm[:-1]
        serial_sm_end -= 1
        trimmed = True
    if serial_sm and _EDGE_PUNCT.match(serial_sm[0]):
        serial_sm = serial_sm[1:]
        serial_sm_start += 1
        trimmed = True
    # Trimming may have left something that is no longer a formula.
    if trimmed and sm_filter(serial_sm):
        serial_sm = _REJECTED
    return sm_correct(serial_sm, serial_sm_start, serial_sm_end)


def sm_correct(serial_sm, serial_sm_start, serial_sm_end):
    """Split a trailing parenthesised condition off an extracted formula.

    A formula immediately followed by a condition is extracted as one run,
    e.g. ``f(x)=2^x-a*x+7(a>0或a<-1)``. When the run ends in a "(...)" group
    that contains a CJK character, a comparison/membership operator or a
    comma, the run is cut in front of that group so the condition can be
    extracted on its own in the next scan.

    Known limitation (unresolved): unbalanced brackets inside a formula can
    make the extracted run extend to the end of the text.

    :param serial_sm: str, the extracted run.
    :param serial_sm_start: int, inclusive start index of the run.
    :param serial_sm_end: int, inclusive end index of the run.
    :return: ``(serial_sm, serial_sm_start, serial_sm_end)`` with the run
        shortened and the end index moved back when a condition was split
        off; otherwise the arguments unchanged.
    """
    ss = _TRAILING_GROUP.search(serial_sm)
    if not ss or not _CONDITION_MARK.search(ss.group(1)):
        return serial_sm, serial_sm_start, serial_sm_end

    # Prefer cutting between two adjacent groups ")(" (the regex group may
    # span more than the last pair of parentheses); otherwise cut at the
    # last "(".
    left_kuo = serial_sm.rfind(")(")
    if left_kuo == -1:
        left_kuo = serial_sm.rfind("(")
    else:
        left_kuo += 1
    if not is_double_kuohao(serial_sm[:left_kuo]):
        # The part that would be kept is rejected by is_double_kuohao
        # (presumably: unbalanced brackets), so cut at the previous "(".
        # If there is none, keep the current cut: rfind's -1 is not a
        # usable slice position.
        earlier = serial_sm.rfind("(", 0, left_kuo)
        if earlier != -1:
            left_kuo = earlier
    serial_sm_end -= len(serial_sm) - left_kuo
    return serial_sm[:left_kuo], serial_sm_start, serial_sm_end


def sm_filter(serial_sm):
    """Tell whether an extracted run cannot be a math formula.

    :param serial_sm: str, the extracted run to check.
    :return: True if the run should be discarded, False if it should be
        kept as a formula.
    """
    if len(serial_sm) == 1:
        # A single character only counts when it is a lower-case letter, a
        # digit or one of the listed Greek letters.
        return bool(re.match(r"[^ραβγφδεξληθa-z\d]", serial_sm))
    # Starts with a parenthesised group made of CJK text and punctuation.
    if re.match(r"\([\u4e00-\u9fa5,,。??]+\)", serial_sm):
        return True
    # More than 20 CJK characters.
    if len(re.findall(r"[\u4e00-\u9fa5]", serial_sm)) > 20:
        return True
    # A bare sub-question label "(1)".."(5)".
    if re.match(r"^\([1-5]\)$", serial_sm):
        return True
    # Image placeholder.
    if serial_sm == "img":
        return True
    if re.match(r"\(.+?\)", serial_sm):
        # Starts with a parenthesised group: discard when the text between
        # the first and last character contains CJK text or a sentence
        # mark, e.g. "(a是2018年的产量)". No later rule is applied.
        inner = serial_sm[1:-1]
        return bool(re.search(r"[\u4e00-\u9fa5。??]", inner)
                    and re.search(r"[^><=]", inner))
    # CJK text right after "[" or right before "(".
    if re.search(r"\[[\u4e00-\u9fa5]", serial_sm) or \
            re.search(r"[\u4e00-\u9fa5]\(", serial_sm):
        return True
    # No digit and no ASCII letter at all.
    return not re.search(r"[\da-zA-Z]", serial_sm)


def _tag_line_or_plane(m):
    """Replace the word after a relation symbol with "(line)"/"(plane)"."""
    return m.group(1) + _LINE_OR_PLANE_TAG[m.group(2)]


def formula_extract_lqy(line):
    """Extract the formulas contained in one line of problem text.

    The text is normalised first: whitespace is removed, "⑴".."⑷" become
    "①".."④", full-width parentheses become ASCII ones, "<=" becomes "<",
    ">=" becomes ">" (except right after a "<...>" vector token) and the
    word for line/plane after "//", "⊂" or "⊥" is replaced by "(line)" /
    "(plane)" so that e.g. "AB//平面ABC" is extracted as a single formula.
    Formulas are therefore returned in this normalised form.

    :param line: the problem text; converted with ``str()``.
    :return: list of extracted formula strings in order of appearance.
        Empty when the text has 1000 characters or more.
    """
    res = []
    line = str(line)
    # NOTE(review): the file's indentation was lost; the whole extraction
    # (not only the normalisation) is assumed to be guarded by this length
    # check — confirm.
    if len(line) >= _MAX_LINE_LEN:
        return res

    line = re.sub(r"\s", "", line)
    line = line.translate(_CHAR_NORMALIZATION).replace("<=", "<")
    line = _GE_OUTSIDE_VECTOR.sub(">", line)
    line = _LINE_OR_PLANE.sub(_tag_line_or_plane, line)

    remaining = line
    for _ in range(_MAX_SCANS):
        serial_sm, _start, serial_sm_end = find_serial_sm(remaining)
        if not serial_sm:
            break
        if serial_sm != _REJECTED:
            res.append(serial_sm)
        # Continue right after the run that was just consumed.
        remaining = remaining[serial_sm_end + 1:]
    return res


if __name__ == "__main__":
    # Sample inputs collected while developing the extractor; only the last
    # one is run.
    samples = [
        " 已知点O为原点,点A,B的坐标分别为(a,0)和(0,a),其中a∈(0,+∞),点P在AB上且=t(0<=t<=1),则的最大值为",
        "求下列函数的定义域: (1)y=(1/(log[3](3*x-2)));(2)y=log[a](2-x),(a>0,且a<>1);(3)y=log[a](1-x)^2,(a>0,且a<>1)",
        "已知全集U=A∪B={x∈N|0<=x<=10} ,A∩(∁UB)={1,3,5,7} ,试求集合B .",
        "A={(x,y)|x+y<=1,x∈N,y∈N} ,则x=0 或1 ,y=0 或1 ,所以A={(0,0),(0,1),(1,0)} ,元素的个数是3 故选C .",
        "集合A={(x,y)|x+y<=1,x∈N,y∈N} 中元素的个数是(  )",
        "范围是2<=x<=8.(1)则x是正整数",
        "函数f(x)={x-2,x<2;f(x-1),x>=2} 则f(2)= (\u3000\u3000).",
        " 二次函数y=a*x^2+b*x+c 的零点分别为-2,3 ,若x∈(-2,3) 时,f(x)<0 且f(-6)=36 ,则二次函数f(x) 的解析式为img .",
        " 由AB=3 ,y) 则由点N是△ABC内部或边上一点可得,{0<=x<=3;0<=y<=4;4*x+3*y-12<=0} 则=(-2.x).=(α.b).||=((12)/5) 由AM⊥BC于M可知=0|=((12)/5)| 可得b=((36)/(25)).α=((48)/(25)) 令Z==((48*x+36*y)/(25)) ,从而转化为线性规划问题,求目标函数Z在平面区域△ABC内的最大值利用线性规划知识可得当过边界BC时将取得最大值,此时Z=((144)/(25)) 故选 A",
        " 由题意{n*p=(5/2);n*p(1-p)=(5/4)⇒p=(1/2),n=5 ,所以P(X=1)=_^(C)p(1-p)^4=5*(1/(32))=(5/(32)) .",
        "O 、A 、B 是平面上的三点",
        "求以下两点间的距离: (1)(4,5,6) ,(-7,3,11) ; (2)(1,2,2) ,(4,6,14) ; (3((1/3),(2/3),(2/3)),(-(1/3),-(2/3),(2/3)) ; (4)((1/3),(3/4),(4/5)),((5/6),(2/3),(3/(10))) .",
        "矩形,PA⊥ 平面ABCD ,PB 、PD 与平面ABCD 所成角的正切值依次是1 、(1/2) ,AP=2 ,E 、F 依次是PB 、PC 的中点. (1)求证:PB⊥ 平面AEFD ; (2)求直线EC 与平面PAD 所成角的正弦值.",
        "已知▵ABC 中,=,=,= ,若=,+=0 ,则▵ABC 的形状为(  )",
        "化简下列各式:(1)sqrt(3-2*sqrt(2))+root[3]((1-sqrt(2))^3)+root[4]((1-sqrt(2))^4);(2)sqrt(4*x^2+12*x+9)+sqrt(4*x^2-20*x+25)(-(3/2)<=x<=(5/2)).",
        "根据《电动自行车通用技术条件》(GB17761)标准规定,电动自行车的最高时速应不大于20km/h,整车质量应不大于40kg,假设一成年人骑着电动自行车在平直的公路上按上述标准快速行驶时所受阻力是总重量的0.05倍,则电动车电机的输出功率最接近于(  )",
    ]
    line = samples[-1]
    print(formula_extract_lqy(line))