#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Formula extraction and consolidation ==> wrap in $$
"""
import re
# Character class for the FIRST character of a formula candidate.
# NOTE(review): inside both classes the sequence `+-≈` parses as a range from
# '+' to '≈', and several characters appear twice (e.g. `,,`, `..`, `((`).
# Presumably the upstream file used full-width variants that were normalised
# away in this copy -- confirm against the original source.
_EQ_HEAD_CLASS = (
    r"[\da-zA-Z!\"'(*+\-\[{|#%\\><≤≥≡≦≧+-≈≠﹢﹣±㏒㏑∑∏><==×÷(△∆⊙⌒∈∩∉∪⊕∥∣≌∽∧⊥∫∬∮∯∅≮≯∁Ω→°•"
    r"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨαβγδεζνξοπρσηθικλτυφχψωμGgmst]"
)
# Character class for every FOLLOWING character of a formula candidate.
_EQ_BODY_CLASS = (
    r"[\da-zA-Z!\"&'()*+,,\-../:;?\[\]{}|#%~^_`、、\\><≤≥≡≦≧+-≈≠﹢﹣±㏒㏑∑∏><==×÷()△∆⊙⌒∈∩∉∪⊕∥∣≌∽∧⊥∫∬∮∯∅≮≯∁Ω→°•′~·"
    r"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨαβγδεζνξοπρσηθικλτυφχψωμGgmst/∶∝]"
)
# A lazily matched run of CJK characters (optionally preceded by whitespace).
_EQ_CJK_RUN = r"\s*[\u4e00-\u9fa5][\u4e00-\u9fa5、、\s]*?"

# Candidate finder used when the text contains no CJK character.
_EQ_PLAIN_RE = re.compile(
    "(" + _EQ_HEAD_CLASS + "(" + _EQ_BODY_CLASS + r"|\s|\n)+)"
)
# Candidate finder used when the text contains CJK: CJK runs may appear
# after the first character of a candidate.
_EQ_WITH_CJK_RE = re.compile(
    "(" + _EQ_HEAD_CLASS + "(" + _EQ_CJK_RUN + "|" + _EQ_BODY_CLASS + r"|\s|\n)+)"
)
# The original two-branch test (bare CJK run | \text{CJK run}) succeeds exactly
# when at least one character in U+4E00..U+9FA5 is present, because both
# branches require such a character.
_EQ_HAS_CJK_RE = re.compile(r"[\u4e00-\u9fa5]")
# Punctuation / whitespace; a candidate made only of these is discarded.
_EQ_NOISE_RE = re.compile(r"([!\"&'()*+,,\-../:;?\[\]{}|#%~^_`、、]|\s|\n)")


def _eq_join_options(match):
    """Rewrite a comma-separated capital-letter list (``A,B,C``) with ``、``."""
    return match.group(1).replace(",", "、").replace(",", "、")


# Ordered clean-up steps applied to every candidate: (pattern, replacement).
# The order is significant -- later steps see the output of earlier ones.
_EQ_CLEANUP_STEPS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Leading question number + 、 / a bare "N." / a leading "(digits)".
        (r"^([1-9]|[1-5][0-9])\s*[、、]|^([1-9]|[1-5][0-9])\s*[..]$|^[))]?\s*[((][\s\d]+[))]", ""),
        # Whole string is a question number, separator and optional "(digits".
        (r"^([1-9]|[1-5][0-9])\s*[、、..]\s*([((]\d*)?$", ""),
        # Leading question number followed by "(digits)".
        (r"^([1-9]|[1-5][0-9])\s*[、、..]\s*[((][\s\d]+[))]", ""),
        # Trailing opening brackets / separators.
        (r"[(\[{、、..,,(∶]+$", ""),
        # Whole string is a URL.
        (r"^https?://.+?$|^www\..+?$", ""),
        # Leading "(word characters)" group.
        (r"^[))]?\s*[((][\s\w]*[))]", ""),
        # Leading "digits)" or a whole-string "(digits".
        (r"^\d+[))]|^[((]\d+$", ""),
        # Ideographic spaces at either end are removed, inner ones become ' '.
        (r"(\u3000)+\s*$|^\s*(\u3000)+", ""),
        (r"\u3000", " "),
        # Whole string is "N. ABC" (number plus 1-3 capitals).
        (r"^\s*([1-9]|[1-5][0-9])\s*[..、、]\s*[A-Z]{1,3}\s*$", ""),
        # Leading "N." before a non-digit: keep only the non-digit.
        (r"^\s*([1-9]|[1-5][0-9])\s*[..、、]\s*([^\d])", r"\2"),
        # Whole string is "letter-letter".
        (r"^\s*[a-z][--][a-z]\s*$", ""),
        # Whole string is a single tag such as "<br/>".
        (r"^\s*<[a-z]+/?>\s*$", ""),
        # Unbalanced closing bracket at the end: keep the content.  Exactly one
        # of the three groups participates, so all three must be referenced;
        # the original used r"\1" only, which erased the candidate whenever
        # the "}" or "]" alternative matched.
        (r"^([^((]+?)[))]$|^([^{]+?)\}$|^([^\[]+?)\]$", r"\1\2\3"),
        # Unbalanced opening bracket at the start: same fix as above.
        (r"^[((]([^))]+?$)|^{([^\}]+?$)|^\[([^\]]+?$)", r"\1\2\3"),
        # "A,B,C" option lists get their commas replaced by 、.
        (r"^(([A-Z]\s*[,,])+[A-Z])$", _eq_join_options),
        # Whole string is a bare number (optionally "digits·digits").
        (r"^(\d+|\d+·\d+)$", ""),
        # Whole string is an option letter, separator and one character.
        (r"^[A-E]\s*[、、..,,]\s*[A-Za-z\d]$", ""),
        # Trailing ", X" option letter.
        (r"[,,]\s*[A-F]$", ""),
    )
)


def get_equation_str(item):
    """Extract formula-like substrings from *item* and clean each one up.

    The input is converted with ``str()`` and scanned with a character-class
    regex; a variant that additionally accepts runs of CJK characters is used
    when the text contains any character in U+4E00..U+9FA5.  Every hit is then
    passed through an ordered list of regex clean-ups (question numbers,
    bracketed indices, URLs, unbalanced brackets, bare numbers, option
    letters, ...).  Hits consisting only of punctuation/whitespace, and hits
    shorter than two characters after cleaning, are dropped.

    Notes kept from the original author:
    1. Formula strings may be mixed with answer options; in that case it is
       better to extract the options first and the formulas afterwards.
    2. Strip the question number first.
    3. Commas need a further check to decide whether to keep splitting.
    4. "Therefore choose: A-F" should preferably not be treated as a formula.

    Side effect: the raw candidate list is printed to stdout.

    :param item: text to scan; any object is accepted, it is passed to str().
    :return: list of cleaned formula strings, in order of appearance.
    """
    text = str(item)
    # NOTE(review): in this copy both substitutions replace a character with
    # itself; presumably they decoded HTML entities upstream -- confirm.
    text = re.sub(r">", '>', text)
    text = re.sub(r"<", '<', text)

    finder = _EQ_WITH_CJK_RE if _EQ_HAS_CJK_RE.search(text) else _EQ_PLAIN_RE
    # findall yields (whole candidate, last repeated piece); keep the former.
    all_equa = [groups[0] for groups in finder.findall(text)]
    print("all_equa:", all_equa)

    right_equa = []
    for candidate in all_equa:
        # Skip candidates that contain nothing but punctuation/whitespace.
        if not _EQ_NOISE_RE.sub("", candidate):
            continue
        for step_re, replacement in _EQ_CLEANUP_STEPS:
            candidate = step_re.sub(replacement, candidate)
        # NOTE(review): every replace below maps a character to itself in this
        # copy (presumably full-width -> ASCII upstream); kept as found.
        candidate = candidate.replace("G", "G").replace("g", "g").replace("m", "m").replace("s", "s").replace("/", "/").replace("=", "=")\
            .replace("2", "2").replace("F", "F").replace("t", "t").strip()
        # Single characters are not considered formulas.
        if len(candidate) > 1:
            right_equa.append(candidate)
    return right_equa
def get_equation_instr(item):
"""
先结构化,再提取公式渲染比较好!!!# 公式字符串提取再补上$$
:param item:
:return:
"""
# print('///////////////原始:', item)
item = item.replace("(", "(").replace(")", ")")
item = re.sub(r"图(\d{1,2}[--]\d{1,2})", "图&"+r"\1", item)
item = re.sub(r">", '>', str(item))
item = re.sub(r"<", '<', item)
item = re.sub(r"\\lt br/?>", '
', item)
item = re.sub(r"\s{2,}()", r'\1', item)
item = re.sub(r"()\s{2,}", r'\1', item)
# pattern1 = re.compile(r"\s*[\u4e00-\u9fa5][\u4e00-\u9fa5、、\s]*?|\\text\s*{\s*[\u4e00-\u9fa5][\u4e00-\u9fa5、、\s]*?\}")
pattern1 = re.compile("([^\u4e00-\u9fa5;,、:①②③④\s”《》【】“))\]\}](\s*[\u4e00-\u9fa5][\u4e00-\u9fa5、、\s]*?"
"|\\text\s*{\s*[\u4e00-\u9fa5][\u4e00-\u9fa5、、\s]*?\}|[_^{]\s*[\u4e00-\u9fa5][\u4e00-\u9fa5、、\s]*?"
"|[^\u4e00-\u9fa5;“”。?①②③④]|\n)*)") # 不以xxx开头,不包含xxx
all_equa = [y[0] for y in re.findall(pattern1, str(item))]
# -------------按某些格式(如图片)将初步提取的公式进一步拆分-------------------
new_all_equa = []
for equa in all_equa:
table_equa = re.findall("(.*?) | ", equa) # 表格内的公式
if table_equa:
for ta in table_equa: # 不包含图片,data-latex中肯定含有$
new_all_equa.extend([e for e in re.split(r'|', ta) if e])
else:
equa = re.split(r'||\.{7,}|=__{2,}' #
r'|
\s*[((]?\s*[1-9]\s*[))]\s*(?![+\-*/])|\s{2,}(?![+\-*/])|[((]\s*[123456]\s*[))]', equa) # ①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳
new_all_equa.extend([e for e in equa if e])
print("初步提取new_all_equa:", all_equa)
# --------------将需要更改的公式提取出来,映射----------------------------------
right_equa = {}
for i in new_all_equa:
raw_eqn = i
i = re.sub(r'', "", i)
# 只是img公式的,不需要
if not re.sub("[((]\s*\d{1,2}\s*[))]|[${}()()。①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳【】]"
"|?t[hdr]>|
|?table>|?html>|?t?body>|?p>", "", i).strip():
continue
# 简单公式,清洗一下
# if re.match(r"\d{1,4}[.,;]*$|[A-Z]{1,2}[.,;]*$|[a-z]{1,2}[.,;]*$|(\\rm|\\mathrm)\s*[A-Z]{1,3}[\s.,;]*$"
# r"|(\\rm|\\mathrm)\s*[A-Z][\s.,;]*[A-Z][\s.,;]*$|[A-G]\s*[、,,]\s*[A-G]$|[()\[\]{}\s\\,;.!…\d]+$",
# re.sub("[${}()()]", "", i).strip()):
# # i = re.sub(r"[${}]|\\rm|\\mathrm", "", raw_eqn).strip()
# # i = re.sub(r"\\,", " ", i)
# # if i != raw_eqn.strip():
# # item = item.replace(raw_eqn, i, 1)
# right_equa[raw_eqn] = "${}$".format(i)
# print("{}----变1----${}$".format(raw_eqn, i))
# else:
pattern2 = re.compile("([!\"“”&'()()*+,,\-..。/::;?\[\]{}|#%~^_`、、]|\s|\n|\u3000)")
if re.sub(pattern2, "", i):
i = re.sub(r'', r"\1", raw_eqn)
# 结合最开始的替换操作
i = re.sub(r"^&.+?$", "", i)
i = re.sub(r"^\.+", "", i)
i = re.sub("^([!&),,..::;。?\]}#%、、]|\s|\n)", "", i)
i = re.sub("^([1-9]|[1-5][0-9])\s*[、、]|^([1-9]|[1-5][0-9])\s*[..]$|^[))]?\s*[((][\s\d]+[))]", "", i)
i = re.sub("^([1-9]|[1-5][0-9])\s*[、、..]\s*([((]\d*)?$", "", i)
i = re.sub("^([1-9]|[1-5][0-9])\s*[、、..]\s*[((][\s\d]+[))]", "", i)
i = re.sub("^https?://.+?$|^www\..+?$", "", i)
i = re.sub("^[))]?\s*[((][\s\w]*[))]", "", i)
i = re.sub("^([^((]*?)[))]", r"\1", i)
i = re.sub("^\d+[))]|^[((]\d+$", "", i)
i = re.sub("(\u3000)+\s*$|^\s*(\u3000)+", "", i)
i = re.sub(r"^\s*([1-9]|[1-5][0-9])\s*[..、、]\s*[A-Z]{1,3}\s*$", "", i)
i = re.sub(r"^\s*([1-9]|[1-5][0-9])\s*[..、、]\s*([^\d])", r"\2", i)
# i = re.sub(r"^\s*[a-z][--][a-z]\s*$", '', i) # 'v-t', 'x-t'
i = re.sub(r"^\s*\d{1,2}[--]\d{1,2}\s*$", '', i)
i = re.sub(r"^\s*<[a-z]+/?>\s*$", '', i)
i = re.sub(r"^([^((]+?)[))]$|^([^{]+?)\}$|^([^\[]+?)\]$", r'\1\2\3', i)
i = re.sub(r"^[((]([^))]+?$)|^{([^\}]+?$)|^\[([^\]]+?$)", r'\1\2\3', i)
i = re.sub(r"[,,]\s*[A-F]$", "", i)
i = re.sub(r"[(\[{、、..,,(∶”?】【]+$|(
)+$", "", i)
i = re.sub(r"=[\s_]*?$", "", i)
i = re.sub(r"^__[\s_]*?([^\s_]+?)$", r"\1", i)
i = re.sub(r"(
|\n)+?\s*$|^\s*(
|\n)+?", "", i)
i = re.sub("^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]*?([^①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]+?)"
"[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]*?$", r"\1", i.strip())
raw_eqn = i
i = re.sub("\u3000", " ", i)
# def deal(yy):
# return yy.group(1).replace(",", "、").replace(",", "、")
# i = re.sub(r"^(([A-Z]\s*[,,])+[A-Z])$", deal, i)
# # 不渲染
# i = re.sub(r"^(([A-Z]\s*[,,、、])+[A-Z])$", "", i) # A、B、C、D 不渲染
# i = re.sub(r"^[A-Za-z]{2}$", "", i) # 不渲染
# i = re.sub(r"^(\d+|\d+·\d+)$", "", i)
# i = re.sub(r"^[A-E]\s*[、、..,,]\s*[A-Za-z\d]$", "", i)
sub_dd = {
"F": "F",
"G": "G",
"g": "g",
"m": "m",
"N": "N",
"s": "s",
"t": "t",
"/": "/",
"=": "=",
"-": "-",
"2": "2",
"′": "'",
# "°": "^\ciro", # \\cir后面可以随便加字母或数字
}
i = re.sub("|".join(sub_dd.keys()), lambda x: sub_dd[x.group()], i)
# i = i.replace("G", "G").replace("g", "g").replace("m", "m").replace("s", "s").replace("/", "/").replace("=", "=") \
# .replace("2", "2").replace("F", "F").replace("t", "t").replace("N","N").replace("-", "-").strip()
# 个别不符latex格式的公式 处理
def deal2(yy):
new_y = yy.group(2)
if yy.group(1) == "":
new_y = "_{" + yy.group(2) + "}"
if yy.group(1) == "":
new_y = "^{" + yy.group(2) + "}"
return re.sub("([\u4e00-\u9fa5]+)", r"\\text{\1}", new_y)
# i = re.sub("()(.+?)", deal2, i)
# i = re.sub("()(.+?)", deal2, i).strip()
recur_n = 0
while re.search(r"?su[bp]>", i):
recur_n += 1
i = re.sub("()(((?!?sub>).)+?)", deal2, i)
i = re.sub("()(((?!?sup>).)+?)", deal2, i)
if recur_n > 5:
break
i = re.sub(r"\^{(.*?)\}\^{(.*?)\}", r"^{\1^{\2}}", i)
if len(i) > 0: # 单个字母或数字也渲染
new_eq = re.sub("^\$(\s+)(.+?)(?right_equa:",right_equa)
# 图片替换
all_image = re.findall(r'', item)
subs2src = {}
for j, img in enumerate(all_image):
item = item.replace(img, "【&img{}&】".format(j))
subs2src["【&img{}&】".format(j)] = img
temp_map = {}
for n, eq in enumerate(right_equa):
if eq[0] not in ["e1", "e2","e3", "e4", "e5", "e6", "e7", "e8", "e9"]:
if eq[0] not in "http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/zyk/uploadfiles/wording/new_image59.png"\
or len(eq[0]) < 3:
if re.search("^(\d+|[img]+)$", eq[0]) is None:
item = item.replace(eq[0], "#&{}".format(n))
# else:
# item = re.sub("(?\s*|^\n*)([\dA-Za-z][A-Za-z\d\s]*)([\u4e00-\u9fa5]|
"""
tt2 = 'Ag+、K+、NO、C1-'
tt3 = 'Mg2+、Na+、Cl-、SO'
tt4 = 'NH、Cu2+、OH一、Cl—'
tt5 = 'H+、Na+、HCO、SO'
tt6 = '向MgSO4和Al2(SO4)3的混合溶液中,逐滴加入NaOH溶液。下列图像中, 能正确表示上述反应的是(横坐标表示加入NaOH溶液的体积,纵坐标表示反应生成沉淀的质量) ( )
【未识别图片】'
tt7 = '把一定量铁粉放入氯化铁溶液中,完全反应后,所得溶液中Fe和Fe的浓度恰好相等.则已反应的Fe和未反应的Fe的物质的量之比为( )'
tt8 = '下列物质中既能跟稀H2SO4反应,又能跟氢氧化钠溶液反应的是( )。
①NaHCO3 ②Al2O3 ③Al(OH)3 ④Al'
tt9 = 'M2O 7 x-与S2- 在酸性溶液中发生如下反应:M2O7x-+3S2-+14H+=2M3++3S↓+7H2O
则M2O7x- 中M 的化合价为( )'
tt10 = 'AlCl3溶液中加入足量的氨水:Al+3OH═Al(OH)3↓'
tt11 = 'FeCl2溶液跟Cl2反应:Fe2++Cl2=Fe3++2Cl-'
tt12 = '实验室中需要配制2 mol/L的NaCl溶液950 mL,配制时应选用的容量瓶的规格和称取的NaCl的质量分别是( )'
tt13 = '现有下列10种物质:①H2O ②空气 ③Al ④明矾 ⑤H2SO4 ⑥烧碱 ⑦CuSO4•5H2O ⑧碘酒 ⑨C2H5OH ⑩ 纯碱(1)属于碱的是 ;属于盐的是 ;属于电解质的是 (填写序号)。
(2)④明矾的化学式为 ,向④的溶液中滴加过量⑥溶液反应离子方程式为
(3)⑤的溶液与⑩的溶液反应离子方程式为'
tt14 = '(17分)(1)Fe2O3+6H+═2Fe3++3H2O;
(2)NaOH溶液;
(3)Fe+2FeCl3═3FeCl2;
(4)4Fe(OH)2+O2+2H2O═4Fe(OH)3;(5)氢氧化铁胶体 粒子直径大小 生成白色沉淀,迅速变成灰绿色,最终变成红褐色;
1718.17(1)题中通入CO2后所得溶液的溶质成分为Na2CO3和NaHCO3,若将所得溶质在低温低压条件下蒸干,所得固体物质(不带结晶水)的质量为13.7g.
(2)原NaOH溶液的物质的量浓度为2mol/L;