|
@@ -1,5 +1,4 @@
|
|
|
import json
|
|
|
-import numpy as np
|
|
|
import pandas as pd
|
|
|
|
|
|
keyword2id_dict = dict()
|
|
@@ -19,90 +18,75 @@ for i in range(len(df)):
|
|
|
quantity2id[knowledge] = sign_index
|
|
|
keyword2id_dict["quantity2id"] = quantity2id
|
|
|
|
|
|
-# # 物理场景
|
|
|
-# excel_path = r"data/物理情景.xlsx"
|
|
|
+# # 风向标-知识点
|
|
|
+# excel_path = r"data/物理知识点.xlsx"
|
|
|
# df = pd.read_excel(excel_path)
|
|
|
-# scene2id = dict()
|
|
|
+# knowledge2id = dict()
|
|
|
+# init_id2max_id = dict()
|
|
|
# count_index = 0
|
|
|
# for i in range(len(df)):
|
|
|
-# if not pd.isna(df['知识点'][i]):
|
|
|
+# if not pd.isna(df['2级知识点'][i]):
|
|
|
# count_index += 1
|
|
|
-# sign_index = 10000 + count_index * 10
|
|
|
-# knowledge = df['情景'][i]
|
|
|
+# if not pd.isna(df['3级知识点'][i]):
|
|
|
+# sign = df['3级知识点'][i].split(' ')[0].split('.')
|
|
|
+# # sign_index = 10000 + int(sign[0]) * 100 + int(sign[1]) * 10
|
|
|
+# sign_index = 10000 + count_index * 100 + int(sign[1]) * 10
|
|
|
+# init_id = sign_index
|
|
|
+# if init_id not in init_id2max_id:
|
|
|
+# init_id2max_id[init_id] = []
|
|
|
+# else:
|
|
|
+# init_id2max_id[init_id].append(sign_index)
|
|
|
+# knowledge = df['4级知识点'][i]
|
|
|
# if not pd.isna(knowledge):
|
|
|
# sign_index += 1
|
|
|
-# scene2id[knowledge] = sign_index
|
|
|
-# keyword2id_dict["scene2id"] = scene2id
|
|
|
+# knowledge2id[knowledge] = sign_index
|
|
|
+# if init_id not in init_id2max_id:
|
|
|
+# init_id2max_id[init_id] = []
|
|
|
+# else:
|
|
|
+# init_id2max_id[init_id].append(sign_index)
|
|
|
+# keyword2id_dict["knowledge2id"] = knowledge2id
|
|
|
+# keyword2id_dict["init_id2max_id"] = init_id2max_id
|
|
|
|
|
|
-# 风向标-知识点
|
|
|
-excel_path = r"data/物理知识点.xlsx"
|
|
|
# Exam-board knowledge points: build hierarchical IDs from the mapping sheet.
#
# Positional columns 2-5 of the sheet are treated as four nesting levels.
# Every ID starts from 100000000 and each level advances its own digit group:
#   column 2 -> 100000000 + count_index * 1000000
#   column 3 -> next multiple of 10000
#   column 4 -> next multiple of 100
#   column 5 -> + 1
# A cell in columns 2-4 is recorded in knowledge2id only when it is a leaf,
# i.e. the following row has nothing in the next-deeper column. Column-5 cells
# are always recorded.
# NOTE(review): presumably each label looks like "<number> <name>"; only the
# second space-separated token is used as the key -- confirm against the sheet.
# NOTE(review): assumes a column-3 row always appears before the first
# column-4/5 row; otherwise relate_index is still unbound -- confirm.
excel_path = r"data/初中物理知识对应关系.xlsx"
df = pd.read_excel(excel_path)
knowledge2id = dict()
# Column-3 ID -> IDs of the column-4 leaves and column-5 entries recorded
# beneath it.
init_id2max_id = dict()
count_index = 0
row_count = len(df)
for i in range(row_count):
    # The look-ahead checks below peek at row i + 1. On the final row there
    # is no such row, so the entry is treated as having no children instead
    # of indexing past the end of the frame.
    is_last_row = i + 1 >= row_count

    # df.iat[row, col] is purely positional scalar access; it avoids using an
    # integer key on a row Series whose index is the column labels.
    if not pd.isna(df.iat[i, 2]):
        count_index += 1
        sign_index = 100000000 + count_index * 1000000
        if is_last_row or pd.isna(df.iat[i + 1, 3]):
            knowledge = df.iat[i, 2].split(' ')[1]
            knowledge2id[knowledge] = sign_index
            continue

    if not pd.isna(df.iat[i, 3]):
        # Clear the two lower digit groups, then step to the next column-3 slot.
        sign_index = sign_index // 10000 * 10000
        sign_index += 10000
        relate_index = sign_index
        init_id2max_id[relate_index] = []
        if is_last_row or pd.isna(df.iat[i + 1, 4]):
            knowledge = df.iat[i, 3].split(' ')[1]
            knowledge2id[knowledge] = sign_index
            continue

    if not pd.isna(df.iat[i, 4]):
        # Clear the lowest digit group, then step to the next column-4 slot.
        sign_index = sign_index // 100 * 100
        sign_index += 100
        if is_last_row or pd.isna(df.iat[i + 1, 5]):
            knowledge = df.iat[i, 4].split(' ')[1]
            knowledge2id[knowledge] = sign_index
            init_id2max_id[relate_index].append(sign_index)
            continue

    if not pd.isna(df.iat[i, 5]):
        sign_index += 1
        knowledge = df.iat[i, 5].split(' ')[1]
        knowledge2id[knowledge] = sign_index
        init_id2max_id[relate_index].append(sign_index)

keyword2id_dict["knowledge2id"] = knowledge2id
keyword2id_dict["init_id2max_id"] = init_id2max_id
|
|
|
|
|
|
-# # 考试院-知识点
|
|
|
-# excel_path = r"data/初中物理知识对应关系.xlsx"
|
|
|
-# df = pd.read_excel(excel_path)
|
|
|
-# knowledge2id = dict()
|
|
|
-# init_id2max_id = dict()
|
|
|
-# count_index = 0
|
|
|
-# for i in range(len(df)):
|
|
|
-# if not pd.isna(df.iloc[i][2]):
|
|
|
-# count_index += 1
|
|
|
-# sign_index = 100000000 + count_index * 1000000
|
|
|
-# if pd.isna(df.iloc[i+1][3]):
|
|
|
-# knowledge = df.iloc[i][2].split(' ')[1]
|
|
|
-# knowledge2id[knowledge] = sign_index
|
|
|
-# continue
|
|
|
-# if not pd.isna(df.iloc[i][3]):
|
|
|
-# sign_index = int(str(sign_index)[:-4]) * 10000
|
|
|
-# sign_index += 10000
|
|
|
-# relate_index = sign_index
|
|
|
-# init_id2max_id[relate_index] = []
|
|
|
-# if pd.isna(df.iloc[i+1][4]):
|
|
|
-# knowledge = df.iloc[i][3].split(' ')[1]
|
|
|
-# knowledge2id[knowledge] = sign_index
|
|
|
-# continue
|
|
|
-# if not pd.isna(df.iloc[i][4]):
|
|
|
-# sign_index = int(str(sign_index)[:-2]) * 100
|
|
|
-# sign_index += 100
|
|
|
-# if pd.isna(df.iloc[i+1][5]):
|
|
|
-# knowledge = df.iloc[i][4].split(' ')[1]
|
|
|
-# knowledge2id[knowledge] = sign_index
|
|
|
-# init_id2max_id[relate_index].append(sign_index)
|
|
|
-# continue
|
|
|
-# if not pd.isna(df.iloc[i][5]):
|
|
|
-# sign_index += 1
|
|
|
-# knowledge = df.iloc[i][5].split(' ')[1]
|
|
|
-# knowledge2id[knowledge] = sign_index
|
|
|
-# init_id2max_id[relate_index].append(sign_index)
|
|
|
-
|
|
|
-# keyword2id_dict["knowledge2id"] = knowledge2id
|
|
|
-# keyword2id_dict["init_id2max_id"] = init_id2max_id
|
|
|
-
|
|
|
# Mapping conversion: write the collected mappings out as JSON, keeping
# non-ASCII characters unescaped and indenting by two spaces.
with open("model_data/keyword_mapping.json", mode='w', encoding="utf8") as mapping_file:
    json.dump(keyword2id_dict, fp=mapping_file, indent=2, ensure_ascii=False)
|