12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- import json
- import pandas as pd
# Container for every keyword -> id table; it is serialised to JSON at the
# end of the script.
keyword2id_dict = dict()

# Physical quantities.
# Each non-empty cell in the '类别' column starts a new group whose base id is
# (group number * 100); every non-empty '物理量' cell then receives the next
# consecutive id inside the current group.
excel_path = r"data/物理量.xlsx"
df = pd.read_excel(excel_path)
quantity2id = dict()
count_index = 0
for category, knowledge in zip(df['类别'], df['物理量']):
    if pd.notna(category):
        # A new group begins on this row: restart numbering at its base id.
        count_index += 1
        sign_index = 100 * count_index
    if pd.isna(knowledge):
        # No quantity on this row, so there is nothing to register.
        continue
    sign_index += 1
    quantity2id[knowledge] = sign_index
keyword2id_dict["quantity2id"] = quantity2id
- # # Fengxiangbiao (风向标) knowledge points -- disabled variant, kept commented out
- # excel_path = r"data/物理知识点.xlsx"
- # df = pd.read_excel(excel_path)
- # knowledge2id = dict()
- # init_id2max_id = dict()
- # count_index = 0
- # for i in range(len(df)):
- # if not pd.isna(df['2级知识点'][i]):
- # count_index += 1
- # if not pd.isna(df['3级知识点'][i]):
- # sign = df['3级知识点'][i].split(' ')[0].split('.')
- # # sign_index = 10000 + int(sign[0]) * 100 + int(sign[1]) * 10
- # sign_index = 10000 + count_index * 100 + int(sign[1]) * 10
- # init_id = sign_index
- # if init_id not in init_id2max_id:
- # init_id2max_id[init_id] = []
- # else:
- # init_id2max_id[init_id].append(sign_index)
- # knowledge = df['4级知识点'][i]
- # if not pd.isna(knowledge):
- # sign_index += 1
- # knowledge2id[knowledge] = sign_index
- # if init_id not in init_id2max_id:
- # init_id2max_id[init_id] = []
- # else:
- # init_id2max_id[init_id].append(sign_index)
- # keyword2id_dict["knowledge2id"] = knowledge2id
- # keyword2id_dict["init_id2max_id"] = init_id2max_id
- # # Write the keyword mapping to JSON
- # with open("model_data/fxb_keyword_mapping.json", 'w', encoding="utf8") as f:
- # json.dump(keyword2id_dict, f, ensure_ascii=False, indent=2)
# Exam-authority knowledge points.
#
# Positional columns 2-5 of the sheet are read as four nested levels. Every
# cell is expected to look like "<number> <name>": the text is split on a
# single space and the second token is used as the keyword.
#
# Id layout:
#   column 2: 100000000 + n * 1000000   (n = running count of column-2 cells)
#   column 3: lower four digits cleared, then + 10000
#   column 4: lower two digits cleared, then + 100
#   column 5: + 1
#
# knowledge2id   maps keyword -> id for every entry recorded below.
# init_id2max_id maps each column-3 id -> list of the ids recorded beneath it
#                (column-4 leaves and all column-5 entries).
excel_path = r"data/初中物理知识对应关系.xlsx"
df = pd.read_excel(excel_path)
knowledge2id = dict()
init_id2max_id = dict()
count_index = 0


def _next_cell_is_blank(frame, row, col):
    """Return True if `frame` has no row after `row` or that row's `col` cell is NaN.

    The last row has no successor; treating it as blank lets a trailing entry
    be recorded as a leaf instead of raising IndexError on `row + 1`.
    """
    if row + 1 >= len(frame):
        return True
    return pd.isna(frame.iloc[row + 1, col])


for i in range(len(df)):
    # Cells are addressed as df.iloc[row, col]; integer [] on the row Series
    # returned by df.iloc[row] relies on deprecated positional fallback.
    if not pd.isna(df.iloc[i, 2]):
        count_index += 1
        sign_index = 100000000 + count_index * 1000000
        if _next_cell_is_blank(df, i, 3):
            # Nothing in column 3 on the following row: record this entry itself.
            knowledge = df.iloc[i, 2].split(' ')[1]
            knowledge2id[knowledge] = sign_index
            continue
    if not pd.isna(df.iloc[i, 3]):
        # Clear the lower four digits, then advance to the next column-3 id.
        sign_index = sign_index // 10000 * 10000
        sign_index += 10000
        relate_index = sign_index
        init_id2max_id[relate_index] = []
        if _next_cell_is_blank(df, i, 4):
            knowledge = df.iloc[i, 3].split(' ')[1]
            knowledge2id[knowledge] = sign_index
            continue
    if not pd.isna(df.iloc[i, 4]):
        # Clear the lower two digits, then advance to the next column-4 id.
        sign_index = sign_index // 100 * 100
        sign_index += 100
        if _next_cell_is_blank(df, i, 5):
            knowledge = df.iloc[i, 4].split(' ')[1]
            knowledge2id[knowledge] = sign_index
            init_id2max_id[relate_index].append(sign_index)
            continue
    if not pd.isna(df.iloc[i, 5]):
        sign_index += 1
        knowledge = df.iloc[i, 5].split(' ')[1]
        knowledge2id[knowledge] = sign_index
        init_id2max_id[relate_index].append(sign_index)
keyword2id_dict["knowledge2id"] = knowledge2id
keyword2id_dict["init_id2max_id"] = init_id2max_id

# Write the combined mapping to disk. json.dump emits the integer keys of
# init_id2max_id as strings.
with open("model_data/ksy_keyword_mapping.json", 'w', encoding="utf8") as f:
    json.dump(keyword2id_dict, f, ensure_ascii=False, indent=2)
|