# File:get_sheet_points_by_nlp.py
# Author:lynn
# Date:2020/5/19 18:23
import ast, cv2, re
from segment.sheet_resolve.analysis.sheet.tag_parse import TagParse
from segment.sheet_resolve.tools.utils import crop_region

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET


def decide_coordinate_full_contains(coordinate1, coordinate2):
    """Return True when box ``coordinate1`` fully contains box ``coordinate2``.

    Both arguments are indexable as ``[xmin, ymin, xmax, ymax]``.  Edges that
    coincide count as contained (the comparisons are inclusive).
    """
    xmin1, ymin1, xmax1, ymax1 = (coordinate1[i] for i in range(4))
    xmin2, ymin2, xmax2, ymax2 = (coordinate2[i] for i in range(4))
    return bool(
        xmin1 <= xmin2 and ymin1 <= ymin2 and xmax1 >= xmax2 and ymax1 >= ymax2
    )


def _first_int_in(text):
    """Return the first run of digits in ``text`` as an int, or None if absent."""
    digits = re.search(r'\d+', text)
    return int(digits.group()) if digits else None


def analyse_solve_solve0_result(result, ele):
    """Copy the question number and total score found in ``result`` onto ``ele``.

    ``result`` is a sequence of ``(text, tag)`` pairs.  Nothing is done when any
    pair is tagged ``'O'``.  Otherwise the first integer in the first
    ``'N'``-tagged text is stored as ``ele['number']`` and the first integer in
    the first ``'T'``-tagged text is stored as ``ele['default_points']``.

    A tagged text that contains no digits is ignored instead of raising
    ``IndexError``.

    Returns:
        ``[ele]`` when at least one field was written, otherwise ``[]``.
        ``ele`` is modified in place.
    """
    tags = [pair[1] for pair in result]
    if 'O' in tags:
        return []

    updated = False
    # (tag, destination key) pairs; 'N' is handled before 'T' as in the
    # original branch order.
    for tag, key in (('N', 'number'), ('T', 'default_points')):
        if tag not in tags:
            continue
        parsed = _first_int_in(result[tags.index(tag)][0])
        if parsed is not None:
            ele[key] = parsed
            updated = True

    return [ele] if updated else []
def _first_number(text):
    """Return the first integer found in ``text``, or None when it has no digits."""
    found = re.search(r'\d+', text)
    return int(found.group()) if found else None


def _parse_points(text):
    """Convert a score token to ``float``; return None when it holds no number.

    ``float(text)`` is tried first so plain numeric tokens behave exactly as
    before; otherwise the first (optionally decimal) number in the text is used.
    """
    try:
        return float(text)
    except (TypeError, ValueError):
        pass
    found = re.search(r'\d+(?:\.\d+)?', str(text))
    return float(found.group()) if found else None


def _bbox_as_list(box):
    """Flatten a bounding-box dict into ``[xmin, ymin, xmax, ymax]``."""
    return [box['xmin'], box['ymin'], box['xmax'], box['ymax']]


def analyse_choice_result(result, choice_m_list):
    """Assign per-row default points to choice boxes from tagged score text.

    ``result`` is a sequence of ``(text, tag)`` pairs.  It is cut into groups at
    every ``'O'`` tag (the first group also keeps any tokens that precede the
    first ``'O'``).  In each group the ``'O'`` text supplies a question-number
    range (``"1-5"``, ``"1~5"`` or a single number) and the first ``'S'`` text
    supplies the score.

    For every box in ``choice_m_list`` the numbers in ``box['number']`` that
    fall outside the range are counted; when that count divided by
    ``box['rows']`` is at most 0.4 the box receives
    ``box['default_points'] = [score] * rows``.

    Groups without an ``'S'`` token, without digits in the ``'O'`` text, or
    with a non-numeric score are skipped, as are boxes whose ``rows`` is zero.

    Returns:
        ``choice_m_list`` itself; boxes are modified in place.
    """
    tags = [pair[1] for pair in result]
    range_starts = [pos for pos, tag in enumerate(tags) if tag == 'O']
    if not range_starts:
        return choice_m_list

    # Each group runs up to (not including) the next 'O'; the last group runs
    # to the end of ``result``.  The slice end is exclusive, so no "- 1".
    group_starts = [0] + range_starts[1:]
    group_ends = range_starts[1:] + [len(result)]

    scored_ranges = []
    for start, end in zip(group_starts, group_ends):
        part = result[start:end]
        part_tags = [pair[1] for pair in part]
        if 'S' not in part_tags:
            continue
        bounds = [int(n) for n in re.findall(r'\d+', part[part_tags.index('O')][0])]
        points = _parse_points(part[part_tags.index('S')][0])
        if not bounds or points is None:
            continue
        first = bounds[0]
        last = bounds[1] if len(bounds) > 1 else first
        scored_ranges.append((range(first, last + 1), points))

    for numbers, points in scored_ranges:
        for box in choice_m_list:
            box_numbers = box['number']
            rows = box['rows']
            if not rows:
                # Avoid dividing by zero for a box that reports no rows.
                continue
            outside = sum(1 for number in box_numbers if number not in numbers)
            if outside / rows <= 0.4:
                box['default_points'] = [points] * rows

    return choice_m_list


def analyse_cloze_result(result, cloze_and_cloze_s_list):
    """Assign a default score to every blank of the given cloze groups.

    ``result`` is a sequence of ``(text, tag)`` pairs.  A score is derived only
    when a ``'T'`` tag is present:

    * with an ``'S'`` tag, the first integer in the ``'S'`` text is used as
      the score of each blank;
    * otherwise, with a ``'C'`` tag, the first integer of the ``'T'`` text is
      divided by the first integer of the ``'C'`` text (presumably total
      score / number of blanks -- confirm against the tagging model).

    The score is written to ``blank['default_points']`` (as ``float``) for
    every blank in each group's ``'cloze_s_info'`` list.

    Returns:
        The list of blanks that received a score; ``[]`` when no score could
        be derived.  Blanks are modified in place.
    """
    tags = [pair[1] for pair in result]

    points = None
    if 'T' in tags:
        if 'S' in tags:
            per_blank = _first_number(result[tags.index('S')][0])
            if per_blank is not None:
                points = float(per_blank)
        elif 'C' in tags:
            total = _first_number(result[tags.index('T')][0])
            count = _first_number(result[tags.index('C')][0])
            # ``count`` of 0 or None cannot be used as a divisor.
            if total is not None and count:
                points = total / count

    if points is None:
        return []

    scored = []
    for group in cloze_and_cloze_s_list:
        for blank in group['cloze_s_info']:
            blank['default_points'] = points
            scored.append(blank)
    return scored


def analyse_cloze_result1(result, cloze_and_cloze_s_list):
    """Alias of :func:`analyse_cloze_result`, kept for existing callers."""
    return analyse_cloze_result(result, cloze_and_cloze_s_list)


def get_sheet_points_by_nlp(sheet_dict):
    """Fill ``default_points`` on sheet regions from their OCR'd score text.

    Every region in ``sheet_dict['regions']`` that carries a
    ``'type_score_ocr'`` entry has that text tagged with the CRF model and the
    tagged result dispatched on the region's ``'class_name'``:
    ``'cloze'`` -> :func:`analyse_cloze_result` (on the blanks fully inside
    that cloze box), ``'choice'`` -> :func:`analyse_choice_result` (on all
    ``'choice_m'`` regions), anything else ->
    :func:`analyse_solve_solve0_result`.  Scores found are copied onto regions
    with the same class name and bounding box.  Finally ``'type_score_ocr'``
    is removed from every region.

    ``CRFPP`` is imported only when at least one region needs tagging.

    Returns:
        ``sheet_dict`` itself, updated in place.
    """
    regions = sheet_dict['regions']
    choice_m_list = [r for r in regions if r['class_name'] == 'choice_m']
    cloze_s_regions = [r for r in regions if r['class_name'] == 'cloze_s']

    # Pair every cloze region with the blanks lying fully inside its box.
    cloze_groups = []
    for cloze in regions:
        if cloze['class_name'] != 'cloze':
            continue
        cloze_bbox = _bbox_as_list(cloze['bounding_box'])
        blanks = [
            blank for blank in cloze_s_regions
            if decide_coordinate_full_contains(
                cloze_bbox, _bbox_as_list(blank['bounding_box']))
        ]
        cloze_groups.append({'cloze_info': cloze, 'cloze_s_info': blanks})

    ocr_regions = [r for r in regions if 'type_score_ocr' in r]
    if ocr_regions:
        import CRFPP  # deferred: only needed when there is text to tag

        for region in ocr_regions:
            # NOTE(review): the tagger is rebuilt for every region. Hoisting it
            # would be cheaper but is only safe if TagParse resets the tagger
            # state between calls -- confirm before changing.
            # NOTE(review): the model path is relative to the working directory.
            taggers = CRFPP.Tagger(
                "-m " + './segment/sheet_resolve/model/nlp_model/crf2.model')
            result = TagParse(taggers).get_tag_val(region['type_score_ocr'])

            class_name = region['class_name']
            if class_name == 'cloze':
                region_bbox = _bbox_as_list(region['bounding_box'])
                matching_groups = [
                    group for group in cloze_groups
                    if _bbox_as_list(group['cloze_info']['bounding_box']) == region_bbox
                ]
                new_list = analyse_cloze_result(result, matching_groups)
            elif class_name == 'choice':
                new_list = analyse_choice_result(result, choice_m_list)
            else:
                new_list = analyse_solve_solve0_result(result, region)

            for scored in new_list:
                if 'default_points' not in scored:
                    # e.g. a choice box no range matched, or a region for which
                    # only a question number was recognised.
                    continue
                for target in regions:
                    if (target['class_name'] == scored['class_name']
                            and target['bounding_box'] == scored['bounding_box']):
                        target['default_points'] = scored['default_points']

    # The raw OCR text is an intermediate value; strip it from the output.
    for region in regions:
        region.pop('type_score_ocr', None)

    sheet_dict['regions'] = regions
    return sheet_dict