Просмотр исходного кода

1.选择题补全中解决单行问题;
2.修复考号补全中的非空bug,[{}];
3.题号分数识别增加nlp模型;
4.删除tr-ocr;
5.修复选择题号re bug.

lighttxu 4 лет назад
Родитель
Коммит
aa5a5cc974

+ 14 - 7
segment/sheet_resolve/analysis/choice/choice_line_box.py

@@ -3,7 +3,9 @@
 # @Time    : 2018/11/22 0022 下午 16:01
 import time
 import re
-import cv2, os
+import cv2
+import os
+import random
 import traceback
 import numpy as np
 import xml.etree.cElementTree as ET
@@ -376,7 +378,7 @@ def choice_bbox_vague(choice_m_list0, x_y_interval_ave, singe_box_width_height_a
             y_diff = x_y_interval_ave[1]
             s_height = singe_box_width_height_ave[1]
             choice_bbox = (np.hstack((np.array([min(xmin0), min(ymin0) - y_diff - 3 * s_height]), np.array([max(xmax0), max(ymax0)])))).tolist()
-            choice_bbox_with_index_list = (choice_bbox, choice_m_list[1])
+            choice_bbox_with_index_list = (choice_bbox, choice_m_list1[1])
             choice_bbox_all.append(choice_bbox_with_index_list)
     return choice_bbox_all
 
@@ -432,7 +434,9 @@ def choice_m_row_col(image, choice_m_bbox_list, xml_path):
     a_z = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
     choice_m_dict_list = []
 
-    choice_m_for_dircetion = utils.crop_region(image, choice_m_bbox_list[0]['bounding_box'])
+    # 或在长宽比接近的choice_m中选取
+    random_one = random.randint(0, len(choice_m_bbox_list)-1)
+    choice_m_for_dircetion = utils.crop_region(image, choice_m_bbox_list[random_one]['bounding_box'])
     res_dict = get_ocr_text_and_coordinate(choice_m_for_dircetion, ocr_accuracy='accurate', language_type='ENG')
     direction = get_direction(res_dict)
     for index0, box in enumerate(choice_m_bbox_list):  # rcnn识别的框匹配题号
@@ -441,7 +445,7 @@ def choice_m_row_col(image, choice_m_bbox_list, xml_path):
         # box_coordiante = (m_left, m_top, box['xmax'], box['ymax'])
         single_choice_m = utils.crop_region(image, box)
         try:
-            row_col_dict = get_choice_m_row_and_col(m_left, m_top, single_choice_m)     # 所有的小框, 行列等
+            row_col_dict = get_choice_m_row_and_col(m_left, m_top, single_choice_m)     # 所有的小框,行列等
             if len(row_col_dict) > 0:
 
                 if direction == 90:
@@ -503,8 +507,11 @@ def choice_m_row_col(image, choice_m_bbox_list, xml_path):
                     s_box_w_h.append(s_box_wid_hei)
         x_y_interval_arr = np.array(x_y_interval_all)
         if len(x_y_interval_arr) == 1:
+            x_y_interval_all_arr = np.array(x_y_interval_all)
+            x_ = int(np.mean(x_y_interval_all_arr[:, 0]))
+            y_ = int(np.mean(x_y_interval_all_arr[:, 1]))
+            x_y_interval_ave = (x_, y_)
 
-            x_y_interval_ave = x_y_interval_all[0][0]
             singe_box_width_height_ave = s_box_w_h[0]
 
             image_height, image_width, _ = image.shape
@@ -521,8 +528,8 @@ def choice_m_row_col(image, choice_m_bbox_list, xml_path):
         choice_m_dict_list_all_tmp = []
         for index, choice_box_ele in enumerate(choice_bbox):
             choice_region = utils.crop_region_direct(image, choice_box_ele[0])
-            choice_path = xml_path[: xml_path.rfind('\\')]
-            cv2.imwrite(os.path.join(choice_path, 'choice_region_' + str(index) + '.jpg'), choice_region)
+            # choice_path = xml_path[: xml_path.rfind('\\')]
+            # cv2.imwrite(os.path.join(choice_path, 'choice_region_' + str(index) + '.jpg'), choice_region)
             choice_m_box_dict_new = [choice_m_box_dict[i] for i in choice_box_ele[1]]
             choice_m_dict_list_part = get_title_number_by_choice_m.get_title_number(choice_box_ele[0], choice_region,
                                                                                      choice_m_box_dict_new, direction)

+ 4 - 3
segment/sheet_resolve/analysis/resolve.py

@@ -298,16 +298,17 @@ def exam_number_row_col(image, regions, xml_path):
         tree = utils.create_xml(name, tree,
                                 exam_number_box['xmin'], exam_number_box['ymin'],
                                 exam_number_box['xmax'], exam_number_box['ymax'])
+        tree.write(xml_path)
+        return [exam_number_row_col_dict]
 
     else:
         tree = utils.create_xml('exam_number', tree,
                                 exam_number_box['xmin'], exam_number_box['ymin'],
                                 exam_number_box['xmax'], exam_number_box['ymax'])
-        exam_number_row_col_dict = {}
 
-    tree.write(xml_path)
+        tree.write(xml_path)
 
-    return [exam_number_row_col_dict]
+        return []
 
 
 def cloze(image, regions, xml_path, conf_thresh, mns_thresh, cloze_sess):

+ 94 - 16
segment/sheet_resolve/analysis/sheet/choice_infer.py

@@ -263,16 +263,16 @@ def cluster_and_anti_abnormal(image, xml_path, digital_list, chars_list,
                               mean_height, mean_width, choice_s_height, choice_s_width, limit_loc):
     limit_left, limit_top, limit_right, limit_bottom = limit_loc
     limit_width, limit_height = limit_right - limit_left, limit_bottom - limit_top
-    arr = np.ones((len(digital_list), 2))
+    digital_loc_arr = np.ones((len(digital_list), 2))
     for i, ele in enumerate(digital_list):
-        arr[i] = np.array([ele["loc"][-2], ele["loc"][-1]])
+        digital_loc_arr[i] = np.array([ele["loc"][-2], ele["loc"][-1]])
 
     if choice_s_height != 0:
         eps = int(choice_s_height * 2.5)
     else:
         eps = int(mean_height * 3)
     print("eps: ", eps)
-    db = DBSCAN(eps=eps, min_samples=2, metric='chebyshev').fit(arr)
+    db = DBSCAN(eps=eps, min_samples=2, metric='chebyshev').fit(digital_loc_arr)
 
     labels = db.labels_
     # print(labels)
@@ -357,7 +357,7 @@ def cluster_and_anti_abnormal(image, xml_path, digital_list, chars_list,
                 current_height = current_loc[3] - current_loc[1]
 
                 infer_height = max((choice_m_mean_height - current_height), int(dif * current_height / current_len))
-                infer_bottom = min(current_loc[3] + infer_height, limit_height-1)
+                infer_bottom = min(current_loc[3] + infer_height, limit_height - 1)
                 if infer_bottom <= limit_height:
                     choice_m_numbers_list[e_index]["loc"][3] = infer_bottom
                     choice_m_numbers_list[e_index]["loc"][5] = (choice_m_numbers_list[e_index]["loc"][1] +
@@ -403,7 +403,7 @@ def cluster_and_anti_abnormal(image, xml_path, digital_list, chars_list,
         current_row_choice_m_d = sorted(current_row_choice_m_d, key=lambda x: x["loc"][0])
         # current_row_choice_m_d.append(choice_m_numbers_list[random_index])
         split_pix = sorted([ele["loc"][0] for ele in current_row_choice_m_d])  # xmin排序
-        split_index = get_split_index(split_pix, dif=choice_s_width*0.8)
+        split_index = get_split_index(split_pix, dif=choice_s_width * 0.8)
         split_pix = [split_pix[ele] for ele in split_index[:-1]]
 
         block_list = []
@@ -474,10 +474,10 @@ def cluster_and_anti_abnormal(image, xml_path, digital_list, chars_list,
                     choice_option = 'A,B,C,D'
                 else:
                     tmp = max(set(letter_index))
-                # while letter_index_times[tmp] < 2 and tmp > 3:
-                #     t_list = list(set(letter_index))
-                #     t_list.remove(tmp)
-                #     tmp = max(t_list)
+                    # while letter_index_times[tmp] < 2 and tmp > 3:
+                    #     t_list = list(set(letter_index))
+                    #     t_list.remove(tmp)
+                    #     tmp = max(t_list)
 
                     choice_option = ",".join(a_z[min(letter_index):tmp + 1])
                 cols = tmp
@@ -545,11 +545,89 @@ def cluster_and_anti_abnormal(image, xml_path, digital_list, chars_list,
             if ele in current_row_chars:
                 choice_m_numbers_list.remove(ele)
 
-    # 单独一行不聚类
+        # 解决单行问题
+        crt_right_max = max([int(ele['bounding_box']['xmax']) for ele in choice_m_list])
+        if limit_right - crt_right_max > choice_s_width:
+            # 存在区域
+            region_loc = {'xmin': crt_right_max + 10, 'ymin': choice_m_list[0]['bounding_box']['ymin'],
+                          'xmax': limit_right, 'ymax': choice_m_list[0]['bounding_box']['ymax']}
+
+            contain_dig = []
+            for i, ele in enumerate(digital_loc_arr):
+                if (region_loc['xmin'] < ele[0] + limit_left < region_loc['xmax']
+                        and region_loc['ymin'] < ele[1] + limit_top < region_loc['ymax']):
+                    contain_dig.append(digital_list[i])
+
+            contain_chars = [ele for ele in chars_list
+                             if region_loc['xmin'] < (
+                                     ele["location"]["left"] + ele["location"]["width"] // 2) + limit_left <
+                             region_loc['xmax']
+                             and
+                             region_loc['xmin'] < (
+                                     ele["location"]["top"] + ele["location"]["height"] // 2) + limit_top <
+                             region_loc['ymax']]
+            if contain_dig or contain_chars:
+                d_ymin, d_ymax, d_xmin, d_xmax = 9999, 0, 9999, 0
+                if contain_dig:
+                    d_ymin = min([ele['loc'][1] for ele in contain_dig])
+                    d_ymax = max([ele['loc'][3] for ele in contain_dig])
+                    d_xmin = min([ele['loc'][0] for ele in contain_dig])
+                    d_xmax = max([ele['loc'][2] for ele in contain_dig])
+
+                c_ymin, c_ymax, c_xmin, c_xmax = 9999, 0, 9999, 0
+                if contain_chars:
+                    c_ymin = min([ele["location"]["top"] for ele in contain_chars])
+                    c_ymax = max([ele["location"]["top"] + ele["location"]["height"] for ele in contain_chars])
+                    c_xmin = min([ele["location"]["left"] for ele in contain_chars])
+                    c_xmax = max([ele["location"]["left"] + ele["location"]["width"] for ele in contain_chars])
+
+                r_ymin, r_ymax = min(d_ymin, c_ymin), max(d_ymax, c_ymax)
+                r_xmin, r_xmax = min(d_xmin, c_xmin), max(d_xmax, c_xmax)
+
+                region_loc['ymin'] = r_ymin - 10 + limit_top
+                region_loc['ymax'] = r_ymax + 10 + limit_top
+                if d_xmin == r_xmin:
+                    region_loc['xmin'] = d_xmax + 5 + limit_left
+                    region_loc['xmax'] = d_xmax + 5 + limit_left + int(1.2 * choice_s_width)
+                else:
+                    if 1.2 * (r_xmax - r_xmin) > choice_s_width:
+                        region_loc['xmin'] = r_xmin - 10 + limit_left
+                        region_loc['xmax'] = r_xmax + 10 + limit_left
+                    else:
+                        region_loc['xmin'] = max((r_xmax - r_xmin) // 2 + r_xmin - choice_s_width + limit_left,
+                                                 crt_right_max + 10)
+                        region_loc['xmax'] = min((r_xmax - r_xmin) // 2 + r_xmin + choice_s_width + limit_left,
+                                                 limit_right)
+
+                try:
+                    choice_m_img = utils.crop_region(image, region_loc)
+                    right_loc, bottom_loc = adjust_choice_m(choice_m_img, mean_height, mean_width * 2)
+                    if right_loc > 0:
+                        region_loc.update(dict(xmax=right_loc + region_loc['xmin']))
+                    if bottom_loc > 0:
+                        region_loc.update(dict(ymax=bottom_loc + region_loc['ymin']))
+                except Exception as e:
+                    print(e)
+                    traceback.print_exc()
+
+                choice_m = dict(class_name='choice_m',
+                                number=[-1],
+                                bounding_box=region_loc,
+                                choice_option='A,B,C,D',
+                                default_points=[5],
+                                direction=180,
+                                cols=4,
+                                rows=1,
+                                single_width=(region_loc['xmax'] - region_loc['xmin']) // 4,
+                                single_height=r_ymax - r_ymin
+                                )
+                choice_m_list.append(choice_m)
+
+    # 单独一行不聚类(理论上不会再到这一步了, 上个block解决)
     for i, revised_choice_m in enumerate(need_revised_choice_m_list):
         loc = revised_choice_m['bounding_box']
         left_part_loc = loc.copy()
-        left_part_loc.update({'xmax': loc['xmin']+choice_s_width})
+        left_part_loc.update({'xmax': loc['xmin'] + choice_s_width})
         choice_m_img = utils.crop_region(image, left_part_loc)
         right_loc, bottom_loc = adjust_choice_m(choice_m_img, mean_height, mean_width * 2)
         if right_loc > 0:
@@ -561,7 +639,7 @@ def cluster_and_anti_abnormal(image, xml_path, digital_list, chars_list,
 
         right_part_loc = loc.copy()
         # right_part_loc.update({'xmin': loc['xmax']-choice_s_width})
-        right_part_loc.update({'xmin': left_part_loc['xmax']+5})
+        right_part_loc.update({'xmin': left_part_loc['xmax'] + 5})
         choice_m_img = utils.crop_region(image, right_part_loc)
         right_loc, bottom_loc = adjust_choice_m(choice_m_img, mean_height, mean_width * 2)
         if right_loc > 0:
@@ -572,7 +650,7 @@ def cluster_and_anti_abnormal(image, xml_path, digital_list, chars_list,
         right_tmp_height = right_part_loc['ymax'] - right_part_loc['ymin']
 
         number_len = max(1, int(revised_choice_m['rows'] // (left_tmp_height // right_tmp_height)))
-        number = [ele+revised_choice_m['number'][-1]+1 for ele in range(number_len)]
+        number = [ele + revised_choice_m['number'][-1] + 1 for ele in range(number_len)]
         rows = len(number)
 
         revised_choice_m.update({'bounding_box': left_part_loc})
@@ -582,11 +660,11 @@ def cluster_and_anti_abnormal(image, xml_path, digital_list, chars_list,
         tmp.update({'bounding_box': right_part_loc, 'number': number, 'rows': rows})
         choice_m_list.append(tmp)
 
-    tmp = choice_m_list.copy()
-    for ele in tmp:
+    choice_m_list_copy = choice_m_list.copy()
+    for ele in choice_m_list_copy:
         loc = ele["bounding_box"]
         w, h = loc['xmax'] - loc['xmin'], loc['ymax'] - loc['ymin']
-        if 2*w*h < choice_s_width*choice_s_height:
+        if 2 * w * h < choice_s_width * choice_s_height:
             choice_m_list.remove(ele)
     return choice_m_list
 

Разница между файлами не показана из-за своего большого размера
+ 296 - 1106
segment/sheet_resolve/analysis/sheet/ocr_key_words.py


+ 2 - 1
segment/sheet_resolve/analysis/sheet/sheet_points.py

@@ -198,7 +198,8 @@ def get_total_title_quantity_and_value(box_with_content):
                     if digital_value == None:
                         value = -1
                     else:
-                        value = digital_value[0]
+                        digital_value_ = digital_value.group()
+                        value = digital_value_[0]
                     title_two_value.append(int(value))
                 else:
                     title_two_number0 = result2[0]

+ 260 - 0
segment/sheet_resolve/analysis/sheet/sheet_points_by_nlp.py

@@ -0,0 +1,260 @@
+# File:get_sheet_points_by_nlp.py
+# Author:lynn
+# Date:2020/5/19 18:23
+
+
+import ast, cv2, re
+from segment.sheet_resolve.analysis.sheet.tag_parse import TagParse
+import CRFPP
+from segment.sheet_resolve.tools.utils import crop_region
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+
+def decide_coordinate_full_contains(coordinate1, coordinate2):
+    """Return True when box ``coordinate1`` fully encloses box ``coordinate2``.
+
+    :param coordinate1: outer box, indexable as ``(xmin, ymin, xmax, ymax)``.
+    :param coordinate2: inner box, same layout.
+    :return: ``True`` if every edge of ``coordinate2`` lies on or inside the
+        matching edge of ``coordinate1`` (touching edges count), else ``False``.
+    """
+    xmin1, ymin1, xmax1, ymax1 = coordinate1[0], coordinate1[1], coordinate1[2], coordinate1[3]
+    xmin2, ymin2, xmax2, ymax2 = coordinate2[0], coordinate2[1], coordinate2[2], coordinate2[3]
+
+    # bool() keeps the return type a plain bool even for numpy scalar inputs.
+    return bool(xmin1 <= xmin2 and ymin1 <= ymin2 and xmax1 >= xmax2 and ymax1 >= ymax2)
+
+
+def analyse_solve_solve0_result(result, ele):
+    """Copy the number and/or score found in tagged text onto region ``ele``.
+
+    :param result: sequence of items where index 0 is a text fragment and
+        index 1 is its tag (presumably the CRF tagger output - confirm against
+        TagParse.get_tag_val).
+    :param ele: region dict; updated in place.
+    :return: ``[ele]`` when at least one field was written, otherwise ``[]``.
+
+    When any item is tagged 'O' nothing is changed and ``[]`` is returned.
+    Otherwise the first 'N' item supplies ``ele['number']`` and the first 'T'
+    item supplies ``ele['default_points']``; each value is the first run of
+    digits in that item's text.  An item whose text has no digits is ignored
+    rather than raising IndexError.
+    """
+    tags = [item[1] for item in result]
+    if 'O' in tags:
+        return []
+
+    def first_int(tag):
+        # First integer in the text of the first item carrying ``tag``.
+        if tag not in tags:
+            return None
+        found = re.search(r'\d+', result[tags.index(tag)][0])
+        return int(found.group()) if found else None
+
+    title_number = first_int('N')
+    total_score = first_int('T')
+    if title_number is None and total_score is None:
+        return []
+
+    if title_number is not None:
+        ele['number'] = title_number
+    if total_score is not None:
+        ele['default_points'] = total_score
+    return [ele]
+
+
+def analyse_choice_result(result, choice_m_list):
+    """Apply per-question scores from tagged text to ``choice_m`` regions.
+
+    :param result: list of items where index 0 is a text fragment and index 1
+        is its tag (presumably the CRF tagger output - confirm against
+        TagParse.get_tag_val).
+    :param choice_m_list: region dicts; each must provide ``'number'`` (list of
+        question numbers) and ``'rows'``.  Updated in place.
+    :return: ``choice_m_list`` itself, always.  The previous version returned
+        ``None`` when no 'O' tag was present, which the caller then iterated.
+
+    The tagged items are cut into segments at every 'O' item.  In each
+    segment the 'O' text is read as a ``first-last`` / ``first~last`` question
+    range and the 'S' text as the score of one question.  A region receives
+    ``default_points = [score] * rows`` when at most 40% (relative to its
+    ``rows``) of its question numbers fall outside the range.  Segments that
+    lack an 'O'/'S' item or whose range/score text cannot be parsed are
+    skipped instead of raising.
+    """
+    key_words_list = [item[1] for item in result]
+    if 'O' not in key_words_list:
+        return choice_m_list
+
+    # Segment boundaries: every 'O' position plus the end of the sequence.
+    end = len(key_words_list)
+    split_0_index = sorted({index for index, tag in enumerate(key_words_list) if tag == 'O'} | {end})
+    split_by_o_list = []
+    for index, boundary in enumerate(split_0_index[:-1]):
+        # The first segment always starts at item 0, even if the first 'O'
+        # comes later.
+        start = 0 if index == 0 else boundary
+        # NOTE(review): the "- 1" drops the item right before the next
+        # boundary (and the very last item); kept as in the original -
+        # confirm this is intended.
+        split_by_o_list.append(result[start: split_0_index[index + 1] - 1])
+    print(split_by_o_list)
+
+    number_with_value = []   # only printed, kept for the existing debug output
+    number_with_value1 = []
+    for part in split_by_o_list:
+        part_key_words = [item[1] for item in part]
+        if 'O' not in part_key_words or 'S' not in part_key_words:
+            continue
+        range_text = part[part_key_words.index('O')][0]
+        value = part[part_key_words.index('S')][0]
+
+        if '-' in range_text:
+            bounds = range_text.split('-')
+        elif '~' in range_text:
+            bounds = range_text.split('~')
+        else:
+            bounds = []
+        try:
+            first, last = int(bounds[0]), int(bounds[1])
+            float(value)  # validate now so the apply loop below cannot fail
+        except (IndexError, ValueError):
+            continue
+
+        number_list_all = list(range(first, last + 1))
+        number_with_value1.append({'number_list': number_list_all, 'value': value})
+        number_with_value.extend({number: value} for number in number_list_all)
+    print(number_with_value)
+
+    for nlp_number in number_with_value1:
+        covered = set(nlp_number['number_list'])
+        points = float(nlp_number['value'])
+        for choice_m_box in choice_m_list:
+            rows = choice_m_box['rows']
+            if not rows:
+                # avoids ZeroDivisionError on an empty region
+                continue
+            uncovered = sum(1 for number in choice_m_box['number'] if number not in covered)
+            if uncovered / rows <= 0.4:
+                choice_m_box['default_points'] = [points] * rows
+    return choice_m_list
+
+
+def analyse_cloze_result(result, cloze_and_cloze_s_list):
+    """Apply scores from tagged text to the blanks (``cloze_s``) of cloze regions.
+
+    :param result: sequence of items where index 0 is a text fragment and
+        index 1 is its tag (presumably the CRF tagger output - confirm against
+        TagParse.get_tag_val).
+    :param cloze_and_cloze_s_list: dicts with a ``'cloze_s_info'`` list of
+        blank region dicts; the blanks are updated in place.
+    :return: the blanks that received ``'default_points'``.
+
+    Behaviour, given a 'T' item is present:
+
+    * 'S' also present: the first integer of the 'S' text is written to each
+      blank's ``'number'`` and nothing is returned for it.
+      NOTE(review): writing a score-tagged value into ``'number'`` looks
+      suspicious; left unchanged - confirm intent.
+    * otherwise, 'C' present: ``default_points = total / count`` per blank,
+      with total taken from the 'T' text and count from the 'C' text.
+      The original condition was ``'C' not in ...`` followed by
+      ``index('C')``, which always raised ValueError, and it divided by the
+      list position of the 'C' item.  Reading 'C' as a count is an assumption
+      - TODO confirm against the tag set.
+
+    Texts without digits, or a zero count, leave the blanks untouched.
+    """
+    tags = [item[1] for item in result]
+
+    def first_int(tag):
+        # First integer in the text of the first item carrying ``tag``.
+        found = re.search(r'\d+', result[tags.index(tag)][0])
+        return int(found.group()) if found else None
+
+    # The tagged text is the same for every blank, so parse it once.
+    single_value = None
+    per_blank = None
+    if 'T' in tags:
+        if 'S' in tags:
+            single_value = first_int('S')
+        elif 'C' in tags:
+            total_score = first_int('T')
+            count = first_int('C')
+            if total_score is not None and count:
+                per_blank = total_score / float(count)
+
+    new_list = []
+    for cloze_and_cloze_s_ele in cloze_and_cloze_s_list:
+        for cloze_s_ele in cloze_and_cloze_s_ele['cloze_s_info']:
+            if single_value is not None:
+                cloze_s_ele['number'] = single_value
+            elif per_blank is not None:
+                cloze_s_ele['default_points'] = per_blank
+                new_list.append(cloze_s_ele)
+    return new_list
+
+
+def analyse_cloze_result1(result, cloze_and_cloze_s_list):
+    """Alias of ``analyse_cloze_result``; kept so existing callers keep working.
+
+    This function used to be a line-for-line copy of ``analyse_cloze_result``.
+    It now delegates to it so the two cannot drift apart.
+
+    :param result: tagged items, passed through unchanged.
+    :param cloze_and_cloze_s_list: cloze groups, passed through unchanged.
+    :return: whatever ``analyse_cloze_result`` returns.
+    """
+    return analyse_cloze_result(result, cloze_and_cloze_s_list)
+
+
+def get_sheet_points_by_nlp(sheet_dict):
+    """Fill in numbers / scores of sheet regions from their OCR'd score text.
+
+    :param sheet_dict: dict with a ``'regions'`` list; every region is a dict
+        with at least ``'class_name'`` and ``'bounding_box'``
+        (``xmin/ymin/xmax/ymax``).  Regions carrying ``'type_score_ocr'`` text
+        are tagged with the CRF model and analysed by region class.
+    :return: the same ``sheet_dict``; regions are updated in place and every
+        ``'type_score_ocr'`` key is removed.
+
+    Fixes relative to the first version: results of all OCR regions are
+    accumulated (they used to overwrite each other, so only the last region
+    reached the merge step), a ``None`` analyser result no longer crashes the
+    merge, and entries without ``'default_points'`` no longer raise KeyError.
+    """
+    regions = sheet_dict['regions']
+    choice_m_list = [ele for ele in regions if ele['class_name'] == 'choice_m']
+    cloze_s_regions = [ele for ele in regions if ele['class_name'] == 'cloze_s']
+
+    def as_bbox(region):
+        # Bounding box dict -> [xmin, ymin, xmax, ymax].
+        box = region['bounding_box']
+        return [box['xmin'], box['ymin'], box['xmax'], box['ymax']]
+
+    # Pair every cloze region with the blanks it fully contains.
+    cloze_and_cloze_s_list = []
+    for element_cloze in regions:
+        if element_cloze['class_name'] != 'cloze':
+            continue
+        cloze_bbox = as_bbox(element_cloze)
+        cloze_s_list = [element for element in cloze_s_regions
+                        if decide_coordinate_full_contains(cloze_bbox, as_bbox(element))]
+        cloze_and_cloze_s_list.append({'cloze_info': element_cloze, 'cloze_s_info': cloze_s_list})
+    print(cloze_and_cloze_s_list)
+
+    new_list = []
+    for ele in regions:
+        if 'type_score_ocr' not in ele:
+            continue
+        # A fresh tagger per text, as before: whether TagParse resets a reused
+        # tagger is not visible here.  NOTE(review): the model path is
+        # relative to the working directory - confirm the launch directory.
+        taggers = CRFPP.Tagger("-m " + './segment/sheet_resolve/model/nlp_model/crf2.model')
+        result = TagParse(taggers).get_tag_val(ele['type_score_ocr'])
+
+        if ele['class_name'] == 'cloze':
+            print(ele)
+            target_bbox = as_bbox(ele)
+            cloze_and_cloze_s_list0 = []
+            for ele1 in cloze_and_cloze_s_list:
+                print(ele1)
+                if as_bbox(ele1['cloze_info']) == target_bbox:
+                    cloze_and_cloze_s_list0.append(ele1)
+            updated = analyse_cloze_result(result, cloze_and_cloze_s_list0)
+        elif ele['class_name'] == 'choice':
+            updated = analyse_choice_result(result, choice_m_list)
+        else:
+            updated = analyse_solve_solve0_result(result, ele)
+        # Tolerate an analyser that returns None.
+        new_list.extend(updated or [])
+
+    # Copy scores onto regions with the same class and bounding box.
+    for ele0 in regions:
+        for ele1 in new_list:
+            if 'default_points' not in ele1:
+                # e.g. only a question number was recognised
+                continue
+            if ele0['class_name'] == ele1['class_name'] and ele0['bounding_box'] == ele1['bounding_box']:
+                ele0['default_points'] = ele1['default_points']
+
+    # The OCR text is an intermediate value and is not part of the output.
+    for ele in regions:
+        ele.pop('type_score_ocr', None)
+
+    sheet_dict.update({'regions': regions})
+    return sheet_dict

+ 265 - 285
segment/sheet_resolve/analysis/sheet/sheet_points_total.py

@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
-# @Time : 2020/5/22 0022 17:02
+# @Time : 2020/5/28 0022 17:02
 # @Author : LF
 # @FileName: sheet_points_total.py
 # @Software: PyCharm
+# local_baidu_OCR
 
 import requests
 import base64
@@ -14,10 +15,10 @@ from PIL import Image
 from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate_in_google_format
 from segment.sheet_resolve.analysis.sheet.ocr_key_words import key_words
 
-try:
-    import tr
-except Exception:
-    pass
+# try:
+#     import tr
+# except Exception:
+#     pass
 
 OCR_ACCURACY = 'accurate'
 
@@ -360,7 +361,6 @@ def get_sheet_number_total(answer_sheet, res, img0):
 
     '''解析type_score与对应分割模块的分数'''
     for i in range(len(type_score_boxs)):
-        type_score_flag = 1
         test_result1 = model_type_score(type_score_boxs[i], choice_boxs, cloze_boxs, solve_boxs, composition_boxs)
         if test_result1 != -1 and test_result1 != 0:
             if type_score_boxs[i][0] - 5 > 0:
@@ -380,19 +380,19 @@ def get_sheet_number_total(answer_sheet, res, img0):
             else:
                 ymaxss = type_score_boxs[i][3]
             test_result1['words'] = str()
-            try:  # tr_OCR
-                print('tr_OCR')
-                image_src_type_score = image_src.crop((xminss, yminss, xmaxss, ymaxss))
-                type_score_dict_ocr = tr.run(image_src_type_score)
-                for t in range(len(type_score_dict_ocr)):
-                    test_result1['words'] = test_result1['words'] + type_score_dict_ocr[t][1]
-            except Exception as e:  # baidu_OCR
-                print('baidu_OCR')
-                type_score_dict_ocr = get_ocr_text_and_coordinate_in_google_format(img0[yminss:ymaxss, xminss:xmaxss], ocr_accuracy=OCR_ACCURACY,language_type='CHN_ENG')
-                for t in range(len(type_score_dict_ocr['words'])):
-                    test_result1['words'] = test_result1['words'] + type_score_dict_ocr['words'][t]
-
-            test = key_words(test_result1, type_score_flag)
+            # try:  # tr_OCR
+            #     image_src_type_score = image_src.crop((xminss, yminss, xmaxss, ymaxss))
+            #     type_score_dict_ocr = tr.run(image_src_type_score)
+            #     print('tr_OCR')
+            #     for t in range(len(type_score_dict_ocr)):
+            #         test_result1['words'] = test_result1['words'] + type_score_dict_ocr[t][1]
+            # except Exception as e:  # baidu_OCR
+            #     print('baidu_OCR')
+            type_score_dict_ocr = get_ocr_text_and_coordinate_in_google_format(img0[yminss:ymaxss, xminss:xmaxss], ocr_accuracy=OCR_ACCURACY,language_type='CHN_ENG')
+            for t in range(len(type_score_dict_ocr['words'])):
+                test_result1['words'] = test_result1['words'] + type_score_dict_ocr['words'][t]
+
+            test = key_words(test_result1)
             if test == {}:
                 ### 添加返回值OCR结果
                 add_ocr = {}
@@ -443,7 +443,6 @@ def get_sheet_number_total(answer_sheet, res, img0):
                 if solve_boxs.count(all_test[jjjj]['Score_structure'][0]['bounding_box']):
                     solve_boxs.remove(all_test[jjjj]['Score_structure'][0]['bounding_box'])
     if choice_boxs != []:  # 9月16号修改
-        type_score_flag = 0
         for ij in range(len(choice_boxs)):
             if choice_boxs[ij][1] - 150 > 0:
                 yminss = choice_boxs[ij][1] - 150
@@ -453,52 +452,59 @@ def get_sheet_number_total(answer_sheet, res, img0):
                 xminss = choice_boxs[ij][0] - 100
             else:
                 xminss = choice_boxs[ij][0]
-            if yminss + 200 < img_h:
-                ymaxss = yminss + 200
-            else:
-                ymaxss = choice_boxs[ij][3]
-            type_score_dict_ocrs = {}
-            new_test = {}
-
-            try:  # tr_OCR
-                print('tr_OCR')
-                image_choice = image_src.crop((xminss, yminss, choice_boxs[ij][2], ymaxss))
-                res1 = tr.run(image_choice)
-                for i in range(len(res1)):
-                    if res1[i][1].find('分') != -1:
-                        type_score_dict_ocrs['words'] = res1[i][1]
-                    else:
-                        continue
-            except Exception as e:  # baidu_OCR
-                print('baidu_OCR')
-                res1 = get_ocr_text_and_coordinate_in_google_format(img0[yminss:ymaxss, xminss:choice_boxs[ij][2]], ocr_accuracy=OCR_ACCURACY,language_type='CHN_ENG')
-                for i in range(len(res1['words'])):
-                    if res1['words'][i].find('分') != -1:
-                        type_score_dict_ocrs['words'] = res1['words'][i]
-                    else:
-                        continue
-            if type_score_dict_ocrs != {}:
-                new_test = key_words(type_score_dict_ocrs, type_score_flag)
-            if new_test != {} and new_test['volume_structure'] != -1 and (
-                    int(new_test['volume_structure'][0]['volume_total_score']) > 4 or int(
-                    new_test['volume_structure'][0]['volume_score']) > 4):  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
-                if int(new_test['volume_structure'][0]['volume_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
-                    new_test['volume_structure'][0]['volume_total_score'] = int(
-                        new_test['volume_structure'][0]['volume_total_score']) % 100
-                new_test['volume_structure'][0]['bounding_box'] = choice_boxs[ij]
-                new_test['volume_structure'][0]['label'] = 'choice'
-                all_test.append(new_test)
-            elif new_test != {} and new_test['volume_structure'] == -1 and new_test['Score_structure'] != -1 and (
-                    int(new_test['Score_structure'][0]['item_total_score']) > 4 or int(
-                    new_test['Score_structure'][0]['item_score']) > 4):  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
-                if int(new_test['Score_structure'][0]['item_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
-                    new_test['Score_structure'][0]['item_total_score'] = int(
-                        new_test['Score_structure'][0]['item_total_score']) % 100
-                new_test['Score_structure'][0]['bounding_box'] = choice_boxs[ij]
-                new_test['Score_structure'][0]['label'] = 'choice'
-                all_test.append(new_test)
+            try:
+                res1 = get_ocr_text_and_coordinate_in_google_format(img0[yminss:choice_boxs[ij][3], xminss:choice_boxs[ij][2]], ocr_accuracy=OCR_ACCURACY,language_type='CHN_ENG')
+                aa = []
+                type_score_dict_ocrs = {}
+                for ii in range(len(res1['coordinates'])):
+                    xmin11 = res1['coordinates'][ii][0] + choice_boxs[ij][0]
+                    ymin11 = res1['coordinates'][ii][1] + choice_boxs[ij][1]
+                    xmax11 = res1['coordinates'][ii][2] + choice_boxs[ij][0]
+                    ymax11 = res1['coordinates'][ii][3] + choice_boxs[ij][1]
+                    aaa = (xmin11, ymin11, xmax11, ymax11)
+                    aa.append(aaa)
+                res1['coordinates'] = aa
+                new_test = {}
+                if len(res1['words']) > 0:
+                    type_score_dict_ocrs['words'] = res1['words'][0]
+                    new_test = key_words(type_score_dict_ocrs)
+                    if new_test == {} or new_test['Score_structure'] == -1:
+                        if len(res1['words']) > 1:
+                            type_score_dict_ocrs['words'] = res1['words'][1]
+                            new_test = key_words(type_score_dict_ocrs)
+                            if new_test == {} or new_test['Score_structure'] == -1:
+                                if len(res1['words']) > 2:
+                                    type_score_dict_ocrs['words'] = res1['words'][2]
+                                    new_test = key_words(type_score_dict_ocrs)
+                                if new_test == {} or new_test['Score_structure'] == -1:
+                                    if len(res1['words']) > 3:
+                                        type_score_dict_ocrs['words'] = res1['words'][3]
+                                        new_test = key_words(type_score_dict_ocrs)
+                                    if new_test == {} or new_test['Score_structure'] == -1:
+                                        if len(res1['words']) > 4:
+                                            type_score_dict_ocrs['words'] = res1['words'][4]
+                                            new_test = key_words(type_score_dict_ocrs)
+                if new_test != {} and new_test['volume_structure'] != -1 and (
+                        int(new_test['volume_structure'][0]['volume_total_score']) > 4 or int(
+                        new_test['volume_structure'][0]['volume_score']) > 4):  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
+                    if int(new_test['volume_structure'][0]['volume_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
+                        new_test['volume_structure'][0]['volume_total_score'] = int(
+                            new_test['volume_structure'][0]['volume_total_score']) % 100
+                    new_test['volume_structure'][0]['bounding_box'] = choice_boxs[ij]
+                    new_test['volume_structure'][0]['label'] = 'choice'
+                    all_test.append(new_test)
+                elif new_test != {} and new_test['volume_structure'] == -1 and new_test['Score_structure'] != -1 and (
+                        int(new_test['Score_structure'][0]['item_total_score']) > 4 or int(
+                        new_test['Score_structure'][0]['item_score']) > 4):  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
+                    if int(new_test['Score_structure'][0]['item_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
+                        new_test['Score_structure'][0]['item_total_score'] = int(
+                            new_test['Score_structure'][0]['item_total_score']) % 100
+                    new_test['Score_structure'][0]['bounding_box'] = choice_boxs[ij]
+                    new_test['Score_structure'][0]['label'] = 'choice'
+                    all_test.append(new_test)
+            except Exception:
+                print('choice_boxs_score_NULL_or_error')
     if cloze_boxs != []:
-        type_score_flag = 0
         for ij in range(len(cloze_boxs)):
             if cloze_boxs[ij][1] - 100 > 0:
                 yminss = cloze_boxs[ij][1] - 100
@@ -508,115 +514,107 @@ def get_sheet_number_total(answer_sheet, res, img0):
                 xminss = cloze_boxs[ij][0] - 100
             else:
                 xminss = cloze_boxs[ij][0]
-            type_score_dict_ocrs = {}
-            new_test = {}
-
-            try:  # tr_OCR
-                print('tr_OCR')
-                image_choice = image_src.crop((xminss, yminss, cloze_boxs[ij][2], cloze_boxs[ij][3]))
-                res1 = tr.run(image_choice)
-                for i in range(len(res1)):
-                    if res1[i][1].find('分') != -1:
-                        type_score_dict_ocrs['words'] = res1[i][1]
-                    else:
-                        continue
-            except Exception as e:  # baidu_OCR
-                print('baidu_OCR')
-                res1 = get_ocr_text_and_coordinate_in_google_format(
-                    img0[yminss:cloze_boxs[ij][3], xminss:cloze_boxs[ij][2]], ocr_accuracy=OCR_ACCURACY,
-                    language_type='CHN_ENG')
-                for i in range(len(res1['words'])):
-                    if res1['words'][i].find('分') != -1:
-                        type_score_dict_ocrs['words'] = res1['words'][i]
-                    else:
-                        continue
-            if type_score_dict_ocrs != {}:
-                new_test = key_words(type_score_dict_ocrs, type_score_flag)
-            if new_test != {} and new_test['volume_structure'] != -1 and (int(new_test['volume_structure'][0]['volume_total_score']) > 4 or int(new_test['volume_structure'][0]['volume_score']) > 4):  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
-                if int(new_test['volume_structure'][0]['volume_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
-                    new_test['volume_structure'][0]['volume_total_score'] = int(
-                        new_test['volume_structure'][0]['volume_total_score']) % 100
-                new_test['volume_structure'][0]['bounding_box'] = cloze_boxs[ij]
-                new_test['volume_structure'][0]['label'] = 'cloze'
-                all_test.append(new_test)
-            elif new_test != {} and new_test['volume_structure'] == -1 and new_test['Score_structure'] != -1 and (int(new_test['Score_structure'][0]['item_total_score']) > 4 or int(new_test['Score_structure'][0]['item_score']) > 4):  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
-                if int(new_test['Score_structure'][0]['item_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
-                    new_test['Score_structure'][0]['item_total_score'] = int(
-                        new_test['Score_structure'][0]['item_total_score']) % 100
-                new_test['Score_structure'][0]['bounding_box'] = cloze_boxs[ij]
-                new_test['Score_structure'][0]['label'] = 'cloze'
-                all_test.append(new_test)
+            try:
+                res1 = get_ocr_text_and_coordinate_in_google_format(img0[yminss:cloze_boxs[ij][3], xminss:cloze_boxs[ij][2]], ocr_accuracy=OCR_ACCURACY,language_type='CHN_ENG')
+                aa = []
+                type_score_dict_ocrs = {}
+                for ii in range(len(res1['coordinates'])):
+                    xmin11 = res1['coordinates'][ii][0] + cloze_boxs[ij][0]
+                    ymin11 = res1['coordinates'][ii][1] + cloze_boxs[ij][1]
+                    xmax11 = res1['coordinates'][ii][2] + cloze_boxs[ij][0]
+                    ymax11 = res1['coordinates'][ii][3] + cloze_boxs[ij][1]
+                    aaa = (xmin11, ymin11, xmax11, ymax11)
+                    aa.append(aaa)
+                res1['coordinates'] = aa
+                new_test = {}
+                if len(res1['words']) > 0:
+                    type_score_dict_ocrs['words'] = res1['words'][0]
+                    new_test = key_words(type_score_dict_ocrs)
+                    if new_test == {} or new_test['Score_structure'] == -1:
+                        if len(res1['words']) > 1:
+                            type_score_dict_ocrs['words'] = res1['words'][1]
+                            new_test = key_words(type_score_dict_ocrs)
+                            if new_test == {} or new_test['Score_structure'] == -1:
+                                if len(res1['words']) > 2:
+                                    type_score_dict_ocrs['words'] = res1['words'][2]
+                                    new_test = key_words(type_score_dict_ocrs)
+                                if new_test == {} or new_test['Score_structure'] == -1:
+                                    if len(res1['words']) > 3:
+                                        type_score_dict_ocrs['words'] = res1['words'][3]
+                                        new_test = key_words(type_score_dict_ocrs)
+                                    if new_test == {} or new_test['Score_structure'] == -1:
+                                        if len(res1['words']) > 4:
+                                            type_score_dict_ocrs['words'] = res1['words'][4]
+                                            new_test = key_words(type_score_dict_ocrs)
+                if new_test != {} and new_test['volume_structure'] != -1 and (int(new_test['volume_structure'][0]['volume_total_score']) > 4 or int(new_test['volume_structure'][0]['volume_score']) > 4):  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
+                    if int(new_test['volume_structure'][0]['volume_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
+                        new_test['volume_structure'][0]['volume_total_score'] = int(
+                            new_test['volume_structure'][0]['volume_total_score']) % 100
+                    new_test['volume_structure'][0]['bounding_box'] = cloze_boxs[ij]
+                    new_test['volume_structure'][0]['label'] = 'cloze'
+                    all_test.append(new_test)
+                elif new_test != {} and new_test['volume_structure'] == -1 and new_test['Score_structure'] != -1 and (int(new_test['Score_structure'][0]['item_total_score']) > 4 or int(new_test['Score_structure'][0]['item_score']) > 4):  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
+                    if int(new_test['Score_structure'][0]['item_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
+                        new_test['Score_structure'][0]['item_total_score'] = int(
+                            new_test['Score_structure'][0]['item_total_score']) % 100
+                    new_test['Score_structure'][0]['bounding_box'] = cloze_boxs[ij]
+                    new_test['Score_structure'][0]['label'] = 'cloze'
+                    all_test.append(new_test)
+            except Exception:
+                print('cloze_boxs_score_NULL_or_error')
     if solve_boxs != []:
-        type_score_flag = 0
         for ij in range(len(solve_boxs)):
-            xminss = solve_boxs[ij][0]
             yminss = solve_boxs[ij][1]
-            if solve_boxs[ij][2] - xminss > 1000:
-                xmaxss = xminss + 1000
-            else:
-                xmaxss = solve_boxs[ij][2]
-            if yminss + 500 > img_h:
-                ymaxss = yminss + 500
-            else:
-                ymaxss = solve_boxs[ij][3]
-            type_score_dict_ocrs = {}
-            new_test = {}
-
-            try:  # tr_OCR
-                print('tr_OCR')
-                image_choice = image_src.crop((xminss, yminss, xmaxss, ymaxss))
-                res1 = tr.run(image_choice)
-                for i in range(len(res1)):
-                    if res1[i][1].find('分') != -1:
-                        type_score_dict_ocrs['words'] = res1[i][1]
-                    elif i == len(res1)-1:
-                        for ii in range(len(res1)):
-                            if res1[ii][1].find('题') != -1 or res1[ii][1].find('.') != -1 or res1[ii][1].find('、') != -1:
-                                type_score_dict_ocrs['words'] = res1[ii][1]
-                            else:
-                                continue
-                    else:
-                        continue
-            except Exception as e:  # baidu_OCR
-                print('baidu_OCR')
-                res1 = get_ocr_text_and_coordinate_in_google_format(
-                    img0[yminss:ymaxss, xminss:xmaxss], ocr_accuracy=OCR_ACCURACY,
-                    language_type='CHN_ENG')
-                for i in range(len(res1['words'])):
-                    if res1['words'][i].find('分') != -1:
-                        type_score_dict_ocrs['words'] = res1['words'][i]
-                    elif i == len(res1['words'])-1:
-                        for ii in range(len(res1['words'])):
-                            if res1['words'][ii].find('题') != -1 or res1['words'][ii][1].find('.') != -1 or res1['words'][ii].find('、') != -1:
-                                type_score_dict_ocrs['words'] = res1['words'][ii]
-                            else:
-                                continue
-                    else:
-                        continue
-            if type_score_dict_ocrs != {}:
-                new_test = key_words(type_score_dict_ocrs, type_score_flag)
-            if new_test != {} and new_test['volume_structure'] != -1 and int(new_test['volume_structure'][0][
-                                                                                 'volume_total_score']) > 5:  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
-                if int(new_test['volume_structure'][0][
-                           'volume_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
-                    new_test['volume_structure'][0]['volume_total_score'] = int(
-                        new_test['volume_structure'][0]['volume_total_score']) % 100
-                new_test['volume_structure'][0]['bounding_box'] = solve_boxs[ij]
-                new_test['volume_structure'][0]['label'] = 'solve'
-                all_test.append(new_test)
-            elif new_test != {} and new_test['volume_structure'] == -1 and new_test[
-                'Score_structure'] != -1 and (
-                    int(new_test['Score_structure'][0]['item_total_score']) > 5 or int(
-                    new_test['Score_structure'][0][
-                        'item_total_score']) == -1):  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
-                if int(new_test['Score_structure'][0]['item_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
-                    new_test['Score_structure'][0]['item_total_score'] = int(
-                        new_test['Score_structure'][0]['item_total_score']) % 100
-                new_test['Score_structure'][0]['bounding_box'] = solve_boxs[ij]
-                new_test['Score_structure'][0]['label'] = 'solve'
-                all_test.append(new_test)
+            xminss = solve_boxs[ij][0]
+            try:
+                res1 = get_ocr_text_and_coordinate_in_google_format(img0[yminss:solve_boxs[ij][3], xminss:solve_boxs[ij][2]], ocr_accuracy=OCR_ACCURACY,language_type='CHN_ENG')
+                aa = []
+                type_score_dict_ocrs = {}
+                for ii in range(len(res1['coordinates'])):
+                    xmin11 = res1['coordinates'][ii][0] + solve_boxs[ij][0]
+                    ymin11 = res1['coordinates'][ii][1] + solve_boxs[ij][1]
+                    xmax11 = res1['coordinates'][ii][2] + solve_boxs[ij][0]
+                    ymax11 = res1['coordinates'][ii][3] + solve_boxs[ij][1]
+                    aaa = (xmin11, ymin11, xmax11, ymax11)
+                    aa.append(aaa)
+                res1['coordinates'] = aa
+                new_test = {}
+                if len(res1['words']) > 0:
+                    type_score_dict_ocrs['words'] = res1['words'][0]
+                    new_test = key_words(type_score_dict_ocrs)
+                    if new_test == {} or new_test['Score_structure'] == -1:
+                        if len(res1['words']) > 1:
+                            type_score_dict_ocrs['words'] = res1['words'][1]
+                            new_test = key_words(type_score_dict_ocrs)
+                            if new_test == {} or new_test['Score_structure'] == -1:
+                                if len(res1['words']) > 2:
+                                    type_score_dict_ocrs['words'] = res1['words'][2]
+                                    new_test = key_words(type_score_dict_ocrs)
+                                if new_test == {} or new_test['Score_structure'] == -1:
+                                    if len(res1['words']) > 3:
+                                        type_score_dict_ocrs['words'] = res1['words'][3]
+                                        new_test = key_words(type_score_dict_ocrs)
+                                    if new_test == {} or new_test['Score_structure'] == -1:
+                                        if len(res1['words']) > 4:
+                                            type_score_dict_ocrs['words'] = res1['words'][4]
+                                            new_test = key_words(type_score_dict_ocrs)
+                if new_test != {} and new_test['volume_structure'] != -1 and int(new_test['volume_structure'][0]['volume_total_score']) > 5:  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
+                    if int(new_test['volume_structure'][0]['volume_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
+                        new_test['volume_structure'][0]['volume_total_score'] = int(new_test['volume_structure'][0]['volume_total_score']) % 100
+                    new_test['volume_structure'][0]['bounding_box'] = solve_boxs[ij]
+                    new_test['volume_structure'][0]['label'] = 'solve'
+                    all_test.append(new_test)
+                elif new_test != {} and new_test['volume_structure'] == -1 and new_test[
+                    'Score_structure'] != -1 and (
+                        int(new_test['Score_structure'][0]['item_total_score']) > 5 or int(new_test['Score_structure'][0]['item_total_score']) == -1):  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
+                    if int(new_test['Score_structure'][0]['item_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
+                        new_test['Score_structure'][0]['item_total_score'] = int(new_test['Score_structure'][0]['item_total_score']) % 100
+                    new_test['Score_structure'][0]['bounding_box'] = solve_boxs[ij]
+                    new_test['Score_structure'][0]['label'] = 'solve'
+                    all_test.append(new_test)
+            except Exception:
+                print('solve_boxs_score_NULL_or_error')
     if composition_boxs != []:
-        type_score_flag = 0
         for ij in range(len(composition_boxs)):
             if composition_boxs[ij][1] - 250 > 0:
                 yminss = composition_boxs[ij][1] - 250
@@ -626,55 +624,57 @@ def get_sheet_number_total(answer_sheet, res, img0):
                 xminss = composition_boxs[ij][0] - 100
             else:
                 xminss = composition_boxs[ij][0]
-            type_score_dict_ocrs = {}
-
-            try:  # tr_OCR
-                print('tr_OCR')
-                image_choice = image_src.crop((xminss, yminss, composition_boxs[ij][2], composition_boxs[ij][3]))
-                res1 = tr.run(image_choice)
-                for i in range(len(res1)):
-                    if res1[i][1].find('分') != -1:
-                        type_score_dict_ocrs['words'] = res1[i][1]
-                    elif i == len(res1):
-                        for ii in range(len(res1)):
-                            if res1[i][1].find('题') != -1 or res1[i][1].find('.') != -1 or res1[i][1].find('、') != -1:
-                                type_score_dict_ocrs['words'] = res1[i][1]
-                            else:
-                                continue
-            except Exception as e:  # baidu_OCR
-                print('baidu_OCR')
-                res1 = get_ocr_text_and_coordinate_in_google_format(
-                    img0[yminss:composition_boxs[ij][3], xminss:composition_boxs[ij][2]], ocr_accuracy=OCR_ACCURACY,
-                    language_type='CHN_ENG')
-                for i in range(len(res1['words'])):
-                    if res1['words'][i].find('分') != -1:
-                        type_score_dict_ocrs['words'] = res1['words'][i]
-                    elif i == len(res1):
-                        for ii in range(len(res1['words'])):
-                            if res1['words'][i].find('题') != -1 or res1['words'][i][1].find('.') != -1 or res1['words'][
-                                i].find('、') != -1:
-                                type_score_dict_ocrs['words'] = res1['words'][i]
-                            else:
-                                continue
-            if type_score_dict_ocrs != {}:
-                new_test = key_words(type_score_dict_ocrs, type_score_flag)
-            if new_test != {} and new_test['volume_structure'] != -1 and int(
-                    new_test['volume_structure'][0]['volume_total_score']) > 4:  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
-                if int(new_test['volume_structure'][0]['volume_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
-                    new_test['volume_structure'][0]['volume_total_score'] = int(
-                        new_test['volume_structure'][0]['volume_total_score']) % 100
-                new_test['volume_structure'][0]['bounding_box'] = composition_boxs[ij]
-                new_test['volume_structure'][0]['label'] = 'composition'
-                all_test.append(new_test)
-            elif new_test != {} and new_test['volume_structure'] == -1 and new_test[
-                'Score_structure'] != -1 and int(
-                new_test['Score_structure'][0]['item_total_score']) > 4:  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
-                if int(new_test['Score_structure'][0]['item_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
-                    new_test['Score_structure'][0]['item_total_score'] = int(
-                        new_test['Score_structure'][0]['item_total_score']) % 100
-                new_test['Score_structure'][0]['bounding_box'] = composition_boxs[ij]
-                new_test['Score_structure'][0]['label'] = 'composition'
-                all_test.append(new_test)
+            try:
+                res1 = get_ocr_text_and_coordinate_in_google_format(img0[yminss:composition_boxs[ij][3], xminss:composition_boxs[ij][2]], ocr_accuracy=OCR_ACCURACY,language_type='CHN_ENG')
+                aa = []
+                type_score_dict_ocrs = {}
+                for ii in range(len(res1['coordinates'])):
+                    xmin11 = res1['coordinates'][ii][0] + composition_boxs[ij][0]
+                    ymin11 = res1['coordinates'][ii][1] + composition_boxs[ij][1]
+                    xmax11 = res1['coordinates'][ii][2] + composition_boxs[ij][0]
+                    ymax11 = res1['coordinates'][ii][3] + composition_boxs[ij][1]
+                    aaa = (xmin11, ymin11, xmax11, ymax11)
+                    aa.append(aaa)
+                res1['coordinates'] = aa
+                new_test = {}
+                if len(res1['words']) > 0:
+                    type_score_dict_ocrs['words'] = res1['words'][0]
+                    new_test = key_words(type_score_dict_ocrs)
+                    if new_test == {} or new_test['Score_structure'] == -1:
+                        if len(res1['words']) > 1:
+                            type_score_dict_ocrs['words'] = res1['words'][1]
+                            new_test = key_words(type_score_dict_ocrs)
+                            if new_test == {} or new_test['Score_structure'] == -1:
+                                if len(res1['words']) > 2:
+                                    type_score_dict_ocrs['words'] = res1['words'][2]
+                                    new_test = key_words(type_score_dict_ocrs)
+                                if new_test == {} or new_test['Score_structure'] == -1:
+                                    if len(res1['words']) > 3:
+                                        type_score_dict_ocrs['words'] = res1['words'][3]
+                                        new_test = key_words(type_score_dict_ocrs)
+                                    if new_test == {} or new_test['Score_structure'] == -1:
+                                        if len(res1['words']) > 4:
+                                            type_score_dict_ocrs['words'] = res1['words'][4]
+                                            new_test = key_words(type_score_dict_ocrs)
+                if new_test != {} and new_test['volume_structure'] != -1 and int(
+                        new_test['volume_structure'][0]['volume_total_score']) > 4:  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
+                    if int(new_test['volume_structure'][0]['volume_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
+                        new_test['volume_structure'][0]['volume_total_score'] = int(
+                            new_test['volume_structure'][0]['volume_total_score']) % 100
+                    new_test['volume_structure'][0]['bounding_box'] = composition_boxs[ij]
+                    new_test['volume_structure'][0]['label'] = 'composition'
+                    all_test.append(new_test)
+                elif new_test != {} and new_test['volume_structure'] == -1 and new_test[
+                    'Score_structure'] != -1 and int(
+                    new_test['Score_structure'][0]['item_total_score']) > 4:  # 如果识别到分数,添加到输出信息;如果还没有识别到分数,默认没有分数
+                    if int(new_test['Score_structure'][0]['item_total_score']) > 200:  # 暂定试卷分数都在200以内,超过200的表示识别错误
+                        new_test['Score_structure'][0]['item_total_score'] = int(
+                            new_test['Score_structure'][0]['item_total_score']) % 100
+                    new_test['Score_structure'][0]['bounding_box'] = composition_boxs[ij]
+                    new_test['Score_structure'][0]['label'] = 'composition'
+                    all_test.append(new_test)
+            except Exception:
+                print('composition_boxs_score_NULL_or_error')
     for aaa in range(len(all_test)):
         if all_test[aaa]['Score_structure'] != -1 and all_test[aaa]['volume_structure'] == -1:
             score_last_one = {'model_box': dict(all_test[aaa])['Score_structure'][0]['bounding_box'],
@@ -793,17 +793,20 @@ def get_sheet_number_total(answer_sheet, res, img0):
                                 count_choice_m = count_choice_m + len(answer_sheet['regions'][j]['number'])
                                 j_temp.append(j)
                         if j == len(answer_sheet['regions']) - 1 and j_temp !=[]:
-                            for index, jj in enumerate(j_temp):
-                                num_score_m = round(float(Score_last[i]['score'] / count_choice_m),1)
-                                answer_sheet['regions'][jj]['default_points'] = len(answer_sheet['regions'][jj]['number']) * [num_score_m]
-                            break
-
+                            try:
+                                for index, jj in enumerate(j_temp):
+                                    num_score_m_infer = round(float(Score_last[i]['score'] / count_choice_m), 2)
+                                    num_score_m = [str(num_score_m_infer), int(num_score_m_infer)][int(num_score_m_infer) == num_score_m_infer]
+                                    answer_sheet['regions'][jj]['default_points'] = len(answer_sheet['regions'][jj]['number']) * [num_score_m]
+                                break
+                            except Exception:
+                                pass
                 elif Score_last[i]['label'] == 'cloze':
-                    count_cloze_s = 0
                     for j in range(len(answer_sheet['regions'])):
                         if answer_sheet['regions'][j]['class_name'] == 'cloze_s':
                             if Score_last[i]['number_score'] != -1:
                                 answer_sheet['regions'][j]['default_points'] = Score_last[i]['number_score']
+
         elif num_choice > 1 or num_cloze >1:
             for i in range(len(Score_last)):
                 if Score_last[i]['label'] == 'choice':
@@ -825,12 +828,16 @@ def get_sheet_number_total(answer_sheet, res, img0):
                                     count_choice_m = count_choice_m + len(answer_sheet['regions'][j]['number'])
                                     j_temp.append(j)
                         if j == len(answer_sheet['regions']) - 1 and j_temp !=[]:
-                            for index ,jj in enumerate(j_temp):
-                                num_score_m = round(float(Score_last[i]['score'] / count_choice_m),1)
-                                answer_sheet['regions'][jj]['default_points'] = len(answer_sheet['regions'][jj]['number']) * [num_score_m]
-                            break
+                            try:
+                                for index, jj in enumerate(j_temp):
+                                    num_score_m_infer = round(float(Score_last[i]['score'] / count_choice_m), 2)
+                                    num_score_m = [str(num_score_m_infer), int(num_score_m_infer)][int(num_score_m_infer) == num_score_m_infer]
+                                    answer_sheet['regions'][jj]['default_points'] = len(answer_sheet['regions'][jj]['number']) * [num_score_m]
+                                break
+                            except Exception:
+                                pass
+
                 elif Score_last[i]['label'] == 'cloze':
-                    count_cloze_s = 0
                     for j in range(len(answer_sheet['regions'])):
                         if answer_sheet['regions'][j]['class_name'] == 'cloze_s':
                             xmin_dis = answer_sheet['regions'][j]['bounding_box']['xmin'] - \
@@ -844,6 +851,7 @@ def get_sheet_number_total(answer_sheet, res, img0):
                             if xmin_dis > -30 and ymin_dis > -30 and xmax_dis < 30 and ymax_dis < 30:
                                 if Score_last[i]['number_score'] != -1 :
                                     answer_sheet['regions'][j]['default_points'] = Score_last[i]['number_score']
+
         elif choice_m_boxs !=[]:
             x_choice_m_min = 10000
             y_choice_m_min = 10000
@@ -870,63 +878,26 @@ def get_sheet_number_total(answer_sheet, res, img0):
                                   'bounding_box': choice_m_boxs,
                                   'label': 'choice_m',
                                   'type_box': type_score_choice_m}
-                type_score_flag = 0
-                type_score_dict_ocrs = {}
-                try:  # tr_OCR
-                    print('tr_OCR')
-                    image_choice = image_src.crop((type_score_boxs[0][0], type_score_boxs[0][1], type_score_boxs[0][2], type_score_boxs[0][3]))
-                    res1 = tr.run(image_choice)
-                    for i in range(len(res1)):
-                        if res1[i][1].find('分') != -1:
-                            type_score_dict_ocrs['words'] = res1[i][1]
-                        elif i == len(res1):
-                            for ii in range(len(res1)):
-                                if res1[i][1].find('题') != -1 or res1[i][1].find('.') != -1 or res1[i][1].find(
-                                        '、') != -1:
-                                    type_score_dict_ocrs['words'] = res1[i][1]
-                                else:
-                                    continue
-                except Exception as e:  # baidu_OCR
-                    print('baidu_OCR')
-                    res1 = get_ocr_text_and_coordinate_in_google_format(
-                        img0[type_score_boxs[0][1]:type_score_boxs[0][3], type_score_boxs[0][0]:type_score_boxs[0][2]], ocr_accuracy=OCR_ACCURACY,
-                        language_type='CHN_ENG')
-                    for i in range(len(res1['words'])):
-                        if res1['words'][i].find('分') != -1:
-                            type_score_dict_ocrs['words'] = res1['words'][i]
-                        elif i == len(res1):
-                            for ii in range(len(res1['words'])):
-                                if res1['words'][i].find('题') != -1 or res1['words'][i][1].find('.') != -1 or \
-                                        res1['words'][
-                                            i].find('、') != -1:
-                                    type_score_dict_ocrs['words'] = res1['words'][i]
-                                else:
-                                    continue
-                if type_score_dict_ocrs != {}:
-                    test = key_words(type_score_dict_ocrs, type_score_flag)
+                test_result1['words'] = str()
+                # try:  # tr_OCR
+                #     image_choice = image_src.crop((type_score_choice_m[0], type_score_choice_m[1], type_score_choice_m[2], type_score_choice_m[3]))
+                #     res1 = tr.run(image_choice)
+                #     print('tr_OCR')
+                #     for t in range(len(res1)):
+                #         test_result1['words'] = test_result1['words'] + res1[t][1]
+                # except Exception as e:  # baidu_OCR
+                #     print('baidu_OCR')
+                res1 = get_ocr_text_and_coordinate_in_google_format(
+                    img0[type_score_choice_m[1]:type_score_choice_m[3], type_score_choice_m[0]:type_score_choice_m[2]], ocr_accuracy=OCR_ACCURACY, language_type='CHN_ENG')
+                for t in range(len(res1['words'])):
+                    test_result1['words'] = test_result1['words'] + res1['words'][t]
+                if test_result1['words'] != {}:
+                    test = key_words(test_result1)
                 choice_m_score = -1
                 if test == {}:
-                    ### 添加返回值OCR结果
-                    add_ocr = {}
-                    add_ocr['model_box'] = test_result1['bounding_box']
-                    add_ocr['label'] = test_result1['label']
-                    add_ocr['number'] = -1
-                    add_ocr['score'] = -1
-                    add_ocr['number_score'] = -1
-                    add_ocr['counts'] = -1
-                    add_ocr['ocr'] = test_result1['words']
-                    Score_last.append(add_ocr)
+                    choice_m_type_score_ocr = test_result1['words']
                 elif test['volume_structure'] == -1 and test['Score_structure'] == -1:
-                    ### 添加返回值OCR结果
-                    add_ocr = {}
-                    add_ocr['model_box'] = test_result1['bounding_box']
-                    add_ocr['label'] = test_result1['label']
-                    add_ocr['number'] = -1
-                    add_ocr['score'] = -1
-                    add_ocr['number_score'] = -1
-                    add_ocr['counts'] = -1
-                    add_ocr['ocr'] = test_result1['words']
-                    Score_last.append(add_ocr)
+                    choice_m_type_score_ocr = test_result1['words']
                 else:
                     if test['volume_structure'] != -1 and test['volume_structure'][0]['volume_score'] != -1:
                         choice_m_score = test['volume_structure'][0]['volume_score']
@@ -937,8 +908,14 @@ def get_sheet_number_total(answer_sheet, res, img0):
                         if answer_sheet['regions'][j]['class_name'] == 'choice_m':
                             answer_sheet['regions'][j]['default_points'] = len(
                                 answer_sheet['regions'][j]['number']) * [float(choice_m_score)]
+                elif test_result1['words'] != {}:
+                    for j in range(len(answer_sheet['regions'])):
+                        if answer_sheet['regions'][j]['class_name'] == 'choice_m':
+                            answer_sheet['regions'][j]['type_score_ocr'] = choice_m_type_score_ocr
+
 
         '''分数与模型对应'''
+        ocr_flag = 0
         for i in range(len(answer_sheet['regions'])):
             for j in range(len(Score_last)):
                 if (Score_last[j]['model_box'][0] == answer_sheet['regions'][i]['bounding_box']['xmin']
@@ -959,8 +936,11 @@ def get_sheet_number_total(answer_sheet, res, img0):
                                 answer_sheet['regions'][i]['class_name'] == 'solve' or answer_sheet['regions'][i][
                             'class_name'] == 'solve0'):
                             answer_sheet['regions'][i]['class_name'] = 'optional_solve'
+                        ocr_flag = 1
+                        if 'type_score_ocr' in answer_sheet['regions'][i].keys():
+                            del answer_sheet['regions'][i]['type_score_ocr']
                         # answer_sheet['regions'][i]['number_score'] = Score_last[j]['number_score']  # 小题分数
                         # answer_sheet['regions'][i]['counts'] = Score_last[j]['counts']  # 小题个数
-                    if 'ocr' in Score_last[j]:  # 没有识别到分数的模块添加type_score_ocr结果
+                    if ocr_flag == 0 and 'ocr' in Score_last[j]:  # 没有识别到分数的模块添加type_score_ocr结果
                         answer_sheet['regions'][i]['type_score_ocr'] = Score_last[j]['ocr']
     return answer_sheet

+ 82 - 0
segment/sheet_resolve/analysis/sheet/tag_parse.py

@@ -0,0 +1,82 @@
+
+# -*- coding:utf-8 -*-
+import CRFPP
+import re
+
+
+class TagParse:
+    """Label a sentence character by character with a CRF++ tagger.
+
+    The wrapped tagger is cleared and reused on every call.
+    """
+
+    def __init__(self, tagger):
+        self.tagger = tagger
+
+    def get_type(self, j):
+        """Return the coarse feature of one character: 'num', 'cn', 'ch' or 'b'."""
+        if re.match(r'\d', j):
+            return 'num'
+        elif j in '一二三四五六七八九十':
+            return 'cn'
+        elif re.match(u'[\u4e00-\u9fa5]', j):
+            return 'ch'
+        else:
+            return 'b'
+
+    def sequence_init(self, sequence):
+        """Feed each non-blank character (plus its type feature) to the tagger and parse."""
+        self.tagger.clear()
+        for word in sequence.strip():
+            word = word.strip()
+            if word:
+                word = word + '\t' + self.get_type(word)
+                self.tagger.add(word + "\t")
+        self.tagger.parse()
+
+    def get_val(self, sequence):
+        """
+        Get the predicted tag of every character in a sentence.
+        :param sequence: one sentence, str
+        :return: x_res: the characters given to the tagger, list; y_res: the predicted tag of each character, list
+        """
+        self.sequence_init(sequence)
+        size = self.tagger.size()
+
+        x_res = []
+        y_res = []
+        for i in range(0, size):
+            x_res.append(self.tagger.x(i, 0))
+            y_res.append(self.tagger.y2(i))
+        return x_res, y_res
+
+    def get_tag_val(self, sequence):
+        """
+        Split a sentence into runs of consecutive characters that share a tag.
+        :param sequence: one sentence, str
+        :return: [["xxx1", "tag1"], ["xxx2", "tag2"], ...]; [] when no character was tagged
+        """
+        char_list, tag_list = self.get_val(sequence)
+        if not tag_list:
+            # Empty or whitespace-only input: tag_list[0] below would raise IndexError.
+            return []
+
+        # A new run starts wherever the tag differs from the previous character's tag.
+        starts = [0] + [i for i in range(1, len(tag_list)) if tag_list[i] != tag_list[i - 1]]
+        # Blank characters are dropped before tagging, so the last run ends at
+        # len(char_list), not len(sequence).
+        ends = starts[1:] + [len(char_list)]
+        res = [[''.join(char_list[i:j]), tag_list[i]] for i, j in zip(starts, ends)]
+        return res
+
+
+if __name__ == '__main__':  # manual smoke test; needs the CRFPP binding and a trained model file
+    taggers = CRFPP.Tagger("-m " + './model/crf2.model')  # model path is relative to the current working directory
+    tb = TagParse(taggers)
+    # print(tb.get_tag_val('二、非选择题(共160分。36-42为必考题,43-47为选考题。)'))
+    # print(tb.get_tag_val('一、选择题〈1-20每题1.5分,21-30每题2分,共50分'))
+    print(tb.get_tag_val('任务型阅读。(共5小题,每小题2分,计10分)阅读短文,并按照要求完成66~70题'))  # prints the [text, tag] runs for one sample heading
+    # print(tb.get_tag_val('一、选择题(共25小题,每小题2分,共50分'))
+    # print(tb.get_tag_val('27.(按要求给分,共12分)'))
+    # print(tb.get_tag_val('53(10分)'))
+    # print(tb.get_tag_val('一、选择题(每小题4分,共计40分。1至6小题单选,7至10小题多选,全对得4分,选对但不全得2分,不选或选错不得分)'))

+ 10 - 2
segment/sheet_server.py

@@ -26,6 +26,8 @@ from segment.sheet_resolve.analysis.sheet.sheet_points import get_sheet_points
 from segment.sheet_resolve.analysis.sheet.sheet_points_total import get_sheet_number_total
 from segment.sheet_resolve.tools import utils
 from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate, change_format_baidu_to_google
+from segment.sheet_resolve.analysis.sheet.sheet_points_by_nlp import get_sheet_points_by_nlp
+
 
 logger = logging.getLogger(settings.LOGGING_TYPE)
 
@@ -74,7 +76,7 @@ def convert_pil_to_jpeg(raw_img):
         img.paste(raw_img, mask=raw_img.split()[3])  # 3 is the alpha channel
     else:
         img = raw_img
-    open_cv_image = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR)
+    open_cv_image = np.array(img)
     return img, open_cv_image
 
 
@@ -351,7 +353,13 @@ def sheet_points(sheet_dict_list, image_list, ocr_list, if_ocr=False):
         for index, ele in enumerate(sheet_dict_list):
             ocr_res = change_format_baidu_to_google(ocr_list[index])
             sheet_dict = get_sheet_number_total(ele, ocr_res, image_list[index])
-            sheet_total_list.append(sheet_dict)
+            regions_list = sheet_dict['regions']
+            type_score_ocr = [ele for ele in regions_list if 'type_score_ocr' in ele]
+            if len(type_score_ocr) == 0:
+                sheet_total_list.append(sheet_dict)
+            else:
+                sheet_dict0 = get_sheet_points_by_nlp(sheet_dict)
+                sheet_total_list.append(sheet_dict0)
     except Exception as e:
         traceback.print_exc()
         sheet_total_list = sheet_dict_list

Некоторые файлы не были показаны из-за большого количества измененных файлов