import re
import shutil
import glob
from pprint import pprint

import segment.ocr.luo_ocr.ocr as luo_ocr
# from pypinyin import lazy_pinyin
from segment.ocr.split_topic_en import topic_type_line


# Disabled helpers (commented out in the original file), kept for reference.
#
# def to_pinyin_camel(s):
#     '''Example input: a Chinese filename such as "文件123.txt".'''
#     py_ls = lazy_pinyin(s)
#     py_camel = [py.capitalize() for py in py_ls]
#     return "".join(py_camel)
#
#
# def rename_filename(filename):
#     "Convert the filename to pinyin."
#     filename_en = to_pinyin_camel(filename)
#     try:
#         shutil.copy(filename, filename_en)
#     except shutil.SameFileError:
#         pass
#     return filename_en


# def request_ocr(filename):
#     '''Chinese filenames cannot be uploaded, so rename to English first.'''
#     url = "http://117.50.17.141/ocr"
#     data = {}
#     filename = rename_filename(filename)
#     files = {"mydata": open(filename, "rb")}
#     r = requests.post(url, data, files=files)
#     print(filename)
#     print(r.json())
#     return r.json()['text']


# Subjects accepted by the is_topic_* predicates. A tuple (not a set) so that
# an unhashable ``subject`` still ends in ValueError rather than TypeError.
_SUPPORTED_SUBJECTS = (
    'math', 'english', 'chinese', 'physics', 'chemistry', 'biology',
)

# A line starts a topic when it begins with a number followed by one of the
# listed punctuation marks.
topic_start = re.compile(r"^\s*(\d+)\s*[\.、::,,]")
# Math only: a number followed by ")", optionally preceded by an opening
# bracket and optionally followed by punctuation.
topic_start2 = re.compile(r"^\s*[(<〈《]?(\d+)\)\s*[\.、::,,]?")


def is_topic_start(s: str, subject: str) -> bool:
    """Return True if the line ``s`` is a start node (first line of a topic).

    Args:
        s: One line of recognized text.
        subject: One of 'math', 'english', 'chinese', 'physics',
            'chemistry', 'biology'.

    Returns:
        True when ``s`` matches ``topic_start``, or, for 'math' only, when it
        matches ``topic_start2``; False otherwise.

    Raises:
        ValueError: If ``subject`` is not one of the supported subjects.
    """
    if subject not in _SUPPORTED_SUBJECTS:
        raise ValueError("subject={} is not supported!".format(subject))
    if topic_start.match(s):
        return True
    return subject == 'math' and topic_start2.match(s) is not None


# ------------- Lines matching the patterns below end a topic -------------
# "D" followed by punctuation anywhere in the line.
topic_end = re.compile(r"D\s*[\.、::]")
# English only: a line beginning with "G" followed by punctuation.
topic_end2 = re.compile(r"^\s*G\s*[\.、::]")


def is_topic_end(s: str, subject: str) -> bool:
    """Return True if the line ``s`` is an end node (last line of a topic).

    Args:
        s: One line of recognized text.
        subject: One of the supported subjects (see ``is_topic_start``).

    Returns:
        True when ``topic_end`` is found anywhere in ``s``, or, for 'english'
        only, when ``s`` matches ``topic_end2`` at its start; False otherwise.

    Raises:
        ValueError: If ``subject`` is not one of the supported subjects.
    """
    if subject not in _SUPPORTED_SUBJECTS:
        raise ValueError("subject={} is not supported!".format(subject))
    if topic_end.search(s):
        return True
    return subject == 'english' and topic_end2.match(s) is not None


# ------ Lines matching the conditions below are skipped (discarded) ------
# A line beginning with Chinese numerals followed by punctuation.
topic_filter = re.compile(r"^\s*[一二三四五六七八九十]+\s*[\.、::]")
# Substrings that mark a line as skippable for the non-English subjects.
general_filter = [
    '选择题', '单选题', '多选题', '填空题', '单空题', '多空题', '解答题',
    '简答题', '证明题', '选做题', '实验题', '第II卷', '第I卷', '第二卷',
]
# English: a line is skipped when it contains every word of any one entry.
english_filter = [
    ["听", "材料"], ["听", "对话"], ["听", "独白"], ['第一节'], ['第二节'],
    ['语言知识运用'], ['第II卷'], ['第二部分'], ['第三部分'], ['第四部分'],
]
# Chinese only: a single parenthesized Chinese numeral at the line start.
chinese_filter = re.compile(
    r"^\s*[((]\s*[一二三四五六七八九十]\s*[))]\s*[\.、::]?")


def contains_all(s, words):
    """Return True if ``s`` contains every item of at least one entry.

    Args:
        s: The text to search.
        words: An iterable of word groups; each group is an iterable of
            substrings.

    Returns:
        True if, for some group, all of its substrings occur in ``s``.
        An empty group counts as a match; an empty ``words`` gives False.
    """
    return any(all(w in s for w in word) for word in words)


def is_topic_skip(s: str, subject: str) -> bool:
    """Return True if the line ``s`` is a skip node that can be dropped.

    Args:
        s: One line of recognized text.
        subject: One of the supported subjects (see ``is_topic_start``).

    Returns:
        For 'english': whether ``s`` contains all words of any
        ``english_filter`` entry. For the other subjects: whether ``s``
        matches ``topic_filter``, matches ``chinese_filter`` (subject
        'chinese' only), or contains any ``general_filter`` substring.

    Raises:
        ValueError: If ``subject`` is not one of the supported subjects.
    """
    if subject == 'english':
        return contains_all(s, english_filter)
    if subject not in _SUPPORTED_SUBJECTS:
        raise ValueError("subject={} is not supported!".format(subject))
    if topic_filter.match(s):
        return True
    if subject == 'chinese' and chinese_filter.match(s):
        return True
    return any(topic_type in s for topic_type in general_filter)


# ----------------------action----------------------
def group_pictures(pictures, subject=''):
    """Run OCR on each picture and group consecutive lines into topics.

    Assumes ``pictures`` are row based, i.e. one picture per text row.

    Args:
        pictures: Iterable of items accepted by ``luo_ocr.ocr_py``.
            NOTE(review): ``ocr_py`` is assumed to return a ``str``; confirm.
        subject: Subject name passed to the ``is_topic_*`` predicates. The
            default ``''`` is not a supported subject, so any non-empty
            ``pictures`` raises ValueError unless a subject is given.

    Returns:
        A tuple ``(texts, groups)``. ``texts`` holds the recognized text of
        each picture with ``\\r`` and ``\\n`` removed. ``groups`` is a list of
        ``[begin, end]`` index pairs into ``texts`` with ``end`` exclusive.
        Skip lines belong to no group, and no empty group is emitted.

    Raises:
        ValueError: If ``subject`` is not supported (and ``pictures`` is
            non-empty).
    """
    # texts = []
    # for picture in pictures:
    #     t = luo_ocr.ocr_py(picture)
    #     t = t.replace("\r", "").replace("\n", "")
    #     # print(str(t))
    #     texts.append(str(t))
    #
    # texts = bd_ocr(pictures)
    # # print(texts)
    texts = [
        luo_ocr.ocr_py(picture).replace("\r", "").replace("\n", "")
        for picture in pictures
    ]

    groups = []
    start = 0
    for i, text in enumerate(texts):
        if is_topic_start(text, subject):
            # Close the pending group before the new topic begins. Guarded so
            # that a start line at the very beginning, or right after an
            # end/skip line, does not emit an empty [i, i] group.
            if i > start:
                groups.append([start, i])
            start = i
        elif is_topic_end(text, subject):
            # The end line itself belongs to the topic it closes.
            groups.append([start, i + 1])
            start = i + 1
        elif is_topic_skip(text, subject):
            # The skipped line is excluded from every group.
            if i > start:
                groups.append([start, i])
            start = i + 1

    # Flush whatever remains after the last boundary.
    if len(texts) > start:
        groups.append([start, len(texts)])
    return texts, groups