123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
- import re
- import shutil
- import glob
- from pprint import pprint
- import segment.ocr.luo_ocr.ocr as luo_ocr
- # from pypinyin import lazy_pinyin
- from segment.ocr.split_topic_en import topic_type_line
- # def to_pinyin_camel(s):
- # '''文件123.txt'''
- # py_ls = lazy_pinyin(s)
- # py_camel = [py.capitalize() for py in py_ls]
- # return "".join(py_camel)
- #
- #
- # def rename_filename(filename):
- # "将文件名转变为拼音"
- # filename_en = to_pinyin_camel(filename)
- # try:
- # shutil.copy(filename, filename_en)
- # except shutil.SameFileError:
- # pass
- # return filename_en
- # def request_ocr(filename):
- # '''中文无法上传需要修改成英文'''
- # url = "http://117.50.17.141/ocr"
- # data = {}
- # filename = rename_filename(filename)
- # files = {"mydata": open(filename, "rb")}
- # r = requests.post(url, data, files=files)
- # print(filename)
- # print(r.json())
- # return r.json()['text']
# A topic opens with an Arabic number followed by a separator, e.g. "12." or
# "3、".  Separators are accepted in both ASCII and full-width form.
topic_start = re.compile(r"^\s*(\d+)\s*[\.、:：,，]")
# Math-only variant: a number closed by ")" with an optional opening bracket
# and an optional trailing separator, e.g. "(1)", "1)" or "《2)、".
topic_start2 = re.compile(r"^\s*[(<〈《]?(\d+)\)\s*[\.、:：,，]?")


def is_topic_start(s, subject):
    """Return True when line ``s`` opens a new topic for ``subject``.

    Args:
        s: One line of recognized text.
        subject: One of 'math', 'english', 'chinese', 'physics',
            'chemistry' or 'biology'.

    Returns:
        True if ``s`` matches ``topic_start``; for 'math' a match of
        ``topic_start2`` also counts.  False otherwise.

    Raises:
        ValueError: If ``subject`` is not one of the supported subjects.
    """
    if subject not in ('math', 'english', 'chinese', 'physics', 'chemistry', 'biology'):
        raise ValueError("subject={} is not supported!".format(subject))
    if topic_start.match(s):
        return True
    # Only math papers number their items in the parenthesised style.
    return subject == 'math' and topic_start2.match(s) is not None
- # -------------------------符合下列条件的则为结束-------------------------
# Option "D" followed by a separator anywhere in the line closes a topic.
# The colon is accepted in both ASCII and full-width form.
topic_end = re.compile(r"D\s*[\.、:：]")
# English-only variant: a line that begins with option "G".
topic_end2 = re.compile(r"^\s*G\s*[\.、:：]")


def is_topic_end(s, subject):
    """Return True when line ``s`` is the last line of a topic for ``subject``.

    Args:
        s: One line of recognized text.
        subject: One of 'math', 'english', 'chinese', 'physics',
            'chemistry' or 'biology'.

    Returns:
        True if ``topic_end`` is found anywhere in ``s``; for 'english' a
        line matching ``topic_end2`` at its start also counts.  False
        otherwise.

    Raises:
        ValueError: If ``subject`` is not one of the supported subjects.
    """
    if subject not in ('math', 'english', 'chinese', 'physics', 'chemistry', 'biology'):
        raise ValueError("subject={} is not supported!".format(subject))
    # ``search`` (not ``match``): option D may share its line with A-C.
    if topic_end.search(s):
        return True
    return subject == 'english' and topic_end2.match(s) is not None
- # -------------------------符合下列条件的则为跳过舍去-------------------------
# Section heading numbered with Chinese numerals, e.g. "一、" or "二:".
# The colon is accepted in both ASCII and full-width form.
topic_filter = re.compile(r"^\s*[一二三四五六七八九十]+\s*[\.、:：]")
# Section-type keywords; a line containing any of them is a heading.
general_filter = ['选择题', '单选题', '多选题',
                  '填空题', '单空题', '多空题',
                  '解答题', '简答题', '证明题',
                  '选做题', '实验题', '第II卷',
                  '第I卷', '第二卷', ]
# English headings: each inner list is a group of fragments that must ALL
# occur in the line for the group to match.
english_filter = [["听", "材料"], ["听", "对话"], ["听", "独白"],
                  ['第一节'], ['第二节'], ['语言知识运用'], ['第II卷'],
                  ['第二部分'], ['第三部分'], ['第四部分']]
# Chinese-paper sub-heading: a single Chinese numeral in parentheses, e.g.
# "(一)" or "（二）", with an optional trailing separator.
chinese_filter = re.compile(r"^\s*[(（]\s*[一二三四五六七八九十]\s*[)）]\s*[\.、:：]?")
def contains_all(s, words):
    """Return True if ``s`` contains every fragment of at least one group.

    Args:
        s: The text to inspect.
        words: An iterable of groups; each group is an iterable of
            fragments tested with ``in`` against ``s``.

    Returns:
        True as soon as one group has all of its fragments present in
        ``s``; False if no group qualifies (including when ``words`` is
        empty).  An empty group counts as matched.
    """
    # Generators let both ``any`` and ``all`` stop at the first decisive item.
    return any(all(fragment in s for fragment in group) for group in words)
def is_topic_skip(s, subject):
    """Return True when line ``s`` is a heading that should be discarded.

    Args:
        s: One line of recognized text.
        subject: One of 'english', 'math', 'chinese', 'physics',
            'chemistry' or 'biology'.

    Returns:
        For 'english': whether ``s`` satisfies any group of
        ``english_filter``.  For the other subjects: True if ``s`` matches
        ``topic_filter``, or (for 'chinese' only) ``chinese_filter``, or
        contains any keyword of ``general_filter``; False otherwise.

    Raises:
        ValueError: If ``subject`` is not one of the supported subjects.
    """
    # English papers rely solely on their own keyword groups.
    if subject == 'english':
        return contains_all(s, english_filter)
    if subject not in ['math', 'chinese', 'physics', 'chemistry', 'biology']:
        raise ValueError("subject={} is not supported!".format(subject))
    if topic_filter.match(s):
        return True
    if subject == 'chinese' and chinese_filter.match(s):
        return True
    return any(heading in s for heading in general_filter)
- # ----------------------action----------------------
def _group_texts(texts, subject):
    """Split recognized lines into half-open ``[begin, end)`` topic ranges."""
    groups = []
    start = 0
    for i, text in enumerate(texts):
        if is_topic_start(text, subject):
            # Close the pending range only when it holds at least one line;
            # otherwise a topic start on the first line, or right after a
            # topic end, would emit an empty ``[i, i]`` range.
            if i > start:
                groups.append([start, i])
            start = i
        elif is_topic_end(text, subject):
            # The end line belongs to the topic it closes.
            groups.append([start, i + 1])
            start = i + 1
        elif is_topic_skip(text, subject):
            if i > start:
                groups.append([start, i])
            # The skipped line itself is left out of every range.
            start = i + 1
    if len(texts) > start:
        # Whatever remains after the last boundary forms the final range.
        groups.append([start, len(texts)])
    return groups


def group_pictures(pictures, subject=''):
    """Recognize each picture and group the resulting lines into topics.

    Each picture is assumed to hold one row of the page (per the original
    author's note).

    Args:
        pictures: An iterable of pictures, each passed to
            ``luo_ocr.ocr_py``.
        subject: Subject name forwarded to ``is_topic_start``,
            ``is_topic_end`` and ``is_topic_skip``.  The default ``''`` is
            not a supported subject.

    Returns:
        A tuple ``(texts, groups)``: ``texts`` holds the recognized text of
        each picture with "\\r" and "\\n" removed; ``groups`` is a list of
        ``[begin, end)`` index pairs into ``texts``, one per topic, never
        empty ranges.

    Raises:
        ValueError: If ``subject`` is unsupported and ``pictures`` is not
            empty (raised by the classification helpers).
    """
    texts = [
        luo_ocr.ocr_py(picture).replace("\r", "").replace("\n", "")
        for picture in pictures
    ]
    return texts, _group_texts(texts, subject)
|