server.py

import base64
import glob
import json
import os
import time
import uuid
import shutil
import xml.etree.ElementTree as ET
from urllib import parse, request
from io import BytesIO

import cv2
import numpy as np
import pypinyin
import requests
from PIL import Image
from django.conf import settings
from pdf2image import convert_from_path

import segment.logging_config as logging
from segment.image_operation.exam_segment import get_page_text
from segment.image_operation.pre_segment import segment2parts
from segment.image_operation.segment import joint_image
from segment.image_operation.split_lines import line_split
from segment.image_operation.utils import create_xml, resize_by_percent, write_single_img
from segment.models import OcrToken
from segment.ocr.group_pictures import group_pictures
from segment.ocr.group_text import group_text
from segment.ocr.penguin_ocr import get_ocr_english_text

logger = logging.getLogger(settings.LOGGING_TYPE)

def convert_pil_to_jpeg(raw_img):
    if raw_img.mode == 'L':
        channels = raw_img.split()
        img = Image.merge("RGB", (channels[0], channels[0], channels[0]))
    elif raw_img.mode == 'RGB':
        img = raw_img
    elif raw_img.mode == 'RGBA':
        img = Image.new("RGB", raw_img.size, (255, 255, 255))
        img.paste(raw_img, mask=raw_img.split()[3])  # 3 is the alpha channel
    else:
        img = raw_img
    open_cv_image = np.array(img)
    return img, open_cv_image

def opencv2base64(image, to_pil=True):
    # image = cv2.imencode('.jpg', img)[1]
    # base64_data = str(base64.b64encode(image))[2:-1]
    if to_pil:
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    output_buffer = BytesIO()
    image.save(output_buffer, format='JPEG')
    byte_data = output_buffer.getvalue()
    base64_data = base64.b64encode(byte_data)
    return base64_data
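
# Usage sketch (hypothetical file path; the functions referenced are defined in
# this module and cv2.imread is standard OpenCV): read a page image and
# base64-encode it before posting it to the OCR endpoints below.
#   img = cv2.imread('/path/to/page.jpg')
#   b64 = opencv2base64(img)  # JPEG-compressed, base64-encoded bytes
#   resp = get_ocr_text_and_coordinate_in_raw_format(ocr_login(), b64)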

def get_dir_next_index_name(path, file_type):
    files_list = os.listdir(path)
    imgs_list = [file.replace(file_type, '') for file in files_list if file.endswith(file_type)]
    length = len(imgs_list)
    if length == 0:
        return 1
    else:
        # Compare numeric index names as integers rather than lexicographically.
        index_name = max(imgs_list, key=int)
        return int(index_name) + 1

def save_raw_image(subject, datetime, img_file, analysis_type):
    # Generate a random new image name and build a custom save path.
    ext = img_file.name.split('.')[-1]
    raw_name = img_file.name[0:-len(ext) - 1]
    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
    raw_img = Image.open(img_file)  # read the uploaded image
    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, file_name)
    channels = raw_img.split()
    if len(channels) >= 3:
        img = Image.merge("RGB", (channels[0], channels[1], channels[2]))
        open_cv_image = np.array(img)
        img_reload = open_cv_image[:, :, ::-1].copy()
        parts_list = segment2parts(img_reload, save_path)
    else:
        img = raw_img
        open_cv_image = np.array(img)
        parts_list = segment2parts(open_cv_image, save_path)
    # for part in parts_list:
    #     with open(part['img_part'], 'rb') as f:
    #         bin_img = f.read()
    #     part['img_part'] = bin_img
    try:
        img.save(save_path)
    except Exception as e:
        raise e
    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
    return save_path, parts_list, url_path

def save_raw_image_without_segment(subject, datetime, img_file, analysis_type):
    # Generate a random new image name and build a custom save path.
    ext = img_file.name.split('.')[-1]
    raw_name = img_file.name[0:-len(ext) - 1]
    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
    raw_img = Image.open(img_file)  # read the uploaded image
    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, file_name)
    pil_img, open_cv_image = convert_pil_to_jpeg(raw_img)
    try:
        pil_img.save(save_path)
        shutil.copy(save_path, save_path.replace('.jpg', '_small.jpg'))
    except Exception as e:
        raise e
    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
    return save_path, open_cv_image, url_path

def save_raw_image_without_segment_formula(subject, datetime, img_file, analysis_type):
    # Generate a random new image name and build a custom save path.
    ext = img_file.name.split('.')[-1]
    raw_name = img_file.name[0:-len(ext) - 1]
    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], ext)
    raw_img = Image.open(img_file)  # read the uploaded image
    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, file_name)
    channels = raw_img.split()
    # if ext == 'png' and len(channels) >= 3:  # formula OCR: segment transparent PNG
    #     img = Image.merge("RGB", (channels[0], channels[1], channels[2]))
    #     open_cv_image = np.array(img)
    #     resize_img = resize_by_percent(open_cv_image, 0.5)
    # else:
    #     img = raw_img
    #     open_cv_image = np.array(img)
    #     resize_img = resize_by_percent(open_cv_image, 0.5)
    try:
        raw_img.save(save_path)
        # write_single_img(resize_img, save_path.replace('.jpg', '_small.jpg'))
    except Exception as e:
        raise e
    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
    return save_path, url_path, raw_img

def save_raw_image_in_jpeg(subject, datetime, img_file, analysis_type):
    # Generate a random new image name and build a custom save path.
    ext = img_file.name.split('.')[-1]
    raw_name = img_file.name[0:-len(ext) - 1]
    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
    raw_img = Image.open(img_file)  # read the uploaded image
    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, file_name)
    if raw_img.mode == 'L':
        channels = raw_img.split()
        img = Image.merge("RGB", (channels[0], channels[0], channels[0]))
    elif raw_img.mode == 'RGB':
        img = raw_img
    elif raw_img.mode == 'RGBA':
        img = Image.new("RGB", raw_img.size, (255, 255, 255))
        img.paste(raw_img, mask=raw_img.split()[3])  # 3 is the alpha channel
    else:
        img = raw_img
    open_cv_image = np.array(img)
    # resize_img = resize_by_percent(open_cv_image, 0.5)
    try:
        img.save(save_path)
        # write_single_img(resize_img, save_path.replace('.jpg', '_small.jpg'))
    except Exception as e:
        raise e
    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
    return save_path, url_path, open_cv_image

def ocr_login():
    def login():
        grant_type = 'client_credentials'
        client_id = settings.OCR_CLIENT_ID
        client_secret = settings.OCR_CLIENT_SECRET
        textmod = {'grant_type': grant_type, 'client_id': client_id, 'client_secret': client_secret}
        textmod = parse.urlencode(textmod)  # urlencoded output, e.g. 'user=admin&password=admin'
        header_dict = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}
        url = 'https://aip.baidubce.com/oauth/2.0/token'
        req = request.Request(url='{}{}{}'.format(url, '?', textmod), headers=header_dict)
        res = request.urlopen(req).read()
        token = json.loads(res.decode(encoding='utf-8'))['access_token']
        lastest_access_token = OcrToken(access_token=token)
        lastest_access_token.save()
        return token

    objects = OcrToken.objects.latest('update_time')
    lastest_access_token_db = objects.access_token
    lastest_date = objects.update_time
    ans_time = time.mktime(lastest_date.timetuple())
    update_date = settings.OCR_TOKEN_UPDATE_DATE
    current_time = time.time()
    if (ans_time + update_date * 24 * 60 * 60) > current_time:
        return lastest_access_token_db
    else:
        return login()
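
# Token-reuse sketch (illustrative; values depend on your settings): ocr_login()
# returns the newest token stored in the OcrToken table when it is younger than
# settings.OCR_TOKEN_UPDATE_DATE days, and only otherwise requests a fresh token
# from Baidu, persists it, and returns it.
#   token = ocr_login()
#   text_list = get_ocr_text(token, opencv2base64(img))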

def get_exam_bbox_by_tesseract(img_raw_name, img_path, subject):
    error_info = ''
    status = 1
    text = []
    lines_save_dir = img_path.replace('.jpg', '_lines')
    img_path = os.path.abspath(img_path)
    lines_save_dir = os.path.abspath(lines_save_dir)
    if not os.path.exists(lines_save_dir):
        os.makedirs(lines_save_dir)
    start_time = time.time()
    try:
        bbox, lines_abs_path_list = line_split(img_path, lines_save_dir, settings.TOLERANCE_PIX_NUMBER)  # split into lines
    except Exception as e:
        logger.error('line_split failed: {}'.format(e), exc_info=True)
        status = 0
        error_info = str(e)
        info = {'is_success': status, 'img_name': img_raw_name, 'coordinate': text, 'error': error_info}
        # Return the same (status, info) shape as the normal exit path.
        return status, info
    time1 = time.time()
    logger.info('lines_segment, cost: {}'.format(time1 - start_time))
    exam_group = []
    try:
        _, exam_group = group_pictures(lines_abs_path_list, subject)
        logger.info('exam_group info : {}'.format(exam_group))
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        logger.error('ocr failed: {}'.format(e), exc_info=True)
        status = 0
        error_info = error_info + str(e)
    time2 = time.time()
    logger.info('exam_grouped, cost: {}'.format(time2 - time1))
    try:
        text = joint_image(img_path, bbox, exam_group)
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        logger.error('generate coordinate info failed: {}'.format(e), exc_info=True)
        status = 0
        error_info = error_info + str(e)
    info = {'img_name': img_raw_name, 'coordinate': text}
    if error_info:
        info = {'img_name': img_raw_name, 'coordinate': text, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info

def get_ocr_text(access_token, img, subject=None):
    textmod = {'access_token': access_token}
    textmod = parse.urlencode(textmod)
    url = '{}{}{}{}{}'.format(settings.OCR_URL, settings.OCR_ACCURACY, '_basic', '?', textmod)
    url_general = '{}{}{}{}{}'.format(settings.OCR_URL, 'general', '_basic', '?', textmod)
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    # image = opencv2base64(img)  # byte-encoded base64 data
    image = img
    data = {
        'image': image,
        'detect_direction': 'true',
        'language_type': 'CHN_ENG',
    }
    # The english subject currently uses the same accurate endpoint as the rest.
    resp = requests.post(url, data=data, headers=headers).json()
    if resp.get('error_msg'):
        if 'internal error' in resp.get('error_msg'):
            resp = requests.post(url_general, data=data, headers=headers).json()
            if resp.get('error_msg'):
                raise Exception("ocr {}!".format(resp.get('error_msg')))
        else:
            raise Exception("ocr {}!".format(resp.get('error_msg')))
    words_result = resp.get('words_result')
    text_list = [word.get('words') for word in words_result]
    # words_list = {'word': text_list, 'subject': subject}
    return text_list

def get_ocr_text_and_coordinate_in_raw_format(access_token, img):
    textmod = {'access_token': access_token}
    textmod = parse.urlencode(textmod)
    url = '{}{}{}{}'.format(settings.OCR_BOX_URL, settings.OCR_ACCURACY, '?', textmod)
    url_general = '{}{}{}{}'.format(settings.OCR_BOX_URL, 'general', '?', textmod)
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    image_type = 'base64'
    group_id = 'group001'
    user_id = 'usr001'
    # image = base64.b64encode(img)  # byte-encoded base64 data
    image = img
    data = {
        'image_type': image_type,
        'group_id': group_id,
        'user_id': user_id,
        'image': image,
        'detect_direction': 'true',
        'recognize_granularity': 'small',
        # 'vertexes_location': 'true',
        # 'probability': 'true'
    }
    resp = requests.post(url, data=data, headers=headers).json()
    if resp.get('error_msg'):
        if 'internal error' in resp.get('error_msg'):
            resp = requests.post(url_general, data=data, headers=headers).json()
            if resp.get('error_msg'):
                raise Exception("ocr {}!".format(resp.get('error_msg')))
        else:
            raise Exception("ocr {}!".format(resp.get('error_msg')))
    return resp

def get_ocr_text_and_coordinate(access_token, img):
    textmod = {'access_token': access_token}
    textmod = parse.urlencode(textmod)
    url = '{}{}{}{}'.format(settings.OCR_BOX_URL, settings.OCR_ACCURACY, '?', textmod)
    url_general = '{}{}{}{}'.format(settings.OCR_BOX_URL, 'general', '?', textmod)
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    image_type = 'base64'
    group_id = 'group001'
    user_id = 'usr001'
    # image = base64.b64encode(img)  # byte-encoded base64 data
    image = img
    data = {
        'image_type': image_type,
        'group_id': group_id,
        'user_id': user_id,
        'image': image,
        'detect_direction': 'true',
        # 'recognize_granularity': 'small',
        # 'vertexes_location': 'true',
        # 'probability': 'true'
    }
    resp = requests.post(url, data=data, headers=headers).json()
    if resp.get('error_msg'):
        if 'internal error' in resp.get('error_msg'):
            resp = requests.post(url_general, data=data, headers=headers).json()
            if resp.get('error_msg'):
                raise Exception("ocr {}!".format(resp.get('error_msg')))
        else:
            raise Exception("ocr {}!".format(resp.get('error_msg')))
    words_result = resp.get('words_result')
    text_list = [word.get('words') for word in words_result]
    # words_list = {'word': text_list, 'subject': subject}
    matrix_lt, matrix_rb = resolve_json(words_result)
    return text_list, matrix_lt, matrix_rb

def get_ocr_text_and_coordinate_formula(img, access_token, base64=False):
    # NOTE: the `base64` parameter shadows the base64 module inside this function;
    # it only flags whether `img` is already base64-encoded.
    textmod = {'access_token': access_token}
    textmod = parse.urlencode(textmod)
    url = '{}{}{}{}'.format(settings.OCR_BOX_URL, settings.OCR_ACCURACY, '?', textmod)
    url_general = '{}{}{}{}'.format(settings.OCR_BOX_URL, 'general', '?', textmod)
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    image_type = 'base64'
    group_id = 'group001'
    user_id = 'usr001'
    if base64:
        image = img
    else:
        image = opencv2base64(img)
    data = {
        'image_type': image_type,
        'group_id': group_id,
        'user_id': user_id,
        'image': image,
        'detect_direction': 'true',
        'recognize_granularity': 'small',
        'language_type': 'CHN_ENG',
        # 'vertexes_location': 'true',
        # 'probability': 'true'
    }
    resp = requests.post(url, data=data, headers=headers).json()
    if resp.get('error_msg'):
        if 'internal error' in resp.get('error_msg'):
            resp = requests.post(url_general, data=data, headers=headers).json()
            if resp.get('error_msg'):
                raise Exception("ocr {}!".format(resp.get('error_msg')))
        else:
            raise Exception("ocr {}!".format(resp.get('error_msg')))
    words_result = resp.get('words_result')
    return words_result

def resolve_json(words_result):
    box_list = [item[key] for item in words_result for key in item if key == 'location']
    matrix = np.array([0, 0, 0, 0])
    for box in box_list:
        # num_list = list(box.values())
        w = box.get('width')
        l = box.get('left')
        t = box.get('top')
        h = box.get('height')
        num_list = [w, t, l, h]
        matrix = np.vstack([matrix, np.array(num_list)])
    matrix = matrix[1:]
    matrix_w = matrix[:, 0:1]
    matrix_t = matrix[:, 1:2]
    matrix_l = matrix[:, 2:3]
    matrix_h = matrix[:, 3:]
    matrix_lt = np.hstack([matrix_l, matrix_t])
    matrix_wh = np.hstack([matrix_w, matrix_h])
    matrix_rb = matrix_lt + matrix_wh
    return matrix_lt, matrix_rb

def group_to_coordinate(group_list, matrix_lt, matrix_rb):
    matrix_box_vlist = np.array([0, 0, 0, 0])
    for element in group_list:
        if element[0] < element[1]:
            rb = matrix_rb[element[0]:element[1]].max(axis=0)
            lt = matrix_lt[element[0]:element[1]].min(axis=0)
            matrix_box = np.hstack([lt, rb])
            matrix_box_vlist = np.vstack([matrix_box_vlist, matrix_box])
    matrix_box_vlist = matrix_box_vlist[1:]
    return matrix_box_vlist.tolist()
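
# Worked example with made-up numbers: for two OCR word boxes
#   {'location': {'left': 10, 'top': 20, 'width': 100, 'height': 30}}
#   {'location': {'left': 12, 'top': 55, 'width': 90, 'height': 28}}
# resolve_json returns matrix_lt = [[10, 20], [12, 55]] (top-left corners) and
# matrix_rb = [[110, 50], [102, 83]] (bottom-right corners, i.e. left+width and
# top+height). group_to_coordinate([(0, 2)], matrix_lt, matrix_rb) then merges
# rows 0..1 into a single box [10, 20, 110, 83].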

def get_exam_box(img_raw_name, img_list, save_path, subject, access_token):
    status = 1
    error_info = ''
    box_list = []
    words_list_all = []
    group_list_all = []
    try:
        for img_part in img_list:
            x_bias = img_part['x_bias']
            y_bias = img_part['y_bias']
            img = img_part['img_part']
            words_list, matrix_lt, matrix_rb = get_ocr_text_and_coordinate(access_token, img)
            matrix_lt = matrix_lt + np.asarray([x_bias, y_bias])
            matrix_rb = matrix_rb + np.asarray([x_bias, y_bias])
            group_list = group_text(words_list, subject)
            part_box_list = group_to_coordinate(group_list, matrix_lt, matrix_rb)
            box_list = box_list + part_box_list
            words_list.append('********************************')
            words_list_all = words_list_all + words_list
            group_list_all.append(group_list)
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + ',\n' for line in words_list_all]
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
                writer.writelines(str(group_list_all))
            logger.info('{} exam paper: {} text info saved successfully'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam paper: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
        # Record bounding-box coordinates in the XML template.
        tree = ET.parse(r'./segment/exam_info/000000-template.xml')  # xml tree
        for index_num, exam_bbox in enumerate(box_list):
            tree = create_xml('{:02d}'.format(index_num), tree,
                              exam_bbox[0], exam_bbox[1], exam_bbox[2], exam_bbox[3])
        # print(exam_items_bbox)
        tree.write(save_path.replace('.jpg', '.xml'))
    except Exception as e:
        logger.error('{} exam paper: {} failed to generate coordinates: {}'.format(subject, img_raw_name, e), exc_info=True)
        status = 0
        error_info = error_info + str(e)
    info = {'img_name': img_raw_name, 'coordinate': box_list}
    if error_info:
        info = {'img_name': img_raw_name, 'coordinate': box_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info

def get_exam_ocr(img_raw_name, img_list, save_path, subject, access_token):
    status = 1
    error_info = ''
    words_list = []
    for img_part in img_list:
        img = img_part['img_part']
        try:
            part_words_list = get_ocr_text(access_token, img, subject)
        except Exception as e:
            part_words_list = []
            error_info = error_info + str(e)
        words_list = words_list + part_words_list
    if len(words_list) < 1:
        logger.error('{} exam paper: {} OCR parsing failed: {}'.format(subject, img_raw_name, error_info), exc_info=True)
        status = 0
    else:
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + '\n' for line in words_list]
            # # words_list.append(group_list)
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
            logger.info('{} exam paper: {} text info saved successfully'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam paper: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
    info = {'img_name': img_raw_name, 'text': words_list}
    if error_info:
        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info

def get_exam_ocr_single(img_raw_name, img, save_path, subject, access_token):
    status = 1
    error_info = ''
    words_list = []
    try:
        part_words_list = get_ocr_text(access_token, img)
    except Exception as e:
        part_words_list = []
        error_info = error_info + str(e)
    words_list = words_list + part_words_list
    if len(words_list) < 1:
        logger.error('{} exam paper: {} OCR parsing failed: {}'.format(subject, img_raw_name, error_info), exc_info=True)
        status = 0
    else:
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + ',\n' for line in words_list]
            # # words_list.append(group_list)
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
            logger.info('{} exam paper: {} text info saved successfully'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam paper: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
    info = {'img_name': img_raw_name, 'text': words_list}
    if error_info:
        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info

def get_segment_by_ocr_once(opencv_img, token, subject, save_path, img_raw_name):
    img = opencv2base64(opencv_img)
    resp = get_ocr_text_and_coordinate_in_raw_format(token, img)
    if len(opencv_img.shape) == 3:
        opencv_img = cv2.cvtColor(opencv_img, cv2.COLOR_BGR2GRAY)
    test_list = get_page_text(resp['words_result'], opencv_img)
    status = 1
    error_info = ''
    box_list = []
    words_list_all = []
    group_list_all = []
    try:
        for one_page_text in test_list:
            words_list = [word.get('words') for word in one_page_text]
            matrix_lt, matrix_rb = resolve_json(one_page_text)
            group_list = group_text(words_list, subject)
            part_box_list = group_to_coordinate(group_list, matrix_lt, matrix_rb)
            box_list = box_list + part_box_list
            words_list.append('********************************')
            words_list_all = words_list_all + words_list
            group_list_all.append(group_list)
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + '\n' for line in words_list_all]
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
                writer.writelines(str(group_list_all))
            logger.info('{} exam paper: {} text info saved successfully'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam paper: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
        # Record bounding-box coordinates in the XML template.
        tree = ET.parse(r'./segment/exam_info/000000-template.xml')  # xml tree
        for index_num, exam_bbox in enumerate(box_list):
            tree = create_xml('{:02d}'.format(index_num), tree,
                              exam_bbox[0], exam_bbox[1], exam_bbox[2], exam_bbox[3])
        # print(exam_items_bbox)
        tree.write(save_path.replace('.jpg', '.xml'))
    except Exception as e:
        logger.error('{} exam paper: {} failed to generate coordinates: {}'.format(subject, img_raw_name, e), exc_info=True)
        status = 0
        error_info = error_info + str(e)
    info = {'img_name': img_raw_name, 'coordinate': box_list}
    if error_info:
        info = {'img_name': img_raw_name, 'coordinate': box_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info

# opencv_img, token, subject, save_path, img_raw_name
def get_exam_ocr_once(opencv_img, token, subject, save_path, img_raw_name):
    img = opencv2base64(opencv_img)
    resp = get_ocr_text_and_coordinate_in_raw_format(token, img)
    if len(opencv_img.shape) == 3:
        opencv_img = cv2.cvtColor(opencv_img, cv2.COLOR_BGR2GRAY)
    test_list = get_page_text(resp['words_result'], opencv_img)
    words_list = []
    for one_page_raw_text in test_list:
        one_page_words_list = [word.get('words') for word in one_page_raw_text]
        words_list = words_list + one_page_words_list
    status = 1
    error_info = ''
    if len(words_list) < 1:
        logger.error('{} exam paper: {} OCR parsing failed: {}'.format(subject, img_raw_name, error_info), exc_info=True)
        status = 0
    else:
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + '\n' for line in words_list]
            # # words_list.append(group_list)
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
            logger.info('{} exam paper: {} text info saved successfully'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam paper: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
    info = {'img_name': img_raw_name, 'text': words_list}
    if error_info:
        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info

def save_pdf_image(pdf_file, subject, time_str):
    name = pdf_file.name[:-4]
    ext0 = pdf_file.name.split('.')[-1]
    raw_name0 = ''.join([''.join(i) for i in pypinyin.pinyin(name, style=pypinyin.NORMAL)])
    save_dir0 = os.sep.join(
        [settings.MEDIA_ROOT, 'ocr', subject, time_str, raw_name0 + '_{}'.format(uuid.uuid4().hex[:10])])
    if not os.path.exists(save_dir0):
        os.makedirs(save_dir0)
    pdf_path = os.sep.join([save_dir0, raw_name0 + '.' + ext0])
    with open(pdf_path, 'wb') as pdfFileObj:
        for chunk in pdf_file.chunks():
            pdfFileObj.write(chunk)
    images_list = convert_from_path(pdf_path, dpi=200, output_folder=save_dir0,
                                    output_file='image',
                                    first_page=None, last_page=None, fmt='JPEG')
    upload_img_path_list = glob.glob(os.sep.join([save_dir0, '*.jpg']))
    try:
        images_list = [cv2.cvtColor(np.asarray(ele), cv2.COLOR_RGB2BGR) for ele in images_list]
    except Exception:
        images_list = [np.asarray(ele) for ele in images_list]
    return upload_img_path_list, images_list
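
# Pipeline sketch (hypothetical Django upload and argument values): save_pdf_image
# writes the uploaded PDF under MEDIA_ROOT/ocr/<subject>/<time_str>/..., renders
# each page to a JPEG via pdf2image, and returns the page file paths plus BGR
# numpy arrays ready for the OCR helpers above.
#   paths, pages = save_pdf_image(request.FILES['pdf'], 'math', '20190101')
#   for page in pages:
#       b64 = opencv2base64(page)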

def save_raw_image_without_segment_pdf(subject, datetime, raw_name, img_file, analysis_type):
    # Generate a random new image name and build a custom save path.
    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
    raw_img = Image.open(img_file)  # read the uploaded image
    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, file_name)
    channels = raw_img.split()
    if len(channels) > 3:
        # Keep the RGB bands and drop the alpha channel.
        img = Image.merge("RGB", (channels[0], channels[1], channels[2]))
        open_cv_image = np.array(img)
        resize_img = resize_by_percent(open_cv_image, 0.5)
    else:
        img = raw_img
        open_cv_image = np.array(img)
        resize_img = resize_by_percent(open_cv_image, 0.5)
    try:
        img.save(save_path)
        # write_single_img(resize_img, save_path.replace('.jpg', '_small.jpg'))
    except Exception as e:
        raise e
    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
    return save_path, url_path, open_cv_image

def get_exam_ocr_by_penguin(img_raw_name, raw_image, size, save_path, subject):
    status = 1
    error_info = ''
    words_list = []
    try:
        words_list = get_ocr_english_text(raw_image, size)
    except Exception as e:
        error_info = error_info + str(e)
    if len(words_list) < 1:
        logger.error('{} exam paper: {} OCR parsing failed: {}'.format(subject, img_raw_name, error_info), exc_info=True)
        status = 0
    else:
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + '\n' for line in words_list]
            # # words_list.append(group_list)
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
            logger.info('{} exam paper: {} text info saved successfully'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam paper: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
    info = {'img_name': img_raw_name, 'text': words_list}
    if error_info:
        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info