server.py

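"""Image-handling and OCR helpers for the segment service.

The functions below save uploaded exam images and PDFs under MEDIA_ROOT,
segment them into parts and lines, call the Baidu OCR HTTP API (token
management plus plain-text and text-with-coordinate endpoints), group the
recognized text into exam items, and back the results up as .txt and .xml
files next to the saved image.
"""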
import base64
import glob
import json  # used by ocr_login to parse the token response
import os
import time
import uuid
import shutil
import xml.etree.ElementTree as ET  # cElementTree is deprecated/removed; ElementTree is C-accelerated on Python 3
from urllib import parse, request
from io import BytesIO

import cv2
import numpy as np
import pypinyin
import requests
from PIL import Image
from django.conf import settings
from pdf2image import convert_from_path

import segment.logging_config as logging
from segment.image_operation.exam_segment import get_page_text
from segment.image_operation.pre_segment import segment2parts
from segment.image_operation.segment import joint_image
from segment.image_operation.split_lines import line_split
from segment.image_operation.utils import create_xml, resize_by_percent
from segment.image_operation.utils import write_single_img
from segment.models import OcrToken
from segment.ocr.group_pictures import group_pictures
from segment.ocr.group_text import group_text
from segment.ocr.penguin_ocr import get_ocr_english_text

logger = logging.getLogger(settings.LOGGING_TYPE)


def convert_pil_to_jpeg(raw_img):
    if raw_img.mode == 'L':
        channels = raw_img.split()
        img = Image.merge("RGB", (channels[0], channels[0], channels[0]))
    elif raw_img.mode == 'RGB':
        img = raw_img
    elif raw_img.mode == 'RGBA':
        img = Image.new("RGB", raw_img.size, (255, 255, 255))
        img.paste(raw_img, mask=raw_img.split()[3])  # 3 is the alpha channel
    else:
        img = raw_img
    open_cv_image = np.array(img)
    return img, open_cv_image


def opencv2base64(image, to_pil=False):
    # image = cv2.imencode('.jpg', img)[1]
    # base64_data = str(base64.b64encode(image))[2:-1]
    if to_pil:
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        output_buffer = BytesIO()
        image.save(output_buffer, format='JPEG')
        byte_data = output_buffer.getvalue()
        base64_data = base64.b64encode(byte_data).decode('utf8')  # decode to str, matching the branch below
    else:
        data = cv2.imencode('.jpg', image)[1]
        base64_data = base64.b64encode(data.tobytes()).decode('utf8')  # tostring() is deprecated; tobytes() is equivalent
    return base64_data
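
# Illustrative use of opencv2base64 (the path below is a made-up example):
#   b64_str = opencv2base64(cv2.imread('/tmp/scan.jpg'))  # base64 string of the JPEG-encoded image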


def get_dir_next_index_name(path, file_type):
    files_list = os.listdir(path)
    imgs_list = [file.replace(file_type, '') for file in files_list if file.endswith(file_type)]
    length = len(imgs_list)
    if length == 0:
        return 1
    else:
        index_name = max(imgs_list, key=int)  # compare numerically, not lexicographically
        return int(index_name) + 1


def save_raw_image(subject, datetime, img_file, analysis_type):
    # Generate a random new image name and build a custom save path.
    ext = img_file.name.split('.')[-1]
    raw_name = img_file.name[0:-len(ext) - 1]
    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
    raw_img = Image.open(img_file)  # read the uploaded image
    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, file_name)
    channels = raw_img.split()
    if len(channels) >= 3:
        img = Image.merge("RGB", (channels[0], channels[1], channels[2]))
        open_cv_image = np.array(img)
        img_reload = open_cv_image[:, :, ::-1].copy()
        parts_list = segment2parts(img_reload, save_path)
    else:
        img = raw_img
        open_cv_image = np.array(img)
        parts_list = segment2parts(open_cv_image, save_path)
    # for part in parts_list:
    #     with open(part['img_part'], 'rb') as f:
    #         bin_img = f.read()
    #     part['img_part'] = bin_img
    try:
        img.save(save_path)
    except Exception as e:
        raise e
    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
    return save_path, parts_list, url_path


def save_raw_image_without_segment(subject, datetime, img_file, analysis_type):
    # Generate a random new image name and build a custom save path.
    ext = img_file.name.split('.')[-1]
    raw_name = img_file.name[0:-len(ext) - 1]
    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
    raw_img = Image.open(img_file)  # read the uploaded image
    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, file_name)
    pil_img, open_cv_image = convert_pil_to_jpeg(raw_img)
    try:
        pil_img.save(save_path)
        shutil.copy(save_path, save_path.replace('.jpg', '_small.jpg'))
    except Exception as e:
        raise e
    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
    return save_path, open_cv_image, url_path


def save_raw_image_without_segment_formula(subject, datetime, img_file, analysis_type):
    # Generate a random new image name and build a custom save path.
    ext = img_file.name.split('.')[-1]
    raw_name = img_file.name[0:-len(ext) - 1]
    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], ext)
    raw_img = Image.open(img_file)  # read the uploaded image
    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, file_name)
    channels = raw_img.split()
    # if ext == 'png' and len(channels) >= 3:  # formula OCR: handle transparent png
    #     img = Image.merge("RGB", (channels[0], channels[1], channels[2]))
    #     open_cv_image = np.array(img)
    #     resize_img = resize_by_percent(open_cv_image, 0.5)
    # else:
    #     img = raw_img
    #     open_cv_image = np.array(img)
    #     resize_img = resize_by_percent(open_cv_image, 0.5)
    try:
        raw_img.save(save_path)
        # write_single_img(resize_img, save_path.replace('.jpg', '_small.jpg'))
    except Exception as e:
        raise e
    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
    return save_path, url_path, raw_img


def save_raw_image_in_jpeg(subject, datetime, img_file, analysis_type):
    # Generate a random new image name and build a custom save path.
    ext = img_file.name.split('.')[-1]
    raw_name = img_file.name[0:-len(ext) - 1]
    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
    raw_img = Image.open(img_file)  # read the uploaded image
    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, file_name)
    if raw_img.mode == 'L':
        channels = raw_img.split()
        img = Image.merge("RGB", (channels[0], channels[0], channels[0]))
    elif raw_img.mode == 'RGB':
        img = raw_img
    elif raw_img.mode == 'RGBA':
        img = Image.new("RGB", raw_img.size, (255, 255, 255))
        img.paste(raw_img, mask=raw_img.split()[3])  # 3 is the alpha channel
    else:
        img = raw_img
    open_cv_image = np.array(img)
    # resize_img = resize_by_percent(open_cv_image, 0.5)
    try:
        img.save(save_path)
        # write_single_img(resize_img, save_path.replace('.jpg', '_small.jpg'))
    except Exception as e:
        raise e
    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
    return save_path, url_path, open_cv_image


def ocr_login():
    def login():
        grant_type = 'client_credentials'
        client_id = settings.OCR_CLIENT_ID
        client_secret = settings.OCR_CLIENT_SECRET
        textmod = {'grant_type': grant_type, 'client_id': client_id, 'client_secret': client_secret}
        textmod = parse.urlencode(textmod)
        # urlencoded query string, e.g. grant_type=...&client_id=...&client_secret=...
        header_dict = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}
        url = 'https://aip.baidubce.com/oauth/2.0/token'
        req = request.Request(url='{}{}{}'.format(url, '?', textmod), headers=header_dict)
        res = request.urlopen(req).read()
        token = json.loads(res.decode(encoding='utf-8'))['access_token']  # parse JSON instead of eval()
        latest_access_token = OcrToken(access_token=token)
        latest_access_token.save()
        return token

    # Reuse the token stored in the database until it is older than
    # OCR_TOKEN_UPDATE_DATE days, then request a fresh one.
    objects = OcrToken.objects.latest('update_time')
    latest_access_token_db = objects.access_token
    latest_date = objects.update_time
    ans_time = time.mktime(latest_date.timetuple())
    update_date = settings.OCR_TOKEN_UPDATE_DATE
    current_time = time.time()
    if (ans_time + update_date * 24 * 60 * 60) > current_time:
        return latest_access_token_db
    else:
        return login()


def get_exam_bbox_by_tesseract(img_raw_name, img_path, subject):
    error_info = ''
    status = 1
    text = []
    lines_save_dir = img_path.replace('.jpg', '_lines')
    img_path = os.path.abspath(img_path)
    lines_save_dir = os.path.abspath(lines_save_dir)
    if not os.path.exists(lines_save_dir):
        os.makedirs(lines_save_dir)
    start_time = time.time()
    try:
        bbox, lines_abs_path_list = line_split(img_path, lines_save_dir, settings.TOLERANCE_PIX_NUMBER)  # split into lines
    except Exception as e:
        logger.error('line_split failed: {}'.format(e), exc_info=True)
        status = 0
        error_info = str(e)
        info = {'is_success': status, 'img_name': img_raw_name, 'coordinate': text, 'error': error_info}
        return status, info  # return a (status, info) pair on failure too, matching the success path
    time1 = time.time()
    logger.info('lines_segment, cost: {}'.format(time1 - start_time))
    exam_group = []
    try:
        _, exam_group = group_pictures(lines_abs_path_list, subject)
        logger.info('exam_group info : {}'.format(exam_group))
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        logger.error('ocr failed: {}'.format(e), exc_info=True)
        status = 0
        error_info = error_info + str(e)
    time2 = time.time()
    logger.info('exam_grouped, cost: {}'.format(time2 - time1))
    try:
        text = joint_image(img_path, bbox, exam_group)
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        logger.error('generate coordinate info failed: {}'.format(e), exc_info=True)
        status = 0
        error_info = error_info + str(e)
    info = {'img_name': img_raw_name, 'coordinate': text}
    if error_info:
        info = {'img_name': img_raw_name, 'coordinate': text, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info
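
# Note on the return convention shared by the OCR entry points below: they
# return a (status, info) pair, where status is 1 on success and 0 on failure,
# and info carries 'img_name' plus either 'coordinate' boxes or 'text' lines
# (and an 'error' string when something went wrong).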


def get_ocr_text(access_token, img, subject=None):
    textmod = {'access_token': access_token}
    textmod = parse.urlencode(textmod)
    url = '{}{}{}{}{}'.format(settings.OCR_URL, settings.OCR_ACCURACY, '_basic', '?', textmod)
    url_general = '{}{}{}{}{}'.format(settings.OCR_URL, 'general', '_basic', '?', textmod)
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    # image = opencv2base64(img)  # byte-encoded data
    image = img
    data = {
        'image': image,
        'detect_direction': 'true',
        'language_type': 'CHN_ENG',
    }
    # The english and non-english branches issued an identical request, so a
    # single call is kept here; subject is retained for API compatibility.
    resp = requests.post(url, data=data, headers=headers).json()
    if resp.get('error_msg'):
        if 'internal error' in resp.get('error_msg'):
            # fall back to the general endpoint when the accurate one reports an internal error
            resp = requests.post(url_general, data=data, headers=headers).json()
            if resp.get('error_msg'):
                raise Exception("ocr {}!".format(resp.get('error_msg')))
        else:
            raise Exception("ocr {}!".format(resp.get('error_msg')))
    words_result = resp.get('words_result')
    text_list = [word.get('words') for word in words_result]
    # words_list = {'word': text_list, 'subject': subject}
    return text_list
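
# Illustrative end-to-end call (a sketch only; the image path is a made-up
# example and nothing in this module runs it):
#   token = ocr_login()                                    # cached or refreshed Baidu access token
#   img_b64 = opencv2base64(cv2.imread('/tmp/page.jpg'))   # base64-encoded JPEG
#   lines = get_ocr_text(token, img_b64)                   # list of recognized text lines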


def get_ocr_text_and_coordinate_in_raw_format(access_token, img):
    textmod = {'access_token': access_token}
    textmod = parse.urlencode(textmod)
    url = '{}{}{}{}'.format(settings.OCR_BOX_URL, settings.OCR_ACCURACY, '?', textmod)
    url_general = '{}{}{}{}'.format(settings.OCR_BOX_URL, 'general', '?', textmod)
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    image_type = 'base64'
    group_id = 'group001'
    user_id = 'usr001'
    # image = base64.b64encode(img)  # byte-encoded data
    image = img
    data = {
        'image_type': image_type,
        'group_id': group_id,
        'user_id': user_id,
        'image': image,
        'detect_direction': 'true',
        'recognize_granularity': 'small',
        # 'vertexes_location': 'true',
        # 'probability': 'true'
    }
    resp = requests.post(url, data=data, headers=headers).json()
    if resp.get('error_msg'):
        if 'internal error' in resp.get('error_msg'):
            resp = requests.post(url_general, data=data, headers=headers).json()
            if resp.get('error_msg'):
                raise Exception("ocr {}!".format(resp.get('error_msg')))
        else:
            raise Exception("ocr {}!".format(resp.get('error_msg')))
    return resp


def get_ocr_text_and_coordinate(access_token, img):
    textmod = {'access_token': access_token}
    textmod = parse.urlencode(textmod)
    url = '{}{}{}{}'.format(settings.OCR_BOX_URL, settings.OCR_ACCURACY, '?', textmod)
    url_general = '{}{}{}{}'.format(settings.OCR_BOX_URL, 'general', '?', textmod)
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    image_type = 'base64'
    group_id = 'group001'
    user_id = 'usr001'
    # image = base64.b64encode(img)  # byte-encoded data
    image = img
    data = {
        'image_type': image_type,
        'group_id': group_id,
        'user_id': user_id,
        'image': image,
        'detect_direction': 'true',
        # 'recognize_granularity': 'small',
        # 'vertexes_location': 'true',
        # 'probability': 'true'
    }
    resp = requests.post(url, data=data, headers=headers).json()
    if resp.get('error_msg'):
        if 'internal error' in resp.get('error_msg'):
            resp = requests.post(url_general, data=data, headers=headers).json()
            if resp.get('error_msg'):
                raise Exception("ocr {}!".format(resp.get('error_msg')))
        else:
            raise Exception("ocr {}!".format(resp.get('error_msg')))
    words_result = resp.get('words_result')
    text_list = [word.get('words') for word in words_result]
    # words_list = {'word': text_list, 'subject': subject}
    matrix_lt, matrix_rb = resolve_json(words_result)
    return text_list, matrix_lt, matrix_rb


def get_ocr_text_and_coordinate_formula(img, access_token, base64=False):
    textmod = {'access_token': access_token}
    textmod = parse.urlencode(textmod)
    url = '{}{}{}{}'.format(settings.OCR_BOX_URL, settings.OCR_ACCURACY, '?', textmod)
    url_general = '{}{}{}{}'.format(settings.OCR_BOX_URL, 'general', '?', textmod)
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    image_type = 'base64'
    group_id = 'group001'
    user_id = 'usr001'
    if base64:
        image = img
    else:
        image = opencv2base64(img)
    data = {
        'image_type': image_type,
        'group_id': group_id,
        'user_id': user_id,
        'image': image,
        'detect_direction': 'true',
        'recognize_granularity': 'small',
        'language_type': 'CHN_ENG',
        # 'vertexes_location': 'true',
        # 'probability': 'true'
    }
    resp = requests.post(url, data=data, headers=headers).json()
    if resp.get('error_msg'):
        if 'internal error' in resp.get('error_msg'):
            resp = requests.post(url_general, data=data, headers=headers).json()
            if resp.get('error_msg'):
                raise Exception("ocr {}!".format(resp.get('error_msg')))
        else:
            raise Exception("ocr {}!".format(resp.get('error_msg')))
    words_result = resp.get('words_result')
    return words_result


def resolve_json(words_result):
    box_list = [item[key] for item in words_result for key in item if key == 'location']
    matrix = np.array([0, 0, 0, 0])
    for box in box_list:
        # num_list = list(box.values())
        w = box.get('width')
        l = box.get('left')
        t = box.get('top')
        h = box.get('height')
        num_list = [w, t, l, h]
        matrix = np.vstack([matrix, np.array(num_list)])
    matrix = matrix[1:]
    matrix_w = matrix[:, 0:1]
    matrix_t = matrix[:, 1:2]
    matrix_l = matrix[:, 2:3]
    matrix_h = matrix[:, 3:]
    matrix_lt = np.hstack([matrix_l, matrix_t])
    matrix_wh = np.hstack([matrix_w, matrix_h])
    matrix_rb = matrix_lt + matrix_wh
    return matrix_lt, matrix_rb
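
# Sketch of resolve_json's input/output (field names follow the Baidu
# words_result format used above; the numbers are made up):
#   words_result = [{'words': '...', 'location': {'left': 10, 'top': 20, 'width': 100, 'height': 30}}]
#   matrix_lt, matrix_rb = resolve_json(words_result)
#   # matrix_lt -> [[10, 20]]   (top-left corners)
#   # matrix_rb -> [[110, 50]]  (bottom-right corners = top-left + width/height)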


def group_to_coordinate(group_list, matrix_lt, matrix_rb):
    matrix_box_vlist = np.array([0, 0, 0, 0])
    for element in group_list:
        if element[0] < element[1]:
            rb = matrix_rb[element[0]:element[1]].max(axis=0)
            lt = matrix_lt[element[0]:element[1]].min(axis=0)
            matrix_box = np.hstack([lt, rb])
            matrix_box_vlist = np.vstack([matrix_box_vlist, matrix_box])
    matrix_box_vlist = matrix_box_vlist[1:]
    return matrix_box_vlist.tolist()
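
# group_to_coordinate expects each element of group_list to be a (start, end)
# index pair into the word boxes (as produced by group_text); the boxes in that
# range are merged into one [left, top, right, bottom] rectangle via per-column
# min/max.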


def get_exam_box(img_raw_name, img_list, save_path, subject, access_token):
    status = 1
    error_info = ''
    box_list = []
    words_list_all = []
    group_list_all = []
    try:
        for img_part in img_list:
            x_bias = img_part['x_bias']
            y_bias = img_part['y_bias']
            img = img_part['img_part']
            words_list, matrix_lt, matrix_rb = get_ocr_text_and_coordinate(access_token, img)
            matrix_lt = matrix_lt + np.asarray([x_bias, y_bias])
            matrix_rb = matrix_rb + np.asarray([x_bias, y_bias])
            group_list = group_text(words_list, subject)
            part_box_list = group_to_coordinate(group_list, matrix_lt, matrix_rb)
            box_list = box_list + part_box_list
            words_list.append('********************************')
            words_list_all = words_list_all + words_list
            group_list_all.append(group_list)
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + ',\n' for line in words_list_all]
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
                writer.writelines(str(group_list_all))
            logger.info('{} exam: {} text info saved'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
        # record the coordinate info as XML
        tree = ET.parse(r'./segment/exam_info/000000-template.xml')  # xml tree
        for index_num, exam_bbox in enumerate(box_list):
            tree = create_xml('{:02d}'.format(index_num), tree,
                              exam_bbox[0], exam_bbox[1], exam_bbox[2], exam_bbox[3])
        # print(exam_items_bbox)
        tree.write(save_path.replace('.jpg', '.xml'))
    except Exception as e:
        logger.error('{} exam: {} failed to generate coordinates: {}'.format(subject, img_raw_name, e), exc_info=True)
        status = 0
        error_info = error_info + str(e)
    info = {'img_name': img_raw_name, 'coordinate': box_list}
    if error_info:
        info = {'img_name': img_raw_name, 'coordinate': box_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info


def get_exam_ocr(img_raw_name, img_list, save_path, subject, access_token):
    status = 1
    error_info = ''
    words_list = []
    for img_part in img_list:
        img = img_part['img_part']
        try:
            part_words_list = get_ocr_text(access_token, img, subject)
        except Exception as e:
            part_words_list = []
            error_info = error_info + str(e)
        words_list = words_list + part_words_list
    if len(words_list) < 1:
        logger.error('{} exam: {} OCR parsing failed: {}'.format(subject, img_raw_name, error_info), exc_info=True)
        status = 0
    else:
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + '\n' for line in words_list]
            # # words_list.append(group_list)
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
            logger.info('{} exam: {} text info saved'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
    info = {'img_name': img_raw_name, 'text': words_list}
    if error_info:
        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info


def get_exam_ocr_single(img_raw_name, img, save_path, subject, access_token):
    status = 1
    error_info = ''
    words_list = []
    try:
        part_words_list = get_ocr_text(access_token, img)
    except Exception as e:
        part_words_list = []
        error_info = error_info + str(e)
    words_list = words_list + part_words_list
    if len(words_list) < 1:
        logger.error('{} exam: {} OCR parsing failed: {}'.format(subject, img_raw_name, error_info), exc_info=True)
        status = 0
    else:
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + ',\n' for line in words_list]
            # # words_list.append(group_list)
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
            logger.info('{} exam: {} text info saved'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
    info = {'img_name': img_raw_name, 'text': words_list}
    if error_info:
        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info


def get_segment_by_ocr_once(opencv_img, token, subject, save_path, img_raw_name):
    img = opencv2base64(opencv_img)
    resp = get_ocr_text_and_coordinate_in_raw_format(token, img)
    if len(opencv_img.shape) == 3:
        opencv_img = cv2.cvtColor(opencv_img, cv2.COLOR_BGR2GRAY)
    test_list = get_page_text(resp['words_result'], opencv_img)
    status = 1
    error_info = ''
    box_list = []
    words_list_all = []
    group_list_all = []
    try:
        for one_page_text in test_list:
            words_list = [word.get('words') for word in one_page_text]
            matrix_lt, matrix_rb = resolve_json(one_page_text)
            group_list = group_text(words_list, subject)
            part_box_list = group_to_coordinate(group_list, matrix_lt, matrix_rb)
            box_list = box_list + part_box_list
            words_list.append('********************************')
            words_list_all = words_list_all + words_list
            group_list_all.append(group_list)
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + '\n' for line in words_list_all]
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
                writer.writelines(str(group_list_all))
            logger.info('{} exam: {} text info saved'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
        # record the coordinate info as XML
        tree = ET.parse(r'./segment/exam_info/000000-template.xml')  # xml tree
        for index_num, exam_bbox in enumerate(box_list):
            tree = create_xml('{:02d}'.format(index_num), tree,
                              exam_bbox[0], exam_bbox[1], exam_bbox[2], exam_bbox[3])
        # print(exam_items_bbox)
        tree.write(save_path.replace('.jpg', '.xml'))
    except Exception as e:
        logger.error('{} exam: {} failed to generate coordinates: {}'.format(subject, img_raw_name, e), exc_info=True)
        status = 0
        error_info = error_info + str(e)
    info = {'img_name': img_raw_name, 'coordinate': box_list}
    if error_info:
        info = {'img_name': img_raw_name, 'coordinate': box_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info


# opencv_img, token, subject, save_path, img_raw_name
def get_exam_ocr_once(opencv_img, token, subject, save_path, img_raw_name):
    img = opencv2base64(opencv_img)
    resp = get_ocr_text_and_coordinate_in_raw_format(token, img)
    if len(opencv_img.shape) == 3:
        opencv_img = cv2.cvtColor(opencv_img, cv2.COLOR_BGR2GRAY)
    test_list = get_page_text(resp['words_result'], opencv_img)
    words_list = []
    for one_page_raw_text in test_list:
        one_page_words_list = [word.get('words') for word in one_page_raw_text]
        words_list = words_list + one_page_words_list
    status = 1
    error_info = ''
    if len(words_list) < 1:
        logger.error('{} exam: {} OCR parsing failed: {}'.format(subject, img_raw_name, error_info), exc_info=True)
        status = 0
    else:
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + '\n' for line in words_list]
            # # words_list.append(group_list)
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
            logger.info('{} exam: {} text info saved'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
    info = {'img_name': img_raw_name, 'text': words_list}
    if error_info:
        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info
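
# get_segment_by_ocr_once and get_exam_ocr_once above share the same OCR call;
# the former keeps the word boxes and writes grouped coordinates to an XML
# file, while the latter only backs up the recognized text lines.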


def save_pdf_image(pdf_file, subject, time_str):
    name = pdf_file.name[:-4]
    ext0 = pdf_file.name.split('.')[-1]
    raw_name0 = ''.join([''.join(i) for i in pypinyin.pinyin(name, style=pypinyin.NORMAL)])
    save_dir0 = os.sep.join(
        [settings.MEDIA_ROOT, 'ocr', subject, time_str, raw_name0 + '_{}'.format(uuid.uuid4().hex[:10])])
    if not os.path.exists(save_dir0):
        os.makedirs(save_dir0)
    pdf_path = os.sep.join([save_dir0, raw_name0 + '.' + ext0])
    with open(pdf_path, 'wb') as pdfFileObj:
        for chunk in pdf_file.chunks():
            pdfFileObj.write(chunk)
    images_list = convert_from_path(pdf_path, dpi=200, output_folder=save_dir0,
                                    output_file='image',
                                    first_page=None, last_page=None, fmt='JPEG')
    upload_img_path_list = glob.glob(os.sep.join([save_dir0, '*.jpg']))
    try:
        images_list = [cv2.cvtColor(np.asarray(ele), cv2.COLOR_RGB2BGR) for ele in images_list]
    except Exception:
        images_list = [np.asarray(ele) for ele in images_list]
    return upload_img_path_list, images_list
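
# Illustrative call of save_pdf_image (uploaded_pdf, the subject and the
# timestamp are made-up placeholders): convert_from_path writes one
# 'image*.jpg' per page into the save directory and also returns the pages,
# so both the on-disk paths and the BGR arrays come back:
#   paths, pages = save_pdf_image(uploaded_pdf, 'math', '20200101120000')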


def save_raw_image_without_segment_pdf(subject, datetime, raw_name, img_file, analysis_type):
    # Generate a random new image name and build a custom save path.
    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
    raw_img = Image.open(img_file)  # read the uploaded image
    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, file_name)
    channels = raw_img.split()
    if len(channels) > 3:
        # drop the alpha channel and keep the R, G, B channels
        img = Image.merge("RGB", (channels[0], channels[1], channels[2]))
        open_cv_image = np.array(img)
        resize_img = resize_by_percent(open_cv_image, 0.5)
    else:
        img = raw_img
        open_cv_image = np.array(img)
        resize_img = resize_by_percent(open_cv_image, 0.5)
    try:
        img.save(save_path)
        # write_single_img(resize_img, save_path.replace('.jpg', '_small.jpg'))
    except Exception as e:
        raise e
    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
    return save_path, url_path, open_cv_image


def get_exam_ocr_by_penguin(img_raw_name, raw_image, size, save_path, subject):
    status = 1
    error_info = ''
    words_list = []
    try:
        words_list = get_ocr_english_text(raw_image, size)
    except Exception as e:
        error_info = error_info + str(e)
    if len(words_list) < 1:
        logger.error('{} exam: {} OCR parsing failed: {}'.format(subject, img_raw_name, error_info), exc_info=True)
        status = 0
    else:
        try:
            txt_backup_path = save_path.replace('.jpg', '.txt')
            words_list = [line + '\n' for line in words_list]
            # # words_list.append(group_list)
            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
                writer.writelines('subject:' + subject + '\n')
                writer.writelines('[\n')
                writer.writelines(words_list)
                writer.writelines(']\n')
            logger.info('{} exam: {} text info saved'.format(subject, img_raw_name))
        except Exception as e:
            logger.error('{} exam: {} failed to save text info: {}'.format(subject, img_raw_name, e), exc_info=True)
    info = {'img_name': img_raw_name, 'text': words_list}
    if error_info:
        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
    logger.info('{} done'.format(img_raw_name))
    return status, info