baidu_ocr.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. import os
  2. import time
  3. import datetime
  4. import re
  5. import base64
  6. import requests
  7. import my_config
  8. # def get_image_base64(img_path):
  9. # with open(img_path, "rb") as f:
  10. # base64_byte = base64.b64encode(f.read())
  11. # return base64_byte.decode('utf-8')
  12. def get_image_base64(image):
  13. """
  14. 将各来源的图片转为base64编码,先默认为本地图片路径
  15. :param image:
  16. :return:
  17. """
  18. print("image:", str(image))
  19. if re.search("^https?:", str(image)): # 远程图片
  20. # filebyte = requests.get(image).content
  21. return "online"
  22. elif re.search("^[A-H]:", str(image)): # 本地图片
  23. local_img = image
  24. filebyte = open(local_img, 'rb').read()
  25. else: # 二进制文件流
  26. filebyte = image.read()
  27. # encoded = base64.b64encode(open(local_img, 'rb').read())
  28. base64_data = base64.b64encode(filebyte).rstrip().decode('utf-8')
  29. return base64_data
  30. class BaiduRecognitionApi:
  31. def __init__(self):
  32. # client_id 为官网获取的AK, client_secret 为官网获取的SK
  33. # 谢易:
  34. # host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=OuO8mVZNIkDCP9eDQgF8txER&client_secret=2Wv28a9WBDKvkoXvcHMINNWZt8QdOzZg'
  35. # AK = "cGkyXvUHlccfVazpFNV7cRyp"
  36. # SK = "LSroI0zDzmOPZbzcs33Xb51p7oTak3NM"
  37. # host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={}&client_secret={}'.format(
  38. # AK, SK)
  39. # response = requests.get(host)
  40. # if response.status_code == 200:
  41. # result = response.json()
  42. # print(result)
  43. # self.access_token = result["access_token"]
  44. # else:
  45. # print("获取Access Token失败")
  46. # self.access_token = ""
  47. # self.access_token = "24.226efa451287e58f7fe02970a256b91c.2592000.1673679089.282335-27265553" # ZWJ
  48. # self.access_token = "24.d589893671869cd5b059c25bb567c7ca.2592000.1677722635.282335-21782130" # XY
  49. self.host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=OuO8mVZNIkDCP9eDQgF8txER&client_secret=2Wv28a9WBDKvkoXvcHMINNWZt8QdOzZg'
  50. self.last_update_time = my_config.last_token_update_time
  51. self.expires_in = my_config.bce_expires_in
  52. self.access_token = my_config.bce_access_token
  53. def update_access_token(self):
  54. """
  55. 判断是否需要更新access_token,默认 {expires_in * 0.9} 每27天更新一次
  56. 默认 expires_in = 2592000为30天,一天为86400
  57. :return:
  58. """
  59. if int(time.time() - self.last_update_time) >= self.expires_in * 0.9 or not self.access_token:
  60. try:
  61. print(self.host)
  62. response = requests.get(self.host)
  63. if response.status_code == 200:
  64. self.last_update_time = time.time()
  65. my_config.last_token_update_time = self.last_update_time # 更新配置文件里上一次更新时间
  66. result = response.json()
  67. # print(result)
  68. self.access_token = result["access_token"]
  69. self.expires_in = int(result["expires_in"])
  70. my_config.bce_access_token = self.access_token # 更新配置文件里的token
  71. my_config.bce_expires_in = self.expires_in
  72. print(f"更新access_token: {self.access_token} expires_in: {self.expires_in}")
  73. else:
  74. print("获取Access Token失败")
  75. self.access_token = ""
  76. except Exception as ex:
  77. print("获取Access Token异常", ex)
  78. self.access_token = ""
  79. else:
  80. pass
  81. def recognition(self, in_img_path):
  82. """
  83. 手写文字识别,百度提供了3种图片上传格式:image、url、pdf_file
  84. :param in_img_path: 图片路径
  85. :return:
  86. """
  87. tt0 = time.time()
  88. self.update_access_token()
  89. update_time = time.time() - tt0
  90. access_token = self.access_token
  91. if len(access_token) < 1:
  92. return
  93. request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/handwriting"
  94. tt1 = time.time()
  95. image_base64 = get_image_base64(in_img_path)
  96. print("读取图片时间:", time.time() - tt1)
  97. params = {"url": in_img_path} if image_base64 == "online" else {"image": image_base64}
  98. # access_token = '[调用鉴权接口获取的token]'
  99. request_url = request_url + "?access_token=" + access_token
  100. headers = {'content-type': 'application/x-www-form-urlencoded'}
  101. while True:
  102. post_stime = str(datetime.datetime.now())
  103. response = requests.post(request_url, data=params, headers=headers)
  104. post_cost_time = time.time() - tt1
  105. print("调接口消费时间:", post_cost_time)
  106. if response.status_code == 200:
  107. result = response.json()
  108. print(result)
  109. if "words_result" in result and result["words_result"]:
  110. res_word = ";".join([i["words"] for i in result["words_result"]])
  111. return res_word, result, post_stime, update_time
  112. elif "Open api qps request limit reached" in str(result):
  113. time.sleep(0.2)
  114. else:
  115. return "", 'ocr异常1:' + str(result), post_stime, update_time
  116. else:
  117. return "", 'ocr异常2:' + str(response.text), post_stime, update_time
  118. if __name__ == '__main__':
  119. from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
  120. api = BaiduRecognitionApi()
  121. st1 = time.time()
  122. # image_path = r"http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/ai/review/image/20220816/a33a4179f59344b1b72cce222d1b4be5.png"
  123. # image_path1 = r"http://zxhx-n-1302712961.cos.ap-beijing.myqcloud.com/PC_Prod/client_SanFang_cutimg/2974/936118420988289024/864746148036284881/27.jpg"
  124. image_path2 = r"http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/ai/review/image/20220921/a781d0b14b6f4480a75c66314ef4e945.jpg"
  125. image_path = r"http://zxhx-m-1302712961.cos.ap-nanjing.myqcloud.com/PC_Prod/client_SanFang_cutimg/2031/953235651420184576/886480770777941106/124.jpg"
  126. print(api.recognition(image_path))
  127. # def ocr_api(imp):
  128. # a = api.recognition(imp)
  129. # return a
  130. # #
  131. # with ThreadPoolExecutor(max_workers=3) as t:
  132. # all_png_info = [t.submit(ocr_api, arg) for arg in [image_path2] * 6]
  133. print("调接口时间:", time.time() - st1)
  134. # AK = "cGkyXvUHlccfVazpFNV7cRyp"
  135. # SK = "LSroI0zDzmOPZbzcs33Xb51p7oTak3NM"
  136. # host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={}&client_secret={}'.format(
  137. # AK, SK)
  138. #
  139. # # host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=【官网获取的AK】&client_secret=【官网获取的SK】'
  140. # response = requests.get(host)
  141. # if response:
  142. # print(response.json())