toTxt.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. import json
  2. import os
  3. import re
  4. import requests
  5. rootdir = ""
  6. suffixes = ".csv .doc .docx .gif .jpg .jpeg .json .html .htm .pdf .png .pptx .ps .rtf .tiff .tif .txt .wav .xlsx .xls"
  7. suffixes = suffixes.split(" ")
  8. def process(txt):
  9. txt = re.sub("\n+", "\n", txt)
  10. txt = re.sub("\n第\s*[0-9]\s*页\n", "", txt)
  11. return txt
  12. def getDocs(rootdir):
  13. c = []
  14. for parent, dirnames, filenames in os.walk(rootdir):
  15. for filename in filenames:
  16. suffix = filename.split(".")[-1]
  17. if "." + suffix in suffixes:
  18. c.append(os.path.join(parent, filename))
  19. return c
  20. def ocr_trans(upload_file, save_route):
  21. url = "http://117.50.17.141/ocr"
  22. data = {}
  23. files = {"mydata": open(upload_file, "rb")}
  24. r = requests.post(url, data, files=files)
  25. if r.status_code != 200:
  26. print(r.status_code)
  27. return False
  28. try:
  29. text = json.loads(r.text)
  30. except:
  31. print(r.text)
  32. return False
  33. if text["status"] == "ERROR":
  34. print(text)
  35. return False
  36. text = text["text"]
  37. process(text)
  38. with open(save_route, "w") as fn:
  39. fn.write(text)
  40. return True
  41. def ocr_test():
  42. url = "http://117.50.17.141/ocr"
  43. data = {}
  44. rootdir = "/".join(os.path.abspath(__file__).split("/")[:-1])
  45. filename = "BERKUT金雕雷达测速仪.doc"
  46. filename = rootdir + "/" + filename
  47. print(filename)
  48. files = {"file": open(filename, "rb")}
  49. r = requests.post(url, data, files=files)
  50. print(r.text)
  51. print(r.status_code)
  52. def ocr_transAll(upload_route=None, save_route=None):
  53. if upload_route == None:
  54. upload_route = "/".join(os.path.abspath(__file__).split("/")[:-1])
  55. if save_route == None:
  56. save_route = "/".join(os.path.abspath(__file__).split("/")[:-1])
  57. docs = getDocs(upload_route)
  58. for doc in docs:
  59. file_name = doc.split("/")[-1]
  60. file_name = ".".join(file_name.split(".")[:-1])
  61. save_name = save_route + "/" + file_name + ".txt"
  62. if not ocr_trans(doc, save_name):
  63. print(file_name + "转换失败")