123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475 |
- import json
- import os
- import re
- import requests
# Module-level root directory placeholder. The functions visible in this file
# take their own directory argument or compute a local one instead.
rootdir = ""

# File extensions (each with its leading dot) that getDocs accepts when
# collecting documents.
suffixes = [
    ".csv",
    ".doc",
    ".docx",
    ".gif",
    ".jpg",
    ".jpeg",
    ".json",
    ".html",
    ".htm",
    ".pdf",
    ".png",
    ".pptx",
    ".ps",
    ".rtf",
    ".tiff",
    ".tif",
    ".txt",
    ".wav",
    ".xlsx",
    ".xls",
]
def process(txt):
    """Clean up recognised text before it is saved.

    Two passes are applied in order:

    1. Every run of consecutive newlines is collapsed into a single newline.
    2. Page markers of the form ``第 N 页`` that sit between two newlines are
       removed together with both newlines, so the text before and after the
       marker is joined directly.

    Args:
        txt: The text to clean.

    Returns:
        The cleaned text.
    """
    # Raw strings keep "\s" a regex escape instead of an invalid string escape.
    txt = re.sub(r"\n+", "\n", txt)
    # "[0-9]+" so that multi-digit page numbers (page 10 and above) are
    # stripped as well, not only pages 0-9.
    txt = re.sub(r"\n第\s*[0-9]+\s*页\n", "", txt)
    return txt
def getDocs(rootdir):
    """Collect the paths of all files under ``rootdir`` with an accepted extension.

    The directory tree is walked recursively with ``os.walk``. A file is kept
    when its extension, compared case-insensitively, is listed in the
    module-level ``suffixes``.

    Args:
        rootdir: Directory to search.

    Returns:
        A list of paths, each built by joining the containing directory with
        the file name, in ``os.walk`` order.
    """
    # Build the lookup once; lower-casing makes ".PDF" match ".pdf".
    wanted = {s.lower() for s in suffixes}
    found = []
    for parent, _dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            # splitext returns "" for names with no extension, so a file that
            # is literally called "pdf" or "txt" is no longer picked up.
            ext = os.path.splitext(filename)[1].lower()
            if ext and ext in wanted:
                found.append(os.path.join(parent, filename))
    return found
def ocr_trans(upload_file, save_route):
    """Upload one file to the OCR endpoint and save the returned text.

    The file is posted as the multipart field ``mydata``. The response must
    have HTTP status 200 and a JSON body whose ``status`` is not ``"ERROR"``;
    its ``text`` field is cleaned with ``process`` and written to
    ``save_route``.

    Args:
        upload_file: Path of the file to upload.
        save_route: Path of the text file to write.

    Returns:
        True when the text was written. False when the status code is not
        200, the body is not valid JSON, or the service reports an error;
        the offending value is printed in each case.
    """
    url = "http://117.50.17.141/ocr"
    data = {}
    # Context manager so the upload handle is closed even if the request fails.
    with open(upload_file, "rb") as upload:
        r = requests.post(url, data, files={"mydata": upload})
    if r.status_code != 200:
        print(r.status_code)
        return False
    try:
        text = json.loads(r.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        print(r.text)
        return False
    if text["status"] == "ERROR":
        print(text)
        return False
    # Keep the cleaned result; process() returns a new string rather than
    # modifying its argument.
    text = process(text["text"])
    # Explicit encoding so the output does not depend on the platform default.
    with open(save_route, "w", encoding="utf-8") as fn:
        fn.write(text)
    return True
def ocr_test():
    """Manual smoke test: post one sample document and print the raw response.

    The sample file ``BERKUT金雕雷达测速仪.doc`` is expected next to this
    script. The file path, the response body and the HTTP status code are
    printed; nothing is returned.

    NOTE(review): this posts the file under the field name ``file`` whereas
    ``ocr_trans`` uses ``mydata`` — confirm which one the service expects.
    """
    url = "http://117.50.17.141/ocr"
    data = {}
    # os.path handles the platform's separator, unlike splitting on "/".
    rootdir = os.path.dirname(os.path.abspath(__file__))
    filename = os.path.join(rootdir, "BERKUT金雕雷达测速仪.doc")
    print(filename)
    # Context manager so the handle is closed once the request completes.
    with open(filename, "rb") as upload:
        r = requests.post(url, data, files={"file": upload})
    print(r.text)
    print(r.status_code)
def ocr_transAll(upload_route=None, save_route=None):
    """Run OCR on every accepted document under a directory.

    Each file returned by ``getDocs(upload_route)`` is passed to
    ``ocr_trans`` and its text is saved as ``<stem>.txt`` directly inside
    ``save_route``, where ``<stem>`` is the file name without its last
    extension. A failure message is printed for every file whose conversion
    returns a falsy result.

    Args:
        upload_route: Directory to scan. Defaults to this script's directory.
        save_route: Directory for the ``.txt`` outputs. Defaults to this
            script's directory.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    if upload_route is None:
        upload_route = script_dir
    if save_route is None:
        save_route = script_dir
    # Create the output directory up front so a missing folder does not
    # surface as a write error after the upload has already happened.
    if save_route:
        os.makedirs(save_route, exist_ok=True)
    for doc in getDocs(upload_route):
        file_name = os.path.splitext(os.path.basename(doc))[0]
        save_name = os.path.join(save_route, file_name + ".txt")
        if not ocr_trans(doc, save_name):
            print(file_name + "转换失败")
|