123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123 |
#!/usr/bin/env python
# -*- coding: utf-8 -*-
- from flask import Flask, render_template, request
- from datetime import timedelta
- import requests
- from utils.img2latex import get_ocrlatex_by_url
# HTTP headers for the upload request sent to `url` (see the commented-out
# requests.post call further down in this script).
# NOTE(review): 'X-USER-TOKEN' is a credential hard-coded in source. It should
# be loaded from configuration or the environment instead of being committed.
headers = {
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'X-USER-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIyMCIsInVzZXJOYW1lIjoiSFgwMDEyMTM4IiwiZXhwIjoxNTY2Mzg3ODA1LCJ1c2VySWQiOiIyMCIsImlhdCI6MTU2NjM3MzQwNX0.9w__2RGncpdMvbiQAmk75ThgPwDaTXF1VyY1xKVs_zFduUEoJ3_6X2q3ZVYsFIpChKeLEWelhwlmMxEP4lU9QA',
    'Origin': 'http://xx.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
    'Accept': '*/*',
    # Content-Type is deliberately left unset (the explicit multipart boundary
    # below was disabled in the original script).
    # 'Content-Type': 'multipart/form-data; boundary=----WebKitFormBoundarycDnnz7eKalCPv6GJ',
    'Referer': '',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}

# Endpoint the upload request is addressed to.
url = "http://192.168.1.140:18085/upload"
- import json
# Form payload for the upload request:
#   callback_url - URL passed along with the upload.
#   mydata       - UTF-8 encoded JSON list with one entry per document, each
#                  carrying a "file_type" and a "file_url".
data = {
    'callback_url': "http://zsytk.dev.xueping.com/wordapi/index/wid/9668",
    "mydata": json.dumps(
        [{"file_type": "", "file_url": "http://zsytk.dev.xueping.com/data/word/2020/08/5f327c08f3576.doc"}],
        ensure_ascii=False,  # keep non-ASCII text readable instead of \uXXXX
    ).encode("utf-8"),
}
- # print(requests.Request('POST', url, data=data, json=json).prepare().body.decode(
- # 'ascii')) # 打印字段名和类型
- # r = requests.post(url, data=data, headers=headers)
- # print(r.text)
- #
- # { %
- # for sub in item[2:5] %}
- # { % if "src" in sub %}
- # < img
- # src = "data:;base64,{{ img_stream }}" >
- # { % endif %}
- # { % endfor %}
- import re
# Experimental pattern: finds an option line (C/D/E) that runs straight into
# the next numbered sub-question. The adjacent raw strings below concatenate
# to one single pattern.
# NOTE(review): several character classes contain the same character twice
# ([..], [::], [((], [))]); possibly full-width variants were intended - confirm.
pattern_2 = re.compile(
    # group 1: option label ("C." / "C、" / "(C)", group 2) plus its text
    r"(([CDE]\s*[..、、]|\([CDE]\)).+?)"
    # one whitespace char not preceded by a colon, then "(n)" with n in 1..19
    # captured as group 3
    r"(?<![::])\s\(([1-9]|1[0-9])\)"
    # group 4: sub-question text ending in 是/为/有/等于 (group 5), followed by
    # an empty pair of parentheses and a newline
    r"(.+?([是为有]|等于)[((]\s*[))]\n)"
)

# Sample input used to try the pattern out.
w1 = "C.jfg (3)mvm等于()\n"
- # p1=re.search(r"每(小?题|空)[\u4e00-\u9fa5]{,5}(\d[\d.]{,2}\d{,2})分", "二、填空题(每空2分,共8分)")
- # print(p1)
- # p2 = re.findall("([1-9]|[1-4][0-9])[))]?\s*(?=题)", "33题")
- # print(p2)
- #
- # print(list(range(1,2)))
- # print("{0}{1}".format(3,5))
- # def fun1(nn, **kwargs):
- # print(kwargs)
- # print(kwargs['index'])
- # fun1(1, all_type=1, num=2, index=3)
- # print(2==2>3)
- # p1 = re.search("(?<![::..、、])\s+([1-9]|[1-4][0-9])\s*[..、、](?!png)", 'yy 49.png')
- # print(p1)
- # items_str = '\n1.yy22.pn'
- #
- # for no in re.findall(r'\n+\s*([1-9]|[1-4][0-9])\s*[..、、]|(?<![::..、、])\s*([1-9]|[1-4][0-9])\s*[..、、](?!png)', items_str):
- # print(no)
- #
- # con_id_line = [] # 题号的行索引,第几行
- # topicno = [] # 题号序列
- # topicno_line_dict = dict(zip([], [])) # 题号to行索引字典
- # print(len(topicno_line_dict))
- # print("$\\frac")
- #
- # w_info = re.search(' width="(\d+[.\d]*?)\s*([pxt]?)"', ' width="16"')
- # print(w_info.group(1),w_info.group(2))
- #
- # ee = [1,2,3]
- # ee=sorted(ee,reverse=True)
- # print(ee)
- # from PIL import Image
- # local_p = r"F:\zwj\Text_Structure\img_folder\5fc64a514994183dda7e74ea\new_image1624343877418770.png"
- # w = Image.open(local_p)
- # w.close()
- from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import ast  # literal_eval is used instead of eval for the debug print below

# Load one saved parse result. The context manager guarantees the handle is
# closed even if JSON decoding fails.
with open(r"F:\zwj\parse_online58\res_folder\76736f316fe30323c8ddc51cf2b3f392__2021_08_30_10_15_58__828__1870.json",
          'r', encoding='utf-8') as kk:
    con = json.load(kk)

# The item is handled as its Python repr text so that one regex pass can reach
# every <img> tag in it; the text is turned back into an object afterwards.
item_str = str(con['items'][4])

# Blank out every non-empty data-latex="$...$" attribute.
item_str = re.sub(r'(<img src=".*?".*? data-latex=)"\$.+?\$"', r'\1""', item_str)

# NOTE: literal_eval only accepts Python literals, so nothing in the loaded
# file can execute code here (eval would).
print(ast.literal_eval(item_str))
print('------------------------------------')

# Collect (whole tag prefix, src URL) for images whose data-latex is empty.
all_imgs_no_latex = re.findall(r'(<img src="(.*?)".*? data-latex="\$?\s*\$?")', item_str)

# De-duplicate while keeping first-seen order. A set would give an order that
# changes between runs, and the OCR results are matched to URLs by position.
imgurl_no_latex = list(dict.fromkeys(k[1] for k in all_imgs_no_latex))
if imgurl_no_latex:
    print(imgurl_no_latex)
print('--------------------------------')
- # mathpixs = []
- # if len(imgurl_no_latex) <= 20:
- # mathpixs.extend([get_ocrlatex_by_url(i) for i in imgurl_no_latex])
- # else:
- # if len(imgurl_no_latex) <= 50:
- # executor = ThreadPoolExecutor(max_workers=2) # 开2个线程会稍微快点
- # else:
- # executor = ThreadPoolExecutor(max_workers=4)
- # for data in executor.map(get_ocrlatex_by_url, imgurl_no_latex):
- # mathpixs.append(data)
# Canned OCR output standing in for the disabled get_ocrlatex_by_url calls.
# Entry i is the LaTeX for imgurl_no_latex[i]; an empty string means "no
# result" and the substitution loop skips that image.
mathpixs = [' 1: 1 ', ' b ', ')', ' 1: 1 ', '', ' a^{2}: b^{2} ', '(', 'R _ { 1 } = \\frac { p h } { a ^ { 2 } }', 'R _ { 2 } = \\frac { p b } { b h } = \\frac { p } { h }', ' a: b ', ' h ', ' R_{1}: R_{2}=b^{2}: a^{2} ', ' R_{2}=\\frac{\\rho h}{b^{2}} ', ' a: b ', ' a ']


def _escape_for_literal(text):
    """Escape *text* for splicing into the body of a quoted Python literal."""
    # unicode_escape doubles backslashes and escapes control / non-ASCII
    # characters; quotes are left alone, so single quotes are escaped by hand.
    escaped = text.encode('unicode_escape').decode('ascii')
    return escaped.replace("'", "\\'")


def sub1(ss):
    """Build the replacement text for one <img> tag matched by re.sub.

    The tag's empty ``data-latex`` attribute is replaced by
    ``data-type="math-ocr"`` and ``ocr-latex="$...$"``, and any existing
    `` data-type="math"`` attribute is removed.

    The LaTeX is ``mathpixs[mi]``, where ``mi`` is the module-level loop index
    of the substitution loop that calls ``re.sub`` with this function.

    Args:
        ss: match object whose group 1 is the tag text before ``data-latex``
            and whose group 2 is the rest of the tag up to its closing ``>``.

    Returns:
        The rewritten tag text.
    """
    # The text being rewritten is a Python repr that is evaluated afterwards.
    # Unescaped, "\frac" would be read back as form-feed + "rac" and "\rho" as
    # carriage-return + "ho", so the LaTeX is escaped before it is spliced in.
    latex = _escape_for_literal(mathpixs[mi])
    new_ss = ss.group(1) + ' data-type="math-ocr" ocr-latex="$' + latex + '$"' + ss.group(2)
    return new_ss.replace(' data-type="math"', "")
- # r'\1 ocr-latex="${}$"'.format(mathpixs[mi])
import ast  # literal_eval replaces eval when the item text is parsed back

# Attach the OCR LaTeX to every image that has no LaTeX yet.
# NOTE: the loop variable must stay named `mi` at module level, because sub1
# reads it to pick the matching mathpixs entry.
for mi, m in enumerate(imgurl_no_latex):
    if mi >= len(mathpixs):
        # No OCR result exists for the remaining images.
        break
    if mathpixs[mi]:
        # re.escape: URLs contain regex metacharacters ('.', '?', '+', ...)
        # that must match literally.
        item_str = re.sub(
            r'(<img src="{}".*?) data-latex="\$?\s*\$?"(.*?(?<=")\s*/?>)'.format(re.escape(m)),
            sub1, item_str)

# Turn the rewritten repr text back into an object. literal_eval accepts only
# Python literals, so text that came from a file cannot execute code here.
items_list = ast.literal_eval(item_str)
print(items_list)
|