lighttxu 4 سال پیش
کامیت
b8401ddb98
100فایلهای تغییر یافته به همراه16669 افزوده شده و 0 حذف شده
  1. 64 0
      .gitignore
  2. 4 0
      .idea/.gitignore
  3. 25 0
      .idea/dataSources.local.xml
  4. 33 0
      .idea/dataSources.xml
  5. 31 0
      .idea/deployment.xml
  6. 46 0
      .idea/exam_segment_django.iml
  7. 15 0
      .idea/inspectionProfiles/Project_Default.xml
  8. 10 0
      .idea/misc.xml
  9. 8 0
      .idea/modules.xml
  10. 7 0
      .idea/other.xml
  11. 6 0
      .idea/vcs.xml
  12. 5 0
      README.md
  13. BIN
      db.sqlite3
  14. 0 0
      exam_segment_django/__init__.py
  15. 146 0
      exam_segment_django/settings.py
  16. 28 0
      exam_segment_django/urls.py
  17. 16 0
      exam_segment_django/wsgi.py
  18. 15 0
      manage.py
  19. 0 0
      segment/__init__.py
  20. 3 0
      segment/admin.py
  21. 5 0
      segment/apps.py
  22. 14 0
      segment/exam_info/000000-template.xml
  23. 69 0
      segment/form.py
  24. 3 0
      segment/formula/__init__.py
  25. 273 0
      segment/formula/formula_segment.py
  26. 321 0
      segment/formula/formula_segment_and_show.py
  27. 32 0
      segment/formula/mathpix_ocr.py
  28. 2 0
      segment/image_operation/__init__.py
  29. 538 0
      segment/image_operation/exam_segment.py
  30. 15 0
      segment/image_operation/img_urlcode.py
  31. 298 0
      segment/image_operation/pre_segment.py
  32. 47 0
      segment/image_operation/segment.py
  33. 94 0
      segment/image_operation/split_lines.py
  34. 207 0
      segment/image_operation/utils.py
  35. 53 0
      segment/logging.conf
  36. 11 0
      segment/logging_config.py
  37. 25 0
      segment/migrations/0001_initial.py
  38. 18 0
      segment/migrations/0002_auto_20181010_1008.py
  39. 21 0
      segment/migrations/0003_ocrtoken.py
  40. 18 0
      segment/migrations/0004_auto_20181025_1329.py
  41. 18 0
      segment/migrations/0005_auto_20181025_1332.py
  42. 22 0
      segment/migrations/0006_auto_20181025_1341.py
  43. 27 0
      segment/migrations/0007_sheetbigboxes.py
  44. 40 0
      segment/migrations/0007_sheetbigboxes_sheetboxes.py
  45. 26 0
      segment/migrations/0008_sheetboxes.py
  46. 0 0
      segment/migrations/__init__.py
  47. 39 0
      segment/models.py
  48. 43 0
      segment/ocr/BD_OCR.py
  49. 2 0
      segment/ocr/__init__.py
  50. 147 0
      segment/ocr/group_pictures.py
  51. 246 0
      segment/ocr/group_text.py
  52. 0 0
      segment/ocr/luo_ocr/__init__.py
  53. 64 0
      segment/ocr/luo_ocr/ocr.py
  54. 85 0
      segment/ocr/luo_ocr/preprocess.py
  55. 67 0
      segment/ocr/luo_ocr/sheetocr.py
  56. 144 0
      segment/ocr/penguin_ocr.py
  57. 36 0
      segment/ocr/split_topic_en.py
  58. 165 0
      segment/ocr/type_config.txt
  59. 799 0
      segment/server.py
  60. 3 0
      segment/sheet_resolve/__init__.py
  61. 3 0
      segment/sheet_resolve/analysis/__init__.py
  62. 3 0
      segment/sheet_resolve/analysis/anchor/__init__.py
  63. 644 0
      segment/sheet_resolve/analysis/anchor/marker_detection.py
  64. 624 0
      segment/sheet_resolve/analysis/anchor/util.py
  65. 3 0
      segment/sheet_resolve/analysis/choice/__init__.py
  66. 95 0
      segment/sheet_resolve/analysis/choice/analysis_choice.py
  67. 490 0
      segment/sheet_resolve/analysis/choice/choice_box.py
  68. 256 0
      segment/sheet_resolve/analysis/choice/choice_line_box.py
  69. 211 0
      segment/sheet_resolve/analysis/choice/choice_m_row_column.py
  70. 496 0
      segment/sheet_resolve/analysis/choice/get_title_number_by_choice_m.py
  71. 3 0
      segment/sheet_resolve/analysis/cloze/__init__.py
  72. 101 0
      segment/sheet_resolve/analysis/cloze/analysis_cloze.py
  73. 146 0
      segment/sheet_resolve/analysis/cloze/cloze_box.py
  74. 117 0
      segment/sheet_resolve/analysis/cloze/cloze_line_box.py
  75. 3 0
      segment/sheet_resolve/analysis/correct/__init__.py
  76. 244 0
      segment/sheet_resolve/analysis/correct/coordinates_correct.py
  77. 479 0
      segment/sheet_resolve/analysis/correct/coordinates_correct_pyinstaller.py
  78. 5 0
      segment/sheet_resolve/analysis/correct/run.bat
  79. 3 0
      segment/sheet_resolve/analysis/exam_number/__init__.py
  80. 239 0
      segment/sheet_resolve/analysis/exam_number/exam_number_box.py
  81. 234 0
      segment/sheet_resolve/analysis/exam_number/exam_number_row_column.py
  82. 3 0
      segment/sheet_resolve/analysis/info_section/__init__.py
  83. 43 0
      segment/sheet_resolve/analysis/info_section/info_section.py
  84. 466 0
      segment/sheet_resolve/analysis/resolve.py
  85. 3 0
      segment/sheet_resolve/analysis/sheet/__init__.py
  86. 270 0
      segment/sheet_resolve/analysis/sheet/analysis_sheet.py
  87. 671 0
      segment/sheet_resolve/analysis/sheet/choice_infer.py
  88. 2932 0
      segment/sheet_resolve/analysis/sheet/ocr_key_words.py
  89. 218 0
      segment/sheet_resolve/analysis/sheet/ocr_sheet.py
  90. 485 0
      segment/sheet_resolve/analysis/sheet/sheet_adjust.py
  91. 1192 0
      segment/sheet_resolve/analysis/sheet/sheet_infer.py
  92. 534 0
      segment/sheet_resolve/analysis/sheet/sheet_points.py
  93. 284 0
      segment/sheet_resolve/analysis/sheet/sheet_points_total.py
  94. 3 0
      segment/sheet_resolve/analysis/solve/__init__.py
  95. 119 0
      segment/sheet_resolve/analysis/solve/mark_box.py
  96. 32 0
      segment/sheet_resolve/analysis/solve/mark_line_box.py
  97. 118 0
      segment/sheet_resolve/analysis/solve/optional_solve.py
  98. 14 0
      segment/sheet_resolve/labels/000000-template.xml
  99. 2 0
      segment/sheet_resolve/tools/__init__.py
  100. 382 0
      segment/sheet_resolve/tools/brain_api.py

+ 64 - 0
.gitignore

@@ -0,0 +1,64 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+*.pbtxt
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# model
+segment/sheet_resolve/model
+# upload images
+segment/exam_image

+ 4 - 0
.idea/.gitignore

@@ -0,0 +1,4 @@
+# Datasource local storage ignored files
+/dataSources/
+# Default ignored files
+/workspace.xml

+ 25 - 0
.idea/dataSources.local.xml

@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="dataSourceStorageLocal">
+    <data-source name="Django default" uuid="196c9e3b-952d-4e88-b2b8-8148410faad1">
+      <database-info product="SQLite" version="3.20.1" jdbc-version="2.1" driver-name="SQLite JDBC" driver-version="3.20.1.1" dbms="SQLITE" exact-version="3.20.1" />
+      <case-sensitivity plain-identifiers="mixed" quoted-identifiers="mixed" />
+      <auth-required>false</auth-required>
+      <schema-mapping>
+        <introspection-scope>
+          <node kind="schema" qname="main" />
+        </introspection-scope>
+      </schema-mapping>
+    </data-source>
+    <data-source name="db [2]" uuid="d3628523-582a-439a-a8a9-094e8504e34a">
+      <database-info product="SQLite" version="3.20.1" jdbc-version="2.1" driver-name="SQLite JDBC" driver-version="3.20.1.1" dbms="SQLITE" exact-version="3.20.1" />
+      <case-sensitivity plain-identifiers="mixed" quoted-identifiers="mixed" />
+      <auth-required>false</auth-required>
+      <schema-mapping>
+        <introspection-scope>
+          <node kind="schema" qname="@" />
+        </introspection-scope>
+      </schema-mapping>
+    </data-source>
+  </component>
+</project>

+ 33 - 0
.idea/dataSources.xml

@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
+    <data-source source="LOCAL" name="Django default" uuid="196c9e3b-952d-4e88-b2b8-8148410faad1">
+      <driver-ref>sqlite.xerial</driver-ref>
+      <synchronize>true</synchronize>
+      <imported>true</imported>
+      <remarks>$PROJECT_DIR$/exam_segment_django/settings.py</remarks>
+      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
+      <jdbc-url>jdbc:sqlite:D:\project\exam_segment_django\db.sqlite3</jdbc-url>
+      <driver-properties>
+        <property name="enable_load_extension" value="true" />
+      </driver-properties>
+    </data-source>
+    <data-source source="LOCAL" name="db [2]" uuid="d3628523-582a-439a-a8a9-094e8504e34a">
+      <driver-ref>sqlite.xerial</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
+      <jdbc-url>jdbc:sqlite:D:\project\exam-segment-django\db.sqlite3</jdbc-url>
+      <driver-properties>
+        <property name="enable_load_extension" value="true" />
+      </driver-properties>
+      <libraries>
+        <library>
+          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.20.1.1/sqlite-jdbc-3.20.1.1.jar</url>
+        </library>
+        <library>
+          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.20.1.1/xerial-sqlite-license.txt</url>
+        </library>
+      </libraries>
+    </data-source>
+  </component>
+</project>

+ 31 - 0
.idea/deployment.xml

@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="PublishConfigData" autoUpload="Always" serverName="ubuntu@192.168.1.208:22">
+    <serverData>
+      <paths name="ubuntu-station">
+        <serverdata>
+          <mappings>
+            <mapping local="D:/Anaconda3" web="/" />
+            <mapping local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
+      <paths name="ubuntu@192.168.1.167:22">
+        <serverdata>
+          <mappings>
+            <mapping local="D:/Anaconda3" web="/" />
+            <mapping local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
+      <paths name="ubuntu@192.168.1.208:22">
+        <serverdata>
+          <mappings>
+            <mapping deploy="/tmp/pycharm_project_586" local="$PROJECT_DIR$" />
+          </mappings>
+        </serverdata>
+      </paths>
+    </serverData>
+    <option name="myAutoUpload" value="ALWAYS" />
+  </component>
+</project>

+ 46 - 0
.idea/exam_segment_django.iml

@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="FacetManager">
+    <facet type="django" name="Django">
+      <configuration>
+        <option name="rootFolder" value="$MODULE_DIR$" />
+        <option name="settingsModule" value="exam_segment_django/settings.py" />
+        <option name="manageScript" value="$MODULE_DIR$/manage.py" />
+        <option name="environment" value="&lt;map/&gt;" />
+        <option name="doNotUseTestRunner" value="false" />
+        <option name="trackFilePattern" value="migrations" />
+      </configuration>
+    </facet>
+  </component>
+  <component name="NewModuleRootManager">
+    <content url="file://D:/Anaconda3">
+      <excludeFolder url="file://D:/Anaconda3" />
+    </content>
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/segment/exam_image" />
+      <excludeFolder url="file://$MODULE_DIR$/segment/exam_info" />
+      <excludeFolder url="file://$MODULE_DIR$/segment/sheet_resolve/images" />
+      <excludeFolder url="file://$MODULE_DIR$/segment/sheet_resolve/labels" />
+      <excludeFolder url="file://$MODULE_DIR$/segment/sheet_resolve/model" />
+      <excludeFolder url="file://$MODULE_DIR$/segment/upload_images" />
+      <excludeFolder url="file://$MODULE_DIR$/segment/xml_labels" />
+    </content>
+    <orderEntry type="jdk" jdkName="Python 3.6" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="renderExternalDocumentation" value="true" />
+  </component>
+  <component name="TemplatesService">
+    <option name="TEMPLATE_CONFIGURATION" value="Django" />
+    <option name="TEMPLATE_FOLDERS">
+      <list>
+        <option value="$MODULE_DIR$/templates" />
+      </list>
+    </option>
+  </component>
+  <component name="TestRunnerService">
+    <option name="projectConfiguration" value="pytest" />
+    <option name="PROJECT_TEST_RUNNER" value="py.test" />
+  </component>
+</module>

+ 15 - 0
.idea/inspectionProfiles/Project_Default.xml

@@ -0,0 +1,15 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="HtmlUnknownAttribute" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="myValues">
+        <value>
+          <list size="1">
+            <item index="0" class="java.lang.String" itemvalue="text-align" />
+          </list>
+        </value>
+      </option>
+      <option name="myCustomValuesEnabled" value="true" />
+    </inspection_tool>
+  </profile>
+</component>

+ 10 - 0
.idea/misc.xml

@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
+  <component name="PyCharmProfessionalAdvertiser">
+    <option name="shown" value="true" />
+  </component>
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/exam_segment_django.iml" filepath="$PROJECT_DIR$/.idea/exam_segment_django.iml" />
+    </modules>
+  </component>
+</project>

+ 7 - 0
.idea/other.xml

@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="PySciProjectComponent">
+    <option name="PY_SCI_VIEW" value="true" />
+    <option name="PY_SCI_VIEW_SUGGESTED" value="true" />
+  </component>
+</project>

+ 6 - 0
.idea/vcs.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

+ 5 - 0
README.md

@@ -0,0 +1,5 @@
+# exam-segment-django
+该项目目前实现了对试卷信息的分割:  
+* 试卷块分割,一张试卷有左右两页,分成两页(待改进);
+* 试卷文字信息识别;
+* 根据试卷中的文字信息分割题目;

BIN
db.sqlite3


+ 0 - 0
exam_segment_django/__init__.py


+ 146 - 0
exam_segment_django/settings.py

@@ -0,0 +1,146 @@
+"""
+Django settings for exam_segment_django project.
+
+Generated by 'django-admin startproject' using Django 2.1.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/2.1/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/2.1/ref/settings/
+"""
+
+import os
+
+# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/2.1/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = '3t_*ihwd7qfdwj0-j8t+#a48h$tw_*9gxpv9cjrvbai7h!0!1t'
+# SECRET_KEY = os.environ.get('DJ_SECRET_KEY')
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = ['117.50.37.71', '127.0.0.1', 'localhost',
+                 '0.0.0.0:8000', '192.168.1.208:8000', '192.168.1.208']
+
+
+# Application definition
+
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'segment.apps.SegmentConfig',
+]
+
+MIDDLEWARE = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'exam_segment_django.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [os.path.join(BASE_DIR, 'templates')],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'exam_segment_django.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/2.1/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/2.1/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/2.1/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'Asia/Shanghai'
+
+USE_I18N = True
+
+USE_L10N = True
+
+USE_TZ = False
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/2.1/howto/static-files/
+
+STATIC_URL = '/static/'
+
+MEDIA_ROOT = os.path.join(BASE_DIR, 'segment', 'exam_image').replace('\\', '/')
+MEDIA_URL = '/exam_image/'
+
+TEMPLATES_ROOT = os.path.join(BASE_DIR, 'templates').replace('\\', '/')
+
+# segment-app settings
+
+TOLERANCE_PIX_NUMBER = 1
+# NOTE(review): "RADIO" is presumably a typo for "ratio"; the name is left
+# unchanged because other modules may read it from settings.
+RESIZE_RADIO = 1.0  # (0~1]
+
+# Base URLs of the Baidu OCR REST API (aip.baidubce.com); both settings
+# currently hold the same value.
+OCR_BOX_URL = 'https://aip.baidubce.com/rest/2.0/ocr/v1/'
+OCR_URL = 'https://aip.baidubce.com/rest/2.0/ocr/v1/'
+
+# OCR_ACCURACY = 'general'
+OCR_ACCURACY = 'accurate'
+# NOTE(review): the OCR client id/secret below (including the commented-out
+# pair) are committed to version control; consider rotating them and loading
+# them from the environment instead.
+# OCR_CLIENT_ID = 'edZmhwHUTHLgrWdaxEQ72FfY'
+# OCR_CLIENT_SECRET = 'qcEtvS0dRygSF2Pa9KQjbMQcjPKBqGIE'
+OCR_CLIENT_ID = 'AVH7VGKG8QxoSotp6wG9LyZq'
+OCR_CLIENT_SECRET = 'gG7VYvBWLU8Rusnin8cS8Ta4dOckGFl6'
+# Presumably the OCR access-token refresh interval (unit not visible here) -- TODO confirm.
+OCR_TOKEN_UPDATE_DATE = 10
+
+# Presumably selects a logging profile for the segment app -- TODO confirm.
+LOGGING_TYPE = 'production'

+ 28 - 0
exam_segment_django/urls.py

@@ -0,0 +1,28 @@
+"""exam_segment_django URL Configuration
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/2.1/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  path('', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+"""
+from django.conf import settings
+from django.conf.urls import url
+from django.conf.urls.static import static
+from django.contrib import admin
+from django.urls import include
+from django.urls import path
+from django.views.static import serve
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+    path('segment/', include('segment.urls')),
+    url(r'^exam_image/(?P<path>.*)$', serve, {'document_root': settings.MEDIA_ROOT}),
+] + static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)

+ 16 - 0
exam_segment_django/wsgi.py

@@ -0,0 +1,16 @@
+"""
+WSGI config for exam_segment_django project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+# Point Django at the project settings unless the environment already names a
+# settings module.
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'exam_segment_django.settings')
+
+# Module-level WSGI callable.
+application = get_wsgi_application()

+ 15 - 0
manage.py

@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+import os
+import sys
+
+# Command-line entry point: select the project settings and hand the
+# arguments to Django's management command runner.
+if __name__ == '__main__':
+    # Keep any DJANGO_SETTINGS_MODULE that is already set in the environment.
+    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'exam_segment_django.settings')
+    try:
+        from django.core.management import execute_from_command_line
+    except ImportError as exc:
+        # Re-raise with a hint about the usual causes, chaining the original
+        # error so the real import failure stays visible.
+        raise ImportError(
+            "Couldn't import Django. Are you sure it's installed and "
+            "available on your PYTHONPATH environment variable? Did you "
+            "forget to activate a virtual environment?"
+        ) from exc
+    execute_from_command_line(sys.argv)

+ 0 - 0
segment/__init__.py


+ 3 - 0
segment/admin.py

@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.

+ 5 - 0
segment/apps.py

@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class SegmentConfig(AppConfig):
+    """Django application configuration for the ``segment`` app."""
+    name = 'segment'

+ 14 - 0
segment/exam_info/000000-template.xml

@@ -0,0 +1,14 @@
+<annotation>
+	<folder>JPEGImage</folder>
+	<filename>000001.jpg</filename>
+	<path>00</path>
+	<source>
+		<database>Unknown</database>
+	</source>
+	<size>
+		<width>1000</width>
+		<height>1000</height>
+		<depth>3</depth>
+	</size>
+	<segmented>0</segmented>
+</annotation>

+ 69 - 0
segment/form.py

@@ -0,0 +1,69 @@
+# @Author  : lightXu
+# @File    : form.py
+from django import forms
+
+# (subject id, display label) pairs used as the ``choices`` of the subject
+# selectors in the forms below.
+SUBJECT_ID = ((0, '未知学科'),
+              (3, '数学'),
+              (6, '数学(知心慧学)'),
+              (8, '英语'),
+              (9, '语文'),
+              (12, '物理'),
+              (13, '化学'),
+              (14, '生物'),
+              (15, '政治'),
+              (16, '历史'),
+              (17, '地理'),
+              (18, '理综'),
+              (19, '文综'),
+              (98, '英语-B'),
+              (99, '英语-T'),
+              )
+
+
+class UploadImageForm(forms.Form):
+    """Form with a subject selector and an exam-image upload field."""
+    # Choices offered by the subject selector.
+    subject_id_dict = SUBJECT_ID
+
+    # Required drop-down; the initial selection is subject id 3.
+    subject = forms.ChoiceField(label='科目', label_suffix=':', widget=forms.Select(),
+                                choices=subject_id_dict, initial=3, required=True, )
+    # Image upload; the widget renders the HTML ``multiple`` attribute.
+    img_data = forms.ImageField(label='试卷图片', label_suffix=':',
+                                widget=forms.ClearableFileInput(attrs={'multiple': True}))
+
+
+class UploadImageWithPaperIdForm(forms.Form):
+    """Form with a subject selector, a paper id and an exam-image upload field."""
+    # Choices offered by the subject selector.
+    subject_id_dict = SUBJECT_ID
+
+    # Required drop-down; the initial selection is subject id 3.
+    subject = forms.ChoiceField(label='科目', label_suffix=':', widget=forms.Select(),
+                                choices=subject_id_dict, initial=3, required=True, )
+    # Free-text paper identifier (no length limit is declared here).
+    paper_id = forms.CharField(label='PaperID', label_suffix=':',)
+    # Image upload; the widget renders the HTML ``multiple`` attribute.
+    img_data = forms.ImageField(label='试卷图片', label_suffix=':',
+                                widget=forms.ClearableFileInput(attrs={'multiple': True}))
+
+
+class FormulaUrlForm(forms.Form):
+    """Form with a single text field holding an image URL."""
+    img_url = forms.CharField(label='试卷URL', label_suffix=':')
+
+
+# class UploadFileForm(forms.Form):
+#     # xml_file = forms.FileField(label='XML', label_suffix=':',
+#     #                            widget=forms.ClearableFileInput(attrs={'multiple': True}))
+#     xml_file = forms.FileField(label='XML', label_suffix=':',
+#                                widget=forms.ClearableFileInput(attrs={'multiple': True}))
+
+
+class SubmitSeriesNumberForm(forms.Form):
+    """Form with a single ``series_number`` text field (at most 100 characters)."""
+    series_number = forms.CharField(label='series_number', label_suffix=':', max_length=100)
+
+
+class DownloadImage(forms.Form):
+    """Form with a single ``paper_id`` text field (at most 100 characters)."""
+    paper_id = forms.CharField(label='paper_id', label_suffix=':', max_length=100)
+
+
+class UploadFileForm(forms.Form):
+    """Form with a subject selector and a generic file upload field (labelled as a PDF)."""
+    # Choices offered by the subject selector.
+    subject_id_dict = SUBJECT_ID
+
+    # Required drop-down; the initial selection is subject id 3.
+    subject = forms.ChoiceField(label='科目', label_suffix=':', widget=forms.Select(),
+                                choices=subject_id_dict, initial=3, required=True, )
+    # Plain FileField (no image validation); the widget renders ``multiple``.
+    img_data = forms.FileField(label='试卷PDF', label_suffix=':',
+                               widget=forms.ClearableFileInput(attrs={'multiple': True}))
+
+

+ 3 - 0
segment/formula/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py.py
+# @Time    : 2019/1/24 0024 上午 11:17

+ 273 - 0
segment/formula/formula_segment.py

@@ -0,0 +1,273 @@
+# @Author  : lightXu
+# @File    : formula_segment.py
+# @Time    : 2019/1/24 0024 下午 13:24
+import time
+import re
+import copy
+import math
+import cv2
+import numpy as np
+import xml.etree.cElementTree as ET
+from segment.formula import mathpix_ocr
+from segment.server import get_ocr_text_and_coordinate_formula
+from segment.image_operation import utils
+
+
+def get_coordinates(word_res, formula_words_list):
+    res_list = []
+    for formula_raw in formula_words_list:
+        coordinates_start_index = formula_raw[1][0]
+        coordinates_end_index = formula_raw[1][1] - 1
+        coordinates_start = word_res['chars'][coordinates_start_index]['location']
+        coordinates_end = word_res['chars'][coordinates_end_index]['location']
+        coordinates = (coordinates_start['left'],  # xmin
+                       min(coordinates_start['top'], coordinates_end['top']),  # ymin
+                       coordinates_end['left'] + coordinates_end['width'],  # xmax
+                       max(coordinates_start['top'] + coordinates_start['height'],
+                           coordinates_end['top'] + coordinates_end['height']))  # ymax
+        tmp_dict = {'chars': formula_raw[0],
+                    'raw_chars': formula_raw[0],
+                    'coordinates': coordinates,
+                    'middle': (coordinates[0] + int((coordinates[2] - coordinates[0]) // 2),
+                               coordinates[1] + int((coordinates[3] - coordinates[1]) // 2))}
+        res_list.append(tmp_dict)
+    return res_list
+
+
+def generate_char(words, index_pair, zh=True):
+    if index_pair:
+        # new_words = words.copy()
+        length = index_pair[1] - index_pair[0]
+        gen = ''
+        if zh:
+            for i in range(length):
+                gen = '中' + gen
+        else:
+            for i in range(length):
+                gen = 'F' + gen
+        words = words.replace(words[index_pair[0]:index_pair[1]], gen)
+        return words
+    else:
+        return words
+
+
+def segment(img, save_path, access_token):
+    """Recognize text and formulas in ``img`` and return them grouped into lines.
+
+    The image is sent to ``get_ocr_text_and_coordinate_formula``; every OCR
+    line is split into formula-like spans and Chinese-text spans, the formula
+    boxes are merged with ``combine``, each merged box is recognized with
+    ``mathpix_ocr.mathpix_api``, and all pieces are finally ordered by the y
+    and x coordinates of their box centers.
+
+    :param img: image handed to the OCR helper and to
+        ``utils.crop_region_direct`` (presumably a numpy image array -- TODO confirm).
+    :param save_path: not used in this function body.
+    :param access_token: token forwarded to ``get_ocr_text_and_coordinate_formula``.
+    :return: ``(lines, raw_lines)``. When no vertical gap between neighbouring
+        pieces reaches the estimated character height, both are lists with one
+        string per piece; otherwise each holds one string per text line, every
+        string terminated by a newline. ``lines`` wraps formulas in
+        ``<latex>...</latex>``; ``raw_lines`` holds the unwrapped text.
+    """
+    # raw_img = img.copy()
+    # img = utils.preprocess(raw_img, None)
+
+    word_result_list = get_ocr_text_and_coordinate_formula(img, access_token)
+    formula_coordinates_dict_list = []
+    zh_coordinates_dict_list = []
+    zh_char_height = 20  # default
+    zh_char_width = 15  # default
+    zh_char_height_list = []
+    zh_char_width_list = []
+
+    # Alternation of patterns whose matches are always treated as ordinary
+    # text, never as part of a formula.
+    exclude = r'{}|{}|{}|{}|{}|{}'.format(
+        '^[((]*[\d]+[))]',
+        '[ABCD]\.',
+        '[\u4e00-\u9fa5][,;:。,;:.]',
+        '[①②③④⑤⑥⑦⑧⑨⑩]',
+        '[((][))]',
+        '[\u4e00-\u9fa5][\d]+[\u4e00-\u9fa5]')
+
+    for index, word_res in enumerate(word_result_list):
+        # NOTE(review): spaces are removed here, so indexes into ``words`` only
+        # line up with ``word_res['chars']`` if that list has no space entries -- confirm.
+        words = word_res['words'].replace(' ', '').replace('兀', 'π')  # remove spaces, baidu_api bug
+        abcd_words_m = re.finditer(exclude, words)
+        abcd_index_list = [(m.group(), m.span()) for m in abcd_words_m if m]
+
+        # Mask the excluded spans with Chinese filler characters so the
+        # formula regex below skips them.
+        words_tmp_zh = copy.copy(words)
+        for ele in abcd_index_list:
+            words_tmp_zh = generate_char(words_tmp_zh, ele[1], zh=True)
+
+        # Formula candidates: runs of characters that are neither Chinese nor
+        # one of the listed punctuation marks.
+        formula_words_m = re.finditer(r'[^\u4e00-\u9fa5._"“”]+', words_tmp_zh)
+        formula_index_list = [(m.group(), m.span()) for m in formula_words_m if m]
+        formula_list = get_coordinates(word_res, formula_index_list)
+        formula_coordinates_dict_list = formula_coordinates_dict_list + formula_list
+
+        # Mask the excluded spans with 'F' so the text regex below skips them;
+        # they are added back explicitly through ``abcd_index_list``.
+        words_tmp_formula = copy.copy(words)
+        for ele in abcd_index_list:
+            words_tmp_formula = generate_char(words_tmp_formula, ele[1], zh=False)
+        zh_words_m = re.finditer(r'[\u4e00-\u9fa5._"“”]+', words_tmp_formula)
+        zh_index_list = [(m.group(), m.span()) for m in zh_words_m if m]
+        zh_list = get_coordinates(word_res, zh_index_list + abcd_index_list)
+        zh_coordinates_dict_list = zh_coordinates_dict_list + zh_list
+
+        # Sample the size of a Chinese character when the line starts with
+        # one; re.match only matches at position 0, so ``index`` is 0 here
+        # (it also rebinds the loop variable of the same name).
+        one_zh_char_m = re.match(r'[\u4e00-\u9fa5]+', words)
+        if one_zh_char_m:
+            index = one_zh_char_m.span()[0]
+            zh_char_height_list.append(word_res['chars'][index]['location']['height'])
+            zh_char_width_list.append(word_res['chars'][index]['location']['width'])
+
+    # Replace the default character size with the mean of the sampled sizes.
+    if len(zh_char_width_list) > 0 and len(zh_char_height_list) > 0:
+        zh_char_height = np.mean(zh_char_height_list)
+        zh_char_width = np.mean(zh_char_width_list)
+
+    formula_coordinates_list = [ele['coordinates'] for ele in formula_coordinates_dict_list]
+    formula_combine_list = combine(formula_coordinates_list, zh_char_height, zh_char_width)  # Euclidean distance
+
+    formula_combine_dict_list = []
+    for i, ele in enumerate(formula_combine_list):
+        middle = (ele[0] + int((ele[2] - ele[0]) // 2), ele[1] + int((ele[3] - ele[1]) // 2))
+        ocr_region = utils.crop_region_direct(img, ele)
+        y, x = ocr_region.shape[0], ocr_region.shape[1]
+        if min(y, x) <= 50:
+            ocr_region = utils.resize_by_percent(ocr_region, 2.00)  # upscale small crops
+        try:
+            mathpix_raw_chars, latex_confidence = mathpix_ocr.mathpix_api(ocr_region)  # recognize the formula
+            render_mathpix_chars = '<latex>{}</latex>'.format(mathpix_raw_chars)
+            # Low confidence: prefer the OCR text of an un-merged box with the
+            # same coordinates, if there is one.
+            if latex_confidence < 0.2:
+                for item in formula_coordinates_dict_list:
+                    if ele == item['coordinates']:
+                        mathpix_raw_chars = item['chars']
+                        render_mathpix_chars = '<latex>{}</latex>'.format(item['chars'])
+                        break
+
+        except Exception:
+            # Best effort: if formula recognition fails, use the OCR text of a
+            # box with the same coordinates, else the placeholder 'formula'.
+            render_mathpix_chars = 'formula'
+            mathpix_raw_chars = 'formula'
+            for item in formula_coordinates_dict_list:
+                if ele == item['coordinates']:
+                    mathpix_raw_chars = item['chars']
+                    render_mathpix_chars = '<latex>{}</latex>'.format(item['chars'])
+                    break
+
+        # print(render_mathpix_chars)
+        tmp_dict = {'chars': render_mathpix_chars, 'middle': middle, 'coordinates': ele, 'raw_chars': mathpix_raw_chars}
+        formula_combine_dict_list.append(tmp_dict)
+
+    # res_dict = {'formula': formula_combine_list, 'zh_chars': zh_coordinates_dict_list}
+    all_dict_list = formula_combine_dict_list + zh_coordinates_dict_list
+
+    # Order every piece by the y coordinate of its center.
+    all_dict_list = sorted(all_dict_list, key=lambda k: k.get('middle')[1])
+
+    # Difference between adjacent center y values.
+    former = np.array([ele['middle'][1] for ele in all_dict_list[:-1]])
+    rear = np.array([ele['middle'][1] for ele in all_dict_list[1:]])
+    dif = rear - former
+    # A gap of at least one character height marks a line break (list is y-sorted).
+    split_x_index = [index for index, ele in enumerate(dif) if ele >= zh_char_height]  # sorted along the y axis
+
+    if not split_x_index:
+        # Everything is on one line: order left to right and return the pieces.
+        all_dict_list = sorted(all_dict_list, key=lambda k: k.get('middle')[0])  # sort along the x axis
+        lines = [ele['chars'] for ele in all_dict_list]
+        raw_lines = [ele['raw_chars'] for ele in all_dict_list]
+        return lines, raw_lines
+    else:
+        res_list = []
+        split_x_index = [ele + 1 for ele in split_x_index]  # shift indexes up by one
+        # Add both ends as boundaries; insert(-1, ...) does not append, so the
+        # sort/dedupe on the next line is what puts the boundaries in order.
+        split_x_index.insert(0, 0)
+        split_x_index.insert(-1, len(all_dict_list))
+        split_x_index = sorted(list(set(split_x_index)))
+        for i, split in enumerate(split_x_index[1:]):
+            one_line = all_dict_list[split_x_index[i]:split_x_index[i + 1]]
+            one_line = sorted(one_line, key=lambda k: k.get('middle')[0])  # sort along the x axis
+            res_list.append(one_line)
+
+        # Concatenate the pieces of every line and terminate it with '\n'.
+        lines = []
+        raw_lines = []
+        for ele in res_list:
+            line_chars = ''
+            raw_lines_chars = ''
+            for ele1 in ele:
+                chars = ele1['chars']
+                raw_chars = ele1['raw_chars']
+                line_chars = line_chars + chars
+                raw_lines_chars = raw_lines_chars + raw_chars
+
+            lines.append(line_chars + '\n')
+            raw_lines.append(raw_lines_chars + '\n')
+        # print(lines)
+
+        return lines, raw_lines
+
+
+def combine(formula_coordinates_list, zh_char_height, zh_char_width):
+    """Repeatedly merge formula boxes that touch or nearly touch each other.
+
+    Each box is an ``(xmin, ymin, xmax, ymax)`` tuple.  Two boxes are replaced
+    by their common bounding box when ``get_min_distance`` reports that they
+    intersect ('i'), are separated only vertically or diagonally by at most 1
+    ('h' / 'c'), or are separated only horizontally by at most
+    ``zh_char_width`` ('w').  Passes repeat until one produces no merge.
+
+    :param formula_coordinates_list: iterable of hashable box tuples.
+    :param zh_char_height: unused; kept so existing callers keep working.
+    :param zh_char_width: largest horizontal gap that still triggers a merge.
+    :return: list of merged box tuples (order is not significant).
+    """
+    # De-duplicate first: two identical boxes "merge" into themselves, which
+    # used to put the box on the delete list and drop it from the result.
+    boxes = list(dict.fromkeys(formula_coordinates_list))
+
+    # Loop instead of recursing so long merge chains cannot hit the
+    # interpreter's recursion limit.
+    while True:
+        # Stable double sort: primary key ymin, secondary key xmin.
+        boxes = sorted(sorted(boxes, key=lambda box: box[0]), key=lambda box: box[1])
+        snapshot = list(boxes)
+        absorbed = []
+        merged_any = False
+
+        for i, outer in enumerate(snapshot):  # xmin, ymin, xmax, ymax
+            for j, inner in enumerate(snapshot):  # xmin, ymin, xmax, ymax
+                if i == j:
+                    continue
+                min_distance, flag = get_min_distance(outer, inner)
+                close_enough = (flag == 'i'
+                                or (flag in ('h', 'c') and min_distance <= 1)
+                                or (flag == 'w' and min_distance <= zh_char_width))
+                if not close_enough:
+                    continue
+
+                merged_any = True
+                merged = (min(outer[0], inner[0]), min(outer[1], inner[1]),
+                          max(outer[2], inner[2]), max(outer[3], inner[3]))
+                # A box equal to the merged result fully encloses the other
+                # one and must survive; anything else is absorbed.
+                if merged != outer:
+                    absorbed.append(outer)
+                if merged != inner:
+                    absorbed.append(inner)
+                boxes.append(merged)
+
+        boxes = list(set(boxes) - set(absorbed))
+        if not merged_any:
+            return boxes
+
+
+def get_min_distance_square(coordinate1, coordinate2):
+    """Return the smallest squared Euclidean distance between the corners of two boxes.
+
+    Both arguments are ``(xmin, ymin, xmax, ymax)`` sequences; every corner of
+    the first box is compared with every corner of the second.
+    """
+    corners1 = [(x, y) for x in (coordinate1[0], coordinate1[2]) for y in (coordinate1[1], coordinate1[3])]
+    corners2 = [(x, y) for x in (coordinate2[0], coordinate2[2]) for y in (coordinate2[1], coordinate2[3])]
+    return min((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2
+               for p in corners1
+               for q in corners2)
+
+
+def get_min_distance(coordinate1, coordinate2):
+    """Classify how box 2 lies relative to box 1 and measure the gap between them.
+
+    Boxes are ``(xmin, ymin, xmax, ymax)``.  Returns ``(distance, flag)``:
+
+    * ``'c'`` - separated on both axes; distance is the Euclidean gap between
+      the two nearest corners.
+    * ``'w'`` - separated only horizontally; distance is the x gap.
+    * ``'h'`` - separated only vertically; distance is the y gap.
+    * ``'i'`` - not strictly separated on either axis; distance is 0.
+    """
+    ax_min, ay_min, ax_max, ay_max = coordinate1
+    bx_min, by_min, bx_max, by_max = coordinate2
+
+    def corner_gap(p, q):
+        return math.sqrt((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2)
+
+    b_left = bx_max < ax_min        # box 2 ends before box 1 starts on x
+    b_right = ax_max < bx_min       # box 2 starts after box 1 ends on x
+    b_smaller_y = by_max < ay_min   # box 2 ends before box 1 starts on y
+    b_larger_y = ay_max < by_min    # box 2 starts after box 1 ends on y
+
+    # Diagonal placements first, then single-axis separation, then overlap.
+    if b_larger_y and b_left:
+        return corner_gap((ax_min, ay_max), (bx_max, by_min)), 'c'
+    if b_left and b_smaller_y:
+        return corner_gap((ax_min, ay_min), (bx_max, by_max)), 'c'
+    if b_smaller_y and b_right:
+        return corner_gap((ax_max, ay_min), (bx_min, by_max)), 'c'
+    if b_right and b_larger_y:
+        return corner_gap((ax_max, ay_max), (bx_min, by_min)), 'c'
+    if b_left:
+        return ax_min - bx_max, 'w'
+    if b_right:
+        return bx_min - ax_max, 'w'
+    if b_smaller_y:
+        return ay_min - by_max, 'h'
+    if b_larger_y:
+        return by_min - ay_max, 'h'
+    return 0, 'i'

+ 321 - 0
segment/formula/formula_segment_and_show.py

@@ -0,0 +1,321 @@
+# @Author  : lightXu
+# @File    : formula_segment_and_show.py
+# @Time    : 2019/1/24 0024 下午 13:24
+import time
+import re
+import copy
+import math
+import cv2
+import numpy as np
+import xml.etree.cElementTree as ET
+from segment.formula import mathpix_ocr
+from segment.server import get_ocr_text_and_coordinate_formula
+from segment.image_operation import utils
+
+
+def get_coordinates(word_res, formula_words_list):
+    """Turn ``(text, (start, end))`` spans of one OCR line into box dictionaries.
+
+    For every span the bounding box is built from the ``location`` of the
+    first and the last character of the span inside ``word_res['chars']``.
+
+    :param word_res: OCR line; ``word_res['chars'][i]['location']`` must hold
+        ``left``, ``top``, ``width`` and ``height``.
+    :param formula_words_list: iterable of ``(text, (start, end))`` with an
+        exclusive ``end`` index.
+    :return: list of dicts with keys ``chars``, ``raw_chars``, ``coordinates``
+        (``xmin, ymin, xmax, ymax``) and ``middle`` (box centre).
+    """
+    boxes = []
+    for formula_raw in formula_words_list:
+        text, span = formula_raw[0], formula_raw[1]
+        first = word_res['chars'][span[0]]['location']
+        last = word_res['chars'][span[1] - 1]['location']
+
+        xmin = first['left']
+        ymin = min(first['top'], last['top'])
+        xmax = last['left'] + last['width']
+        ymax = max(first['top'] + first['height'], last['top'] + last['height'])
+
+        centre = (xmin + int((xmax - xmin) // 2),
+                  ymin + int((ymax - ymin) // 2))
+        boxes.append({'chars': text,
+                      'raw_chars': text,
+                      'coordinates': (xmin, ymin, xmax, ymax),
+                      'middle': centre})
+    return boxes
+
+
+def generate_char(words, index_pair, zh=True):
+    """Mask one span of ``words`` with placeholder characters of equal length.
+
+    :param words: the text to mask.
+    :param index_pair: ``(start, end)`` with exclusive ``end``; a falsy value
+        (e.g. ``()`` or ``None``) leaves ``words`` untouched.
+    :param zh: use the Chinese placeholder '中' when true, otherwise 'F'.
+    :return: ``words`` with exactly the requested span replaced.
+    """
+    if not index_pair:
+        return words
+
+    start, end = index_pair[0], index_pair[1]
+    if end <= start:
+        # Empty span: nothing to mask.
+        return words
+
+    placeholder = ('中' if zh else 'F') * (end - start)
+    # Splice by position.  ``str.replace`` on the slice text would also
+    # rewrite every other occurrence of the same substring in ``words``.
+    return words[:start] + placeholder + words[end:]
+
+
+def segment(img, save_path, access_token):
+    """Split an OCR'd image into Chinese text and formula regions and rebuild the text lines.
+
+    Steps performed here:
+
+    1. Fetch per-line OCR results via ``get_ocr_text_and_coordinate_formula``.
+    2. For every line, use regular expressions to separate formula spans
+       (anything that is not a CJK character, underscore or quote) from
+       Chinese spans; spans matching ``exclude`` are counted as Chinese text.
+    3. Merge nearby formula boxes with ``combine`` and write debug images.
+    4. Send each merged region to ``mathpix_ocr.mathpix_api``; fall back to
+       the OCR text of the box with identical coordinates when the confidence
+       is below 0.2, the result is empty or one character long, or the call raises.
+    5. Sort all pieces by the y of their centre, cut a new line wherever two
+       neighbouring centres differ by at least ``zh_char_height``, then order
+       each line by x.
+
+    :param img: image array (``img.shape[0]`` / ``img.shape[1]`` are read as
+        height / width).
+    :param save_path: path of the output image; debug images are written to
+        variants of it obtained by replacing ``'.jpg'``.
+    :param access_token: passed through to the OCR helper.
+    :return: ``(lines, raw_lines, h)`` - rendered text pieces, raw text
+        pieces and the original image height.  In the single-line case the
+        lists hold one entry per piece without a trailing newline; otherwise
+        one newline-terminated string per line.
+    """
+    # raw_img = img.copy()
+    # img = utils.preprocess(raw_img, None)
+
+    word_result_list = get_ocr_text_and_coordinate_formula(img, access_token)
+    formula_coordinates_dict_list = []
+    zh_coordinates_dict_list = []
+    zh_char_height = 20  # default
+    zh_char_width = 15  # default
+    zh_char_height_list = []
+    zh_char_width_list = []
+
+    # Patterns that look like formulas but must be treated as ordinary text.
+    exclude = r'{}|{}|{}|{}|{}|{}'.format(
+        '[ABCD]\.',  # A. B. C. D.
+        '[((][))]',  # ()
+        '^[((]*[\d]+[))]',  # (1)
+        # '[((]*[a-zA-Z]{2,}[))]',  # (km), (kg)
+        '[①②③④⑤⑥⑦⑧⑨⑩]',  # circled numbers
+        '[\u4e00-\u9fa5][,;:。,;:.]',  # CJK character followed by punctuation
+        '[\u4e00-\u9fa5][\d]+[\u4e00-\u9fa5]')  # digits between two CJK characters
+
+    for index, word_res in enumerate(word_result_list):
+        words = word_res['words'].replace(' ', '').replace('兀', 'π')  # remove spaces; baidu_api bug
+
+        abcd_words_m = re.finditer(exclude, words)
+        abcd_index_list = [(m.group(), m.span()) for m in abcd_words_m if m]
+
+        # Mask excluded spans with a CJK placeholder so the formula regex skips them.
+        words_tmp_zh = copy.copy(words)
+        for ele in abcd_index_list:
+            words_tmp_zh = generate_char(words_tmp_zh, ele[1], zh=True)
+
+        formula_words_m = re.finditer(r'[^\u4e00-\u9fa5_"“”]+', words_tmp_zh)
+        formula_index_list = [(m.group(), m.span()) for m in formula_words_m if m]
+        formula_list = get_coordinates(word_res, formula_index_list)
+        formula_coordinates_dict_list = formula_coordinates_dict_list + formula_list
+
+        # Mask excluded spans with 'F' so the Chinese regex skips them; they are
+        # added back below with their original text.
+        words_tmp_formula = copy.copy(words)
+        for ele in abcd_index_list:
+            words_tmp_formula = generate_char(words_tmp_formula, ele[1], zh=False)
+        zh_words_m = re.finditer(r'[\u4e00-\u9fa5_"“”]+', words_tmp_formula)
+        zh_index_list = [(m.group(), m.span()) for m in zh_words_m if m]
+        zh_list = get_coordinates(word_res, zh_index_list + abcd_index_list)
+        zh_coordinates_dict_list = zh_coordinates_dict_list + zh_list
+
+        # Lines that start with a CJK character contribute that character's
+        # size to the average character size.
+        # NOTE(review): this rebinds the loop variable ``index``; re.match
+        # anchors at the start, so the value is always 0 here.
+        one_zh_char_m = re.match(r'[\u4e00-\u9fa5]+', words)
+        if one_zh_char_m:
+            index = one_zh_char_m.span()[0]
+            zh_char_height_list.append(word_res['chars'][index]['location']['height'])
+            zh_char_width_list.append(word_res['chars'][index]['location']['width'])
+
+    if len(zh_char_width_list) > 0 and len(zh_char_height_list) > 0:
+        zh_char_height = np.mean(zh_char_height_list)
+        zh_char_width = np.mean(zh_char_width_list)
+
+    formula_coordinates_list = [ele['coordinates'] for ele in formula_coordinates_dict_list]
+
+    # Debug image 01: the raw formula boxes before merging.
+    temp_img = img.copy()
+    for ele in formula_coordinates_list:
+        cv2.rectangle(temp_img, (int(ele[0]), int(ele[1])), (int(ele[2]), int(ele[3])), (0, 255, 0), 1)
+    save_path0 = save_path.replace('.jpg', '_@_{:02d}.jpg'.format(1))
+    utils.write_single_img(temp_img, save_path0)
+
+    # merge formula boxes
+    formula_combine_list = combine(img, save_path, formula_coordinates_list, zh_char_height, zh_char_width, 1)  # Euclidean distance
+
+    formula_combine_dict_list = []
+    for ele in formula_combine_list:
+        middle = (ele[0] + int((ele[2] - ele[0]) // 2), ele[1] + int((ele[3] - ele[1]) // 2))
+        ocr_region = utils.crop_region_direct(img, ele)
+        y, x = ocr_region.shape[0], ocr_region.shape[1]
+        if min(y, x) <= 50:
+            ocr_region = utils.resize_by_percent(ocr_region, 1.50)  # enlarge small regions
+            # cv2.imshow('region', ocr_region)
+            # if cv2.waitKey(0) == 27:
+            #     cv2.destroyAllWindows()
+        try:
+
+            mathpix_raw_chars, latex_confidence = mathpix_ocr.mathpix_api(ocr_region)  # recognise the formula
+            render_mathpix_chars = '<img src="http://latex.codecogs.com/png.latex?{}" />'.format(mathpix_raw_chars)
+            # Low-confidence or near-empty result: prefer the OCR text of the
+            # unmerged box that has exactly these coordinates, if there is one.
+            if latex_confidence < 0.2 or mathpix_raw_chars == '' or len(mathpix_raw_chars) == 1:
+                for item in formula_coordinates_dict_list:
+                    if ele == item['coordinates']:
+                        mathpix_raw_chars = item['chars']
+                        render_mathpix_chars = '<img src="http://latex.codecogs.com/png.latex?{}" />' \
+                            .format(item['chars'])
+                        break
+        except Exception:
+            # Any failure of the formula service: use the literal 'formula'
+            # unless an unmerged box with the same coordinates supplies text.
+            render_mathpix_chars = 'formula'
+            mathpix_raw_chars = 'formula'
+            for item in formula_coordinates_dict_list:
+                if ele == item['coordinates']:
+                    mathpix_raw_chars = item['chars']
+                    render_mathpix_chars = '<img src="http://latex.codecogs.com/png.latex?{}" />' \
+                        .format(item['chars'])
+                    break
+
+        print(render_mathpix_chars)
+        tmp_dict = {'chars': render_mathpix_chars, 'middle': middle, 'coordinates': ele, 'raw_chars': mathpix_raw_chars}
+        formula_combine_dict_list.append(tmp_dict)
+
+    # res_dict = {'formula': formula_combine_list, 'zh_chars': zh_coordinates_dict_list}
+    all_dict_list = zh_coordinates_dict_list + formula_combine_dict_list
+    all_dict_list = sorted(all_dict_list, key=lambda k: k.get('middle')[1])
+
+    # differences between the centre y of neighbouring pieces
+    former = np.array([ele['middle'][1] for ele in all_dict_list[:-1]])
+    rear = np.array([ele['middle'][1] for ele in all_dict_list[1:]])
+    dif = rear - former
+    split_x_index = [index for index, ele in enumerate(dif) if ele >= zh_char_height]  # positions of line breaks along y
+
+    # resize the whole image
+
+    scale = 1
+    h, w = img.shape[0], img.shape[1]
+    if w > 1000:
+        scale = float(1000 / w)
+    elif h < 100:
+        scale = float(100 / h)
+
+    img_resize = utils.resize_by_percent(img, scale)
+    utils.write_single_img(img_resize, save_path)
+
+    if not split_x_index:
+        # Everything sits on one text line.
+        all_dict_list = sorted(all_dict_list, key=lambda k: k.get('middle')[0])  # sort along x
+        lines = [ele['chars'] for ele in all_dict_list]
+        raw_lines = [ele['raw_chars'] for ele in all_dict_list]
+
+        for ele in all_dict_list:
+            bbox = [box * scale for box in ele['coordinates']]
+            cv2.rectangle(img_resize, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 255, 0), 3)
+        utils.write_single_img(img_resize, save_path)
+
+        return lines, raw_lines, h
+    else:
+        res_list = []
+        split_x_index = [ele + 1 for ele in split_x_index]  # shift to the index of the first piece of the next line
+        split_x_index.insert(0, 0)
+        # insert(-1, ...) places the end marker before the last element; the
+        # set() + sorted() on the next line restores ascending order.
+        split_x_index.insert(-1, len(all_dict_list))
+        split_x_index = sorted(list(set(split_x_index)))
+        for i, split in enumerate(split_x_index[1:]):
+            one_line = all_dict_list[split_x_index[i]:split_x_index[i + 1]]
+            one_line = sorted(one_line, key=lambda k: k.get('middle')[0])  # sort along x
+            res_list.append(one_line)
+
+        lines = []
+        raw_lines = []
+        for ele in res_list:
+            line_chars = ''
+            raw_lines_chars = ''
+            for ele1 in ele:
+                chars = ele1['chars']
+                raw_chars = ele1['raw_chars']
+                line_chars = line_chars + chars
+                raw_lines_chars = raw_lines_chars + raw_chars
+
+                bbox = [box * scale for box in ele1['coordinates']]
+                cv2.rectangle(img_resize, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 255, 0), 1)
+
+            lines.append(line_chars + '\n')
+            raw_lines.append(raw_lines_chars + '\n')
+        utils.write_single_img(img_resize, save_path)
+        # print(lines)
+
+        return lines, raw_lines, h
+
+
+def combine(img, save_path, formula_coordinates_list, zh_char_height, zh_char_width, draw_index):
+    """Merge nearby formula boxes pass by pass, saving a debug image after every pass.
+
+    Each box is an ``(xmin, ymin, xmax, ymax)`` tuple.  Two boxes are replaced
+    by their common bounding box when ``get_min_distance`` reports that they
+    intersect ('i'), are separated only vertically or diagonally by at most 1
+    ('h' / 'c'), or are separated only horizontally by at most
+    ``zh_char_width*2//3`` ('w').  Passes repeat until one produces no merge.
+
+    After each merging pass the current boxes are drawn on a fresh copy of
+    ``img`` and written to ``save_path`` with ``'.jpg'`` replaced by
+    ``'_@_NN.jpg'`` (``NN`` = incremented ``draw_index``); the last pass writes
+    ``'_@_final.jpg'`` instead.
+
+    :param img: image array used as the drawing background (never modified).
+    :param save_path: base path for the debug images.
+    :param formula_coordinates_list: iterable of hashable box tuples.
+    :param zh_char_height: unused; kept so existing callers keep working.
+    :param zh_char_width: average character width that scales the 'w' threshold.
+    :param draw_index: number of the last debug image already written.
+    :return: list of merged box tuples (order is not significant).
+    """
+    # De-duplicate first: two identical boxes "merge" into themselves, which
+    # used to put the box on the delete list and drop it from the result.
+    boxes = list(dict.fromkeys(formula_coordinates_list))
+
+    # Loop instead of recursing so long merge chains cannot hit the
+    # interpreter's recursion limit.
+    while True:
+        img_draw = img.copy()
+        # Stable double sort: primary key ymin, secondary key xmin.
+        boxes = sorted(sorted(boxes, key=lambda box: box[0]), key=lambda box: box[1])
+        snapshot = list(boxes)
+        absorbed = []
+        merged_any = False
+
+        for i, outer in enumerate(snapshot):  # xmin, ymin, xmax, ymax
+            for j, inner in enumerate(snapshot):  # xmin, ymin, xmax, ymax
+                if i == j:
+                    continue
+                min_distance, flag = get_min_distance(outer, inner)
+                close_enough = (flag == 'i'
+                                or (flag in ('h', 'c') and min_distance <= 1)
+                                or (flag == 'w' and min_distance <= zh_char_width*2//3))
+                if not close_enough:
+                    continue
+
+                merged_any = True
+                merged = (min(outer[0], inner[0]), min(outer[1], inner[1]),
+                          max(outer[2], inner[2]), max(outer[3], inner[3]))
+                # A box equal to the merged result fully encloses the other
+                # one and must survive; anything else is absorbed.
+                if merged != outer:
+                    absorbed.append(outer)
+                if merged != inner:
+                    absorbed.append(inner)
+                boxes.append(merged)
+
+        boxes = list(set(boxes) - set(absorbed))
+
+        for ele in boxes:
+            cv2.rectangle(img_draw, (int(ele[0]), int(ele[1])), (int(ele[2]), int(ele[3])), (0, 255, 0), 1)
+
+        if not merged_any:
+            save_path_temp = save_path.replace('.jpg', '_@_final.jpg')
+            utils.write_single_img(img_draw, save_path_temp)
+            return boxes
+
+        draw_index = draw_index + 1
+        save_path_temp = save_path.replace('.jpg', '_@_{:02d}.jpg'.format(draw_index))
+        utils.write_single_img(img_draw, save_path_temp)
+
+
+def get_min_distance_square(coordinate1, coordinate2):
+    """Return the smallest squared Euclidean distance between the corners of two boxes.
+
+    Both arguments are ``(xmin, ymin, xmax, ymax)`` sequences; every corner of
+    the first box is compared with every corner of the second.
+    """
+    corners1 = [(x, y) for x in (coordinate1[0], coordinate1[2]) for y in (coordinate1[1], coordinate1[3])]
+    corners2 = [(x, y) for x in (coordinate2[0], coordinate2[2]) for y in (coordinate2[1], coordinate2[3])]
+    return min((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2
+               for p in corners1
+               for q in corners2)
+
+
+def get_min_distance(coordinate1, coordinate2):
+    """Classify how box 2 lies relative to box 1 and measure the gap between them.
+
+    Boxes are ``(xmin, ymin, xmax, ymax)``.  Returns ``(distance, flag)``:
+
+    * ``'c'`` - separated on both axes; distance is the Euclidean gap between
+      the two nearest corners.
+    * ``'w'`` - separated only horizontally; distance is the x gap.
+    * ``'h'`` - separated only vertically; distance is the y gap.
+    * ``'i'`` - not strictly separated on either axis; distance is 0.
+    """
+    ax_min, ay_min, ax_max, ay_max = coordinate1
+    bx_min, by_min, bx_max, by_max = coordinate2
+
+    def corner_gap(p, q):
+        return math.sqrt((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2)
+
+    b_left = bx_max < ax_min        # box 2 ends before box 1 starts on x
+    b_right = ax_max < bx_min       # box 2 starts after box 1 ends on x
+    b_smaller_y = by_max < ay_min   # box 2 ends before box 1 starts on y
+    b_larger_y = ay_max < by_min    # box 2 starts after box 1 ends on y
+
+    # Diagonal placements first, then single-axis separation, then overlap.
+    if b_larger_y and b_left:
+        return corner_gap((ax_min, ay_max), (bx_max, by_min)), 'c'
+    if b_left and b_smaller_y:
+        return corner_gap((ax_min, ay_min), (bx_max, by_max)), 'c'
+    if b_smaller_y and b_right:
+        return corner_gap((ax_max, ay_min), (bx_min, by_max)), 'c'
+    if b_right and b_larger_y:
+        return corner_gap((ax_max, ay_max), (bx_min, by_min)), 'c'
+    if b_left:
+        return ax_min - bx_max, 'w'
+    if b_right:
+        return bx_min - ax_max, 'w'
+    if b_smaller_y:
+        return ay_min - by_max, 'h'
+    if b_larger_y:
+        return by_min - ay_max, 'h'
+    return 0, 'i'

+ 32 - 0
segment/formula/mathpix_ocr.py

@@ -0,0 +1,32 @@
+import base64
+import requests
+import json
+import cv2
+import numpy as np
+
+
+def opecv2base64(img):
+    """JPEG-encode an image with OpenCV and return the bytes as base64 text."""
+    _, jpeg_buffer = cv2.imencode('.jpg', img)
+    return base64.b64encode(jpeg_buffer).decode('ascii')
+
+
+def mathpix_api(img, app_id=None, app_key=None):
+    """Send an image to the Mathpix ``v3/latex`` endpoint and return its LaTeX.
+
+    Credentials are resolved in this order: explicit argument, environment
+    variable (``MATHPIX_APP_ID`` / ``MATHPIX_APP_KEY``), built-in fallback.
+
+    :param img: image array accepted by ``opecv2base64``.
+    :param app_id: optional Mathpix application id.
+    :param app_key: optional Mathpix application key.
+    :return: ``(latex_styled, latex_confidence)`` taken from the JSON reply.
+    :raises requests.HTTPError: when the service answers with an error status.
+    :raises KeyError: when the reply lacks the expected fields.
+    """
+    import os  # local import keeps this block independent of the file's import list
+
+    # NOTE(review): the literals below are credentials committed to source
+    # control.  They stay only as a backward-compatible fallback and should be
+    # rotated and removed once the environment variables are deployed.
+    app_id = app_id or os.environ.get('MATHPIX_APP_ID', "1092963746_qq_com")
+    app_key = app_key or os.environ.get('MATHPIX_APP_KEY', "0c3b77b0c3720175e0ba")
+
+    image_uri = "data:image/jpg;base64," + opecv2base64(img)
+    response = requests.post("https://api.mathpix.com/v3/latex",
+                             data=json.dumps({'src': image_uri,
+                                              'formats': ['latex_normal', 'latex_styled']}),
+                             headers={"app_id": app_id, "app_key": app_key,
+                                      "Content-type": "application/json"},
+                             timeout=7)
+    # Fail with a clear HTTP error instead of an unrelated KeyError/ValueError
+    # further down when the service rejects the request.
+    response.raise_for_status()
+    payload = response.json()
+    return payload['latex_styled'], payload['latex_confidence']
+
+
+if __name__ == '__main__':
+    # Manual smoke test: reads one image from a hard-coded local Windows path
+    # and sends it to the Mathpix service; the result is discarded.
+    img_path0 = r'F:\save\img_withbgm\0003.png'
+    img0 = np.asarray(cv2.imread(img_path0))
+    mathpix_api(img0)

+ 2 - 0
segment/image_operation/__init__.py

@@ -0,0 +1,2 @@
+# @Author  : lightXu
+# @File    : __init__.py

+ 538 - 0
segment/image_operation/exam_segment.py

@@ -0,0 +1,538 @@
+import re
+import json
+import glob
+import os
+import cv2
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+# Optional whitespace followed by a captured run of digits.
+problem_number_pattern = re.compile(r'\s*(\d+)')
+# A captured run of digits.
+number_pattern = re.compile(r'(\d+)')
+# Optional whitespace, an opening parenthesis, then a captured run of digits.
+sub_problem_number_pattern = re.compile(r'\s*\((\d+)')
+
+max_number = 99     # largest problem number
+min_number = 0      # smallest problem number
+
+
+def get_respond_from_json(json_file):
+    """Read a UTF-8 encoded JSON file and return the decoded object."""
+    with open(json_file, encoding='UTF-8') as handle:
+        return json.load(handle)
+
+
+def get_number_position(words_result, max_number=max_number, left_position=0, right_position=0):
+    """Collect the lines that start with a number lying inside a horizontal band.
+
+    A line qualifies when ``problem_number_pattern`` matches at its start, the
+    number is at most ``max_number`` and the horizontal centre of its first
+    digit is at least ``left_position`` and, unless ``right_position`` is 0
+    (meaning "no right limit"), at most ``right_position``.
+
+    NOTE(review): the digit's index in ``line['words']`` is used directly as an
+    index into ``line['chars']``; this assumes both are aligned one-to-one.
+
+    :return: list of dicts with keys ``number`` (digit string), ``center``,
+        ``line`` (index into ``words_result``) and ``location`` (of the line).
+    """
+    numbers = []
+    for line_index, line in enumerate(words_result):
+        m = problem_number_pattern.match(line['words'])
+        if not m:
+            continue
+
+        first_digit = line['chars'][m.start(1)]['location']
+        number = m.group(1)
+        center = first_digit['left'] + first_digit['width'] // 2
+
+        if int(number) <= max_number and center >= left_position:
+            if right_position == 0 or center <= right_position:
+                numbers.append(
+                    {'number': number, 'center': center, 'line': line_index, 'location': line['location']})
+    return numbers
+
+
+def get_number_list(numbers, shift_limit=50):
+    """Group numbers whose horizontal centres are close to each other.
+
+    A number joins every existing group whose most recently added member has a
+    ``center`` within ``shift_limit`` of its own (it may therefore end up in
+    several groups); if it joins none, it starts a new group.
+
+    :param numbers: iterable of dicts carrying a ``center`` key.
+    :return: list of groups, each a list of the original dicts.
+    """
+    groups = []
+    for number in numbers:
+        joined = False
+        for group in groups:
+            if abs(number['center'] - group[-1]['center']) <= shift_limit:
+                group.append(number)
+                joined = True
+        if not joined:
+            groups.append([number])
+    return groups
+
+
+def get_longest_sequence(sequence, limit, type='l'):
+    """Find the longest run of consecutive elements on one side of ``limit``.
+
+    :param sequence: iterable of comparable values.
+    :param limit: threshold the values are compared with.
+    :param type: ``'l'`` selects values ``<= limit``, ``'h'`` values ``>= limit``.
+    :return: ``[start, end]`` with exclusive ``end``; ``[0, 0]`` when no element
+        qualifies.  Of several runs with the same length the first one wins.
+    :raises ValueError: for any other ``type`` (previously this surfaced as an
+        UnboundLocalError on the first element).
+    """
+    if type == 'l':
+        def qualifies(value):
+            return value <= limit
+    elif type == 'h':
+        def qualifies(value):
+            return value >= limit
+    else:
+        raise ValueError("type must be 'l' or 'h', got {!r}".format(type))
+
+    best_start, best_end = 0, 0
+    run_start, run_end = 0, 0
+    for i, value in enumerate(sequence):
+        if not qualifies(value):
+            continue
+        if i == run_end:
+            # Directly follows the current run: extend it.
+            run_end += 1
+        else:
+            # A gap was skipped: bank the finished run if it is the longest so
+            # far, then start a new run at this element.
+            if run_end - run_start > best_end - best_start:
+                best_start, best_end = run_start, run_end
+            run_start, run_end = i, i + 1
+
+    if run_end - run_start > best_end - best_start:
+        best_start, best_end = run_start, run_end
+    return [best_start, best_end]
+
+
+def get_number_sequence(numbers, max_gap=5, min_number=min_number):
+    """Placeholder for the number-continuity check.
+
+    Not implemented: the arguments are ignored and an empty list is returned.
+    """
+    return []
+
+
+def get_problem_list(number_list):
+    """Pick the group of numbers that serves as the problem-number column.
+
+    Only one rule is applied: the group whose first element has the smallest
+    ``center`` wins (the earliest group on ties).  Checks on the continuity of
+    the numbers are mentioned as ideas in the original notes but are not
+    implemented.
+
+    :param number_list: list of non-empty groups as built by ``get_number_list``.
+    :return: the selected group, or ``[]`` when ``number_list`` is empty.
+    """
+    if not number_list:
+        return []
+    return min(number_list, key=lambda group: group[0]['center'])
+
+
+def get_double_page_number(words_result, img_width, left_ratio=0.4, right_ratio=0.6):
+    """Look for digit characters stacked in the middle band of the image.
+
+    Every character that starts with a digit and whose horizontal centre lies
+    between ``left_ratio`` and ``right_ratio`` of ``img_width`` gets a
+    ``center`` key added to its dict (the input is modified in place) and is
+    grouped with ``get_number_list``.
+
+    NOTE(review): the return shape is inconsistent - ``(True, groups)`` when
+    some group has at least two members, otherwise the bare ``groups`` list.
+    It is kept as is because callers are not visible from here.
+    """
+    band_start = int(left_ratio * img_width)
+    band_end = int(right_ratio * img_width)
+
+    digits = []
+    for line in words_result:
+        for char in line['chars']:
+            box = char['location']
+            center = int(box['left']) + int(box['width']) // 2
+            if number_pattern.match(char['char']) and band_start <= center <= band_end:
+                char['center'] = center
+                digits.append(char)
+
+    groups = get_number_list(digits)
+    if any(len(group) >= 2 for group in groups):
+        return True, groups
+    return groups
+
+
+def image_projection(image, left_ratio, right_ratio, top_ratio=0.2, bottom_ratio=0.9, gap=20):
+    """Project the inverted image onto the x axis in strips of ``gap`` columns.
+
+    The image is inverted (``255 - image``), cropped to the given ratio window
+    and cut into vertical strips of width ``gap``.  For each strip the pixel
+    sum divided by the window height is stored.
+
+    :param image: array-like image; assumed 2-D grayscale - TODO confirm.
+    :param left_ratio: left edge of the window as a fraction of the width.
+    :param right_ratio: right edge of the window as a fraction of the width.
+    :param top_ratio: top edge of the window as a fraction of the height.
+    :param bottom_ratio: bottom edge of the window as a fraction of the height.
+    :param gap: strip width in pixels.
+    :return: integer array of shape ``(2, n)``; row 0 holds the per-strip
+        values, row 1 the left x coordinate of each strip.  The window must be
+        wider than ``gap`` so that at least one strip exists.
+    """
+    image = 255 - np.asarray(image)
+    height, width = image.shape[0], image.shape[1]
+    top = int(height * top_ratio)
+    bottom = int(height * bottom_ratio)
+    left = int(width * left_ratio)
+    right = int(width * right_ratio)
+
+    # The builtin ``int`` replaces the ``np.int`` alias, which NumPy removed
+    # in version 1.24.
+    positions = np.arange(left, right - gap, gap, dtype=int)
+    projection = np.zeros((2, len(positions)), dtype=int)
+    projection[1, :] = positions
+
+    strips = np.hsplit(image[top:bottom, left:projection[1, -1] + gap], projection.shape[1])
+    projection[0, :] = np.sum(np.sum(strips, axis=1), axis=1) // (bottom - top)
+    return projection
+
+
+def word_projection(words_result, image_shape, left_ratio, right_ratio, top_ratio=0.2, bottom_ratio=0.9, gap=20):
+    """Count OCR characters per vertical strip of width ``gap``.
+
+    Only lines whose ``location['top']`` lies strictly between the top and
+    bottom of the ratio window are considered.  A character is counted in the
+    strip that contains its horizontal centre.
+
+    :param words_result: OCR lines with ``location`` and ``chars`` entries.
+    :param image_shape: ``(height, width, ...)`` of the image.
+    :param left_ratio: left edge of the window as a fraction of the width.
+    :param right_ratio: right edge of the window as a fraction of the width.
+    :param top_ratio: top edge of the window as a fraction of the height.
+    :param bottom_ratio: bottom edge of the window as a fraction of the height.
+    :param gap: strip width in pixels.
+    :return: integer array of shape ``(2, n)``; row 0 holds the character
+        counts, row 1 the left x coordinate of each strip.
+    """
+    height, width = image_shape[0], image_shape[1]
+    left = int(width * left_ratio)
+    right = int(width * right_ratio)
+    top = int(height * top_ratio)
+    bottom = int(height * bottom_ratio)
+
+    # The builtin ``int`` replaces the ``np.int`` alias, which NumPy removed
+    # in version 1.24.
+    positions = np.arange(left, right - gap, gap, dtype=int)
+    word_count = np.zeros((2, len(positions)), dtype=int)
+    word_count[1, :] = positions
+    covered = gap * len(positions)  # total width covered by the strips
+
+    for line in words_result:
+        if not top < line['location']['top'] < bottom:
+            continue
+        for char in line['chars']:
+            center = char['location']['left'] + char['location']['width'] // 2
+            offset = center - left
+            # Strips start at ``left`` and are ``gap`` wide, so the strip
+            # index follows directly from the offset; no scan is needed.
+            if 0 <= offset < covered:
+                word_count[0, int(offset // gap)] += 1
+
+    return word_count
+
+
+def check_seal_line(words_result, image, type='left', gap=20):
+    """Check for a seal line on one side and return its x coordinate.
+
+    Only images that are wider than tall (``height / width < 1``) are examined;
+    otherwise 0 is returned.
+
+    :param words_result: OCR lines, passed on to the projection helpers.
+    :param image: array-like image.
+    :param type: ``'left'`` or ``'right'`` - which side to inspect.
+    :param gap: strip width used by the projections on the left side (the
+        right side uses the helpers' default).
+    :return: x coordinate of the detected seal line, or 0 when none is found.
+        NOTE(review): for any other ``type`` on a landscape image the function
+        falls through and returns ``None``.
+    """
+    projection_limit = 80
+    wc_limit = 0
+    seal_limit = 3
+
+    image = np.asarray(image)
+    height, width = image.shape[:2]
+    if height / width < 1:
+        if type == 'left':
+            #   check the left seal line
+            length_limit = 5
+
+            left_ratio = 0
+            right_ratio = 0.15
+            word_count = word_projection(
+                words_result, (height, width), left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
+            image_count = image_projection(image, left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
+            # Number of the first ``length_limit`` strips whose projection exceeds the limit.
+            seal_flag = np.sum(image_count[0, :length_limit] > projection_limit)
+
+            if seal_flag < seal_limit:
+                # judged to have no seal line
+                return 0
+            else:
+                #   positions of lines that start with a number
+                numbers = get_number_position(
+                    words_result, left_position=length_limit*gap, right_position=right_ratio*width)
+                # Right bound: the left-most such number, capped by the window edge.
+                right_flag = right_ratio * width
+                for number in numbers:
+                    right_flag = min(right_flag, number['center'])
+                # Scan right-to-left for a strip without characters inside the allowed band.
+                for i in range(word_count.shape[1]-1, -1, -1):
+                    if word_count[0, i] <= wc_limit:
+                        if length_limit*gap <= word_count[1, i] <= right_flag:
+                            return word_count[1, i]
+                return length_limit * gap
+        elif type == 'right':
+            #   check the right seal line
+
+            left_ratio = 0.85
+            right_ratio = 1
+            word_count = word_projection(words_result, (height, width), left_ratio=left_ratio, right_ratio=right_ratio)
+            image_count = image_projection(image, left_ratio=left_ratio, right_ratio=right_ratio)
+
+            # seal_flag = np.sum(image_count[0, -length_limit:] > projection_limit)
+            # if seal_flag < seal_limit:
+            #     return 0
+            # else:
+            #     for i in range(word_count.shape[1]-length_limit, -1, -1):
+            #         if word_count[0, i] > wc_limit and image_count[0, i] <= projection_limit:
+            #             return word_count[1, i] + 2 * gap
+            #     return width - length_limit * gap
+            # NOTE(review): for i == 0 or 1 the indices i-1 / i-2 are negative
+            # and wrap around to the last strips - confirm this is intended.
+            # ``gap`` in the return value is this function's argument, while the
+            # projections above were built with the helpers' default gap.
+            for i in range(word_count.shape[1]-1, -1, -1):
+                if word_count[0, i] > wc_limit:
+                    if image_count[0, i-1] <= projection_limit and word_count[0, i-1] + word_count[0, i-2] > 0:
+                        return word_count[1, i-1] + gap
+
+            return 0
+    else:
+        return 0
+
+
+def check_double_page(words_result, image, height_to_width_ratio=1, wc_limit=2):
+    """Check whether the image holds two pages and return the x of the dividing line.
+
+    Applies only when ``height / width`` is below ``height_to_width_ratio``.
+    Characters are counted per strip in the band from 40% to 60% of the width.
+    The longest run of consecutive strips with at most ``wc_limit`` characters
+    is located, and the x position of the strip in its middle is returned.
+    Returns 0 when the image is not wide enough or no such run exists.
+    """
+    image = np.asarray(image)
+    height, width = image.shape[0], image.shape[1]
+    if not height / width < height_to_width_ratio:
+        return 0
+
+    word_count = word_projection(words_result, (height, width), left_ratio=0.4, right_ratio=0.6)
+
+    best = [0, 0]   # longest finished run so far, as [start, end)
+    run = [0, 0]    # run currently being extended
+    for i in range(word_count.shape[1]):
+        if word_count[0, i] > wc_limit:
+            continue
+        if i == run[1]:
+            run[1] += 1
+        else:
+            if run[1] - run[0] > best[1] - best[0]:
+                best = run[:]
+            run = [i, i + 1]
+
+    if run[1] - run[0] > best[1] - best[0]:
+        return word_count[1, (run[0] + run[1]) // 2]
+    if best[1]:
+        return word_count[1, (best[0] + best[1]) // 2]
+    return 0
+    #     for i in range(word_count.shape[1]//2):
+    #         kplus = word_count.shape[1]//2 + i
+    #         kminus = word_count.shape[1]//2 - i
+    #         if word_count[0, kplus] <= wc_limit:
+    #             return word_count[1, kplus]
+    #         elif word_count[0, kminus] <= wc_limit:
+    #             return word_count[1, kminus]
+    # return 0
+
+
+def get_line_from_chars(chars):
+    """Compute the bounding box of all characters of one line.
+
+    :param chars: sequence of dicts whose ``location`` holds ``left``, ``top``,
+        ``width`` and ``height``.
+    :return: ``{'width', 'top', 'left', 'height'}`` of the enclosing box, or
+        ``{}`` for an empty input.
+    """
+    if not chars:
+        return {}
+
+    boxes = [char['location'] for char in chars]
+    xmin = min(box['left'] for box in boxes)
+    ymin = min(box['top'] for box in boxes)
+    xmax = max(box['left'] + box['width'] for box in boxes)
+    ymax = max(box['top'] + box['height'] for box in boxes)
+    return {'width': xmax - xmin, 'top': ymin, 'left': xmin, 'height': ymax - ymin}
+
+
+def get_box_from_lines(lines):
+    """Compute the box that encloses the ``location`` of every given line.
+
+    :param lines: sequence of dicts whose ``location`` holds ``left``, ``top``,
+        ``width`` and ``height``.
+    :return: ``{'xmin', 'ymin', 'xmax', 'ymax'}``, or ``{}`` for an empty input.
+    """
+    if not lines:
+        return {}
+
+    boxes = [line['location'] for line in lines]
+    return {'xmin': min(box['left'] for box in boxes),
+            'ymin': min(box['top'] for box in boxes),
+            'xmax': max(box['left'] + box['width'] for box in boxes),
+            'ymax': max(box['top'] + box['height'] for box in boxes)}
+
+
+def split_line_for_double_pages(line, split_position):
+    #   把单行按双页分割
+    char_pattern = r'\s*\S'
+    words_pattern = r''
+    odd_page_line = {}
+    even_page_line = {}
+    odd_page_chars = []
+    even_page_chars = []
+    for char in line['chars']:
+        center = char['location']['left'] + char['location']['width'] // 2
+        if center <= split_position:
+            odd_page_chars.append(char)
+        else:
+            even_page_chars.append(char)
+    words_length = len(odd_page_chars)
+    if words_length == 0:
+        even_page_line = line
+    elif len(even_page_chars) == 0:
+        odd_page_line = line
+    else:
+        odd_page_location = get_line_from_chars(odd_page_chars)
+        even_page_location = get_line_from_chars(even_page_chars)
+        for i in range(words_length):
+            words_pattern += char_pattern
+        words_pattern = re.compile(words_pattern)   # ubuntu上有问题
+        match = words_pattern.match(line['words'])
+        odd_page_words = match[0]
+        even_page_words = line['words'][match.end():]
+        odd_page_line = {'chars': odd_page_chars, 'location': odd_page_location, 'words': odd_page_words}
+        if even_page_words:
+            even_page_line = {'chars': even_page_chars, 'location': even_page_location, 'words': even_page_words}
+    return odd_page_line, even_page_line
+
+
+def get_double_page_text(words_result, split_position):
+    #   把文本按双页分割
+    odd_page = []
+    even_page = []
+    for line in words_result:
+        if line['location']['left'] + line['location']['width'] // 2 >= split_position:
+            even_page.append(line)
+        else:
+            odd_page.append(line)
+        # else:
+        #     odd_page_line, even_page_line = split_line_for_double_pages(line, split_position)
+        #     if odd_page_line:
+        #         odd_page.append(odd_page_line)
+        #     if even_page_line:
+        #         even_page.append(even_page_line)
+    return [odd_page, even_page]
+
+
+# def get_double_page_text(words_result, split_position):
+#     odd_page = []
+#     even_page = []
+#     for line in words_result:
+#         odd_page_chars = []
+#         even_page_chars = []
+#         for char in line['chars']:
+#             center = char['location']['left'] + char['location']['width'] // 2
+#             if center <= split_position:
+#                 odd_page_chars.append(char)
+#             else:
+#                 even_page_chars.append(char)
+#         line_result = get_line_from_chars(odd_page_chars)
+#         if line_result:
+#             odd_page.append(line_result)
+#         line_result = get_line_from_chars(even_page_chars)
+#         if line_result:
+#             even_page.append(line_result)
+#     return [odd_page, even_page]
+
+
+def get_page_text(words_result, image):
+    #   除去密封线,分页,获取页面文本结果
+    left_seal_line = check_seal_line(words_result, image, type='left')
+    if left_seal_line:
+        words_result = get_double_page_text(words_result, left_seal_line)[1]
+    right_seal_line = check_seal_line(words_result, image, type='right')
+    if right_seal_line:
+        words_result = get_double_page_text(words_result, right_seal_line)[0]
+    split_position = check_double_page(words_result, image)
+    if split_position:
+        return get_double_page_text(words_result, split_position)
+    else:
+        return [words_result]
+
+
+def exam_segment(words_result):
+    #   分割试卷区域
+    numbers = get_number_position(words_result)
+    number_list = get_number_list(numbers)
+    group_list = get_problem_list(number_list)
+
+    for i in range(len(group_list)-1):
+        group_list[i].update(end_line=group_list[i+1]['line']-1)
+    if len(group_list) >= 1:
+        group_list[-1].update(end_line=len(words_result)-1)
+    for g in group_list:
+        ymin = g['location']['top']
+        ymax = words_result[g['end_line']]['location']['top'] + words_result[g['end_line']]['location']['height']
+        xmin = g['location']['left']
+        xmax = g['location']['left'] + g['location']['width']
+        for line in range(g['line'], g['end_line']+1):
+            left = words_result[line]['location']['left']
+            width = words_result[line]['location']['width']
+            if xmin > left:
+                xmin = left
+            if xmax < left + width:
+                xmax = left + width
+
+        g.update(box={'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax})
+
+    return group_list
+
+
+def show_result(img_file, debug=1):
+    image_color = cv2.imread(img_file)
+    image = cv2.cvtColor(image_color, cv2.COLOR_BGR2GRAY)
+    height = image.shape[0]
+    width = image.shape[1]
+    resp = get_respond_from_json(img_file.replace('.jpg', '_json.txt'))
+    words_result = resp['words_result']
+    print('**********{}*********'.format(os.path.split(img_file)[1]))
+
+    numbers = get_number_position(words_result)
+    number_list = get_number_list(numbers)
+    problem_list = get_problem_list(number_list)
+    group_list = exam_segment(words_result)
+    #double_page_numbers = get_double_page_number(words_result, img.shape[1])
+
+    if debug == 0:
+        for line_index in range(len(words_result)):
+            line = words_result[line_index]
+            print('**************************************')
+            print(line['words'])
+        print('************All Numbers************')
+        for num in numbers:
+            print(num)
+        for numbers in number_list:
+            print('*******Number List********')
+            for n in numbers:
+                print(n)
+    elif debug == 1:
+        print('**********Problem List*********')
+        for p in problem_list:
+            print(p)
+        print('**********Group List**********')
+        for g in group_list:
+            print(g)
+    elif debug == 2:
+        gap = 20
+        middle_word_count = word_projection(words_result, (height, width), left_ratio=0.4, right_ratio=0.6, gap=gap)
+        left_word_count = word_projection(words_result, (height, width), left_ratio=0, right_ratio=0.15, gap=gap)
+        right_word_count = word_projection(words_result, (height, width), left_ratio=0.85, right_ratio=1, gap=gap)
+
+        left_image_projection = image_projection(image, left_ratio=0, right_ratio=0.15, gap=gap)
+        middle_image_projection = image_projection(image, left_ratio=0.4, right_ratio=0.6, gap=gap)
+        right_image_projection = image_projection(image, left_ratio=0.85, right_ratio=1, gap=gap)
+        print('**********Left Projection************')
+        print(left_word_count)
+        print(left_image_projection)
+        #print(get_longest_sequence(left_word_count[0, :], 2))
+        #print(get_longest_sequence(left_image_projection[0, :], 100, type='h'))
+        print('**********Middle Projection************')
+        print(middle_word_count)
+        print(middle_image_projection)
+        print('**********Right Projection************')
+        print(right_word_count)
+        print(right_image_projection)
+        print('************Split Line****************')
+        left_p = check_seal_line(words_result, image, type='left')
+        right_p = check_seal_line(words_result, image, type='right')
+        middle_p = check_double_page(words_result, image)
+        print(left_p, middle_p, right_p)
+        cv2.line(image_color, (left_p, 0), (left_p, height), (0, 0, 255), 5)
+        cv2.line(image_color, (middle_p, 0), (middle_p, height), (0, 255, 0), 5)
+        cv2.line(image_color, (right_p, 0), (right_p, height), (255, 0, 0), 5)
+        cv2.namedWindow('image', cv2.WINDOW_NORMAL)
+        cv2.imshow('image', image_color)
+        if cv2.waitKey(0) == 27:  # press ESC to exit
+            exit(0)
+        cv2.destroyAllWindows()
+    elif debug == 3:
+        page_text = get_page_text(words_result, image)
+        if len(page_text) == 1:
+            print('*************Single Page*********')
+            for line in page_text[0]:
+                print(line['words'])
+        else:
+            print('*************Odd Page**********')
+            for line in page_text[0]:
+                print(line['words'])
+            print('************Even Page**********')
+            for line in page_text[1]:
+                print(line['words'])
+    # elif style == 4:
+    #     print('***********Page Text***********')
+    #     page_result = get_page_text(words_result, image)
+    #     if len(page_result) == 1:
+    #         print('***********Single Page***********')
+    #         for line in page_result[0]:
+    #             print(line['words'])
+    #     elif len(page_result) == 2:
+    #         print('*********Odd************')
+    #         for line in page_result[0]:
+    #             print(line['words'])
+    #         print('********Even************')
+    #         for line in page_result[1]:
+    #             print(line['words'])
+
+
+# if __name__ == "__main__":
+#     img_file = r'E:\data\test-problems\10.jpg'
+#     # show_result(img_file, debug=2)
+#     image_color = cv2.imread(img_file)
+#     image = cv2.cvtColor(image_color, cv2.COLOR_BGR2GRAY)
+#     height = image.shape[0]
+#     width = image.shape[1]
+#     resp = get_respond_from_json(img_file.replace('.jpg', '_json.txt'))
+#     words_result = resp['words_result']
+#     print('**********{}*********'.format(os.path.split(img_file)[1]))
+#     text_list = get_page_text(words_result, image)
+#
+#     # work_dir = r'E:\data\seal_line'
+#     # for img_file in glob.glob(os.path.join(work_dir, '*.jpg')):
+#     #     show_result(img_file, style=2)

+ 15 - 0
segment/image_operation/img_urlcode.py

@@ -0,0 +1,15 @@
+# @Author  : lightXu
+# @File    : img_urlcode.py
+import base64
+
+
+def img2base64(img):
+    base64_data = base64.b64encode(img)
+    print(base64_data)
+    return base64_data
+
+
+if __name__ == '__main__':
+    img_path = r'C:\Users\Administrator\Desktop\history\0002.jpg'
+    with open(img_path, 'rb') as f:
+        img2base64(f.read())

+ 298 - 0
segment/image_operation/pre_segment.py

@@ -0,0 +1,298 @@
+# @Author  : lightXu
+# @File    : pre_segment.py
+import time
+import numpy as np
+import cv2
+from numpy import asarray
+import base64
+import scipy.signal
+
+from segment.image_operation import utils
+
+
+def hough_rotate_cv(image):
+    """ not Long time consuming, not Strong generalization ability, not high accuracy, more super parameters"""
+    img_np = utils.resize_by_percent(asarray(image), 1)
+    if len(img_np.shape) == 3:
+        img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
+    canny_image = cv2.Canny(img_np, 0, 255, apertureSize=3)
+    # cv2.imshow('canny', canny_image)
+    # cv2.waitKey(10)
+    lines = cv2.HoughLinesP(canny_image, 1, np.pi / 180, 160, minLineLength=500, maxLineGap=65)
+    # lines = cv2.HoughLines(canny_image, 1, np.pi / 180, 160, max_theta=30, min_theta=0)
+
+    # 寻找长度最长的线
+    distance = []
+    for line in lines:
+        x1, y1, x2, y2 = line[0]
+        dis = np.sqrt(pow((x2 - x1), 2) + pow((y2 - y1), 2))
+        distance.append(dis)
+    max_dis_index = distance.index(max(distance))
+    max_line = lines[max_dis_index]
+    x1, y1, x2, y2 = max_line[0]
+
+    # 获取旋转角度
+    angle = cv2.fastAtan2((y2 - y1), (x2 - x1))
+    print(angle)
+
+    if 0.5 <= angle <= 7:  # 因为识别误差问题,根据实际情况设置旋转阈值
+        centerpoint = (image.shape[1] / 2, image.shape[0] / 2)
+        rotate_mat = cv2.getRotationMatrix2D(centerpoint, angle, 1.0)  # 获取旋转矩阵
+        correct_image = cv2.warpAffine(image, rotate_mat, (image.shape[1], image.shape[0]),
+                                       borderValue=(255, 255, 255))
+
+        # cv2.imshow('test', resize_by_percent(correct_image, 0.1))
+        # cv2.waitKey(10)
+        return correct_image
+    else:
+        return image
+
+
+def array_latter_subtracts_precious(nparray):
+    array1 = nparray[:-1]
+    array2 = nparray[1:]
+    return array2 - array1
+
+
+def split_by_index(im_raw, index):
+    y_raw, x_raw, _ = im_raw.shape
+    img_left = im_raw[1:y_raw, 1:index]
+    img_right = im_raw[1:y_raw, index + 1:x_raw]
+    return img_left, img_right
+
+
+def split_img_at_middle_by_y_axis(img_path, radio=0.10, thresh_std=5000):
+    im_raw = utils.read_img(img_path)
+    im_resize = utils.resize_by_percent(im_raw, radio)
+    ry, rx, _ = im_resize.shape
+    img_mtx0 = np.asarray(utils.rgb2binary(im_resize))
+    y_sum_array0 = img_mtx0.sum(axis=0)
+    tmp = array_latter_subtracts_precious(y_sum_array0 / ry)
+    std0 = np.std(tmp)  # 计算标准差
+
+    # # plt.bar(range(len(y_sum_array0)), y_sum_array0)
+    # # plt.show()
+    # plt.plot(range(len(y_sum_array0)-1), tmp)
+    # plt.show()
+
+    y, x, _z = im_resize.shape
+    x_bias = int(x * 0.15)
+    y_bias = int(y * 0.30)
+    middle_x = int(x / 2)
+    middle_area_img = im_resize[y_bias:y, middle_x - x_bias:middle_x + x_bias]
+    img_mtx = np.asarray(utils.rgb2binary(middle_area_img))
+    y_sum_array = img_mtx.sum(axis=0)
+    std = np.std(y_sum_array)  # 计算标准差
+    y_sum_list = list(y_sum_array)
+
+    if std <= thresh_std:
+        index = y_sum_list.index(max(y_sum_list))
+    else:
+        index = y_sum_list.index(min(y_sum_list))
+    split_index = middle_x + index - int(len(y_sum_list) / 2)
+    split_index = int(split_index / radio)
+
+    y_raw, x_raw, _ = im_raw.shape
+    img_left = im_raw[1:y_raw, 1:split_index]
+    img_right = im_raw[1:y_raw, split_index + 1:x_raw]
+    left_path = img_path.replace('.jpg', '_left.jpg')
+    right_path = img_path.replace('.jpg', '_right.jpg')
+    cv2.imencode('.jpg', img_left)[1].tofile(left_path)
+    cv2.imencode('.jpg', img_right)[1].tofile(right_path)
+    print(left_path)
+    print(right_path)
+
+
+def smart_split_img_at_middle_by_x_axis(img_path, resize_radio=0.1):
+    im_raw = utils.read_img(img_path)
+    im_resize = utils.resize_by_percent(im_raw, resize_radio)
+
+    bin_img = utils.rgb2binary(im_resize)
+    ry, rx = bin_img.shape
+    img_mtx0 = np.asarray(bin_img)
+    y_sum_array0 = img_mtx0.sum(axis=0)  # y轴求和
+    subtracts_arr = np.abs(array_latter_subtracts_precious(y_sum_array0 / ry))  # 长度减1
+    subtracts_arr_index = np.argsort(subtracts_arr, kind='quicksort', order=None)
+    subtracts_arr_index = subtracts_arr_index[-10:]
+
+    index_middle_distance_list = list(np.abs(subtracts_arr_index - int(rx / 2)))
+    split_index = subtracts_arr_index[index_middle_distance_list.index(min(index_middle_distance_list))] + 1
+    split_index = int(split_index / resize_radio)
+    img_left, img_right = split_by_index(im_raw, split_index)
+    left_path = img_path.replace('.jpg', '_left.jpg')
+    right_path = img_path.replace('.jpg', '_right.jpg')
+    cv2.imencode('.jpg', img_left)[1].tofile(left_path)
+    cv2.imencode('.jpg', img_right)[1].tofile(right_path)
+    print(left_path)
+    print(right_path)
+
+
+def segment2parts_by_pix(crop_img):
+
+    p_image = utils.preprocess(crop_img)
+    height, width = p_image.shape
+    sum_x_axis = p_image.sum(axis=0) / (height*255)
+
+    # sum_x_axis = (sum_x_axis / (255*height)).astype(float)
+    kernel = np.array([-2, 0, 2])
+    sobel_filter = scipy.signal.convolve(sum_x_axis, kernel)  # 一维卷积运算
+
+    temp = np.abs(sobel_filter[1:-1])/np.max(np.abs(sobel_filter[1:-1]))
+    temp[temp < 0.6] = 0
+    temp[temp != 0] = 1
+    index = np.where(temp == 1)[0]
+
+    width1 = width // 9
+
+    intervals = [(0, width1), (4 * width1, 5 * width1), (8 * width1, width)]  # 左开右闭
+
+    index_list = []
+    for i, interval in enumerate(intervals):
+        index_sec_list = []
+        for ele in index:
+            if interval[0] < ele <= interval[1]:
+                index_sec_list.append(ele)
+
+        index_list.append(index_sec_list)
+
+    left_x_point, middle_x_point, right_x_point = 9999, 9999, 9999
+    left_del_part = (0, left_x_point)
+    middle_part = (left_x_point, middle_x_point)
+    right_part = (middle_x_point, right_x_point)
+    right_del_part = (right_x_point, width)
+
+    # left
+    if index_list[0]:
+        left_x_point = index_list[0][-1]
+        left_del_part = (0, left_x_point)
+    # middle
+    if index_list[1]:
+        value_list = [abs(sobel_filter[index]) for index in index_list[1]]
+        middle_x_point = index_list[1][value_list.index(max(value_list))]
+        middle_part = (left_x_point, middle_x_point)
+    # right
+    if index_list[2]:
+        right_x_point = index_list[2][0]
+        right_part = (middle_x_point, right_x_point)
+        right_del_part = (right_x_point, width)
+
+    split_point = sorted(list(set(sorted(list(left_del_part + middle_part + right_part + right_del_part))) - {9999}))
+
+    split_pairs = []
+    if len(split_point) > 2:
+        a = split_point[:-1]
+        b = split_point[1:]
+        for i, ele in enumerate(a):
+            if b[i] - ele > width1:
+                split_pairs.append((ele, b[i]))
+
+    return split_pairs
+
+
+def segment2parts(im_raw, save_path):
+    img_parts_dict_list = []
+
+    # randon_img = radon_rotate_ski(im_raw)
+    # 试卷顶部可能有黑边,切去3%
+    yy, xx = im_raw.shape[0], im_raw.shape[1]
+    y_crop_pix = int(yy*0.03)
+    # x_crop_pix = int(xx*0.03)
+    x_crop_pix = 0
+    im_crop = im_raw[y_crop_pix:yy-y_crop_pix, x_crop_pix:xx-x_crop_pix]
+
+    split_pairs = segment2parts_by_pix(im_crop)
+    if len(split_pairs) >= 2:
+        for index, ele in enumerate(split_pairs):
+            dst = im_raw[:, ele[0]:ele[1]]
+            save_path_final = save_path.replace('.jpg', '') + '_{}_{}_{}.jpg'.format(ele[0], 0, index)
+            cv2.imencode('.jpg', dst)[1].tofile(save_path_final)
+            image = cv2.imencode('.jpg', dst)[1]
+            base64_data = str(base64.b64encode(image))[2:-1]
+            part_dict = {'img_part': base64_data,
+                         'x_bias': ele[0] + x_crop_pix,
+                         'y_bias': 0}
+
+            img_parts_dict_list.append(part_dict)
+
+    else:
+        img = im_crop[:, split_pairs[0][0]:split_pairs[0][1]]
+        resize_ratio = 0.3
+        im_resize = utils.resize_by_percent(img, resize_ratio)
+
+        # gray
+        if len(im_resize.shape) >= 3:
+            gray_img = cv2.cvtColor(im_resize, cv2.COLOR_BGR2GRAY)
+        else:
+            gray_img = im_resize
+        ry, rx = gray_img.shape
+        # 高斯
+        glur_img = cv2.GaussianBlur(gray_img, (5, 5), 0)
+        # otsu
+        _ret, threshed_img = cv2.threshold(glur_img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+        if ry < rx:
+            x_kernel = int(10*resize_ratio)
+        else:
+            x_kernel = int(10 * resize_ratio)
+        kernel = np.ones((glur_img.shape[0], x_kernel), np.uint8)  # height, width
+        dilation = cv2.dilate(threshed_img, kernel, iterations=1)
+        # cv2.imshow(' ', dilation)
+        # if cv2.waitKey(0) == 27:
+        #     cv2.destroyAllWindows()
+
+        # _, cnts, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+        (major, minor, _) = cv2.__version__.split(".")
+        contours = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+        cnts = contours[0] if int(major) > 3 else contours[1]
+
+        box_list = [cv2.boundingRect(cnt) for cnt in cnts]
+        box_array = np.asarray(box_list)
+        box_array[:, 2] = box_array[:, 0] + box_array[:, 2]
+        box_array[:, 3] = box_array[:, 1] + box_array[:, 3]
+
+        middle_x = rx // 2
+        left_box = np.asarray([0, 0, 0, 0])
+        right_box = np.asarray([0, 0, 0, 0])
+        for box in box_array:
+            x, y, xmax, ymax = box
+            if x + (xmax-x)//2 <= middle_x:
+                left_box = np.vstack([left_box, box])
+            else:
+                right_box = np.vstack([right_box, box])
+
+        left_box_list = []
+        right_box_list = []
+        try:
+            left_box_list = left_box[1:, :][:, :2].min(axis=0).tolist() + left_box[1:, :][:, 2:].max(axis=0).tolist()
+        except Exception:
+            pass  # 单面的情况
+        try:
+            right_box_list = right_box[1:, :][:, :2].min(axis=0).tolist() + right_box[1:, :][:, 2:].max(axis=0).tolist()
+        except Exception:
+            pass
+
+        box_list = [left_box_list, right_box_list]
+
+        bias = int(70 * resize_ratio)
+        for index, box in enumerate(box_list):
+            if len(box) > 0:
+                xmin, ymin, xmax, ymax = box
+                if xmin - bias > 0:
+                    xmin = xmin - bias
+                else:
+                    xmin = 0
+
+                dst = im_crop[int(ymin / resize_ratio):int(ymax / resize_ratio),
+                      int(xmin / resize_ratio):int(xmax / resize_ratio)]
+                save_path_final = save_path.replace('.jpg', '') + '_{}_{}_{}.jpg'.format(xmin, ymin, index)
+                cv2.imencode('.jpg', dst)[1].tofile(save_path_final)
+                image = cv2.imencode('.jpg', dst)[1]
+                base64_data = str(base64.b64encode(image))[2:-1]
+                part_dict = {'img_part': base64_data,
+                             'x_bias': int(xmin/resize_ratio) + x_crop_pix + split_pairs[0][0],
+                             'y_bias': int(ymin/resize_ratio) + y_crop_pix + 0}
+                if (xmax - xmin)/resize_ratio > 100:  # 去掉竖长条
+                    img_parts_dict_list.append(part_dict)
+
+    return img_parts_dict_list
+

+ 47 - 0
segment/image_operation/segment.py

@@ -0,0 +1,47 @@
+# @Author  : lightXu
+# @File    : segment.py
+import os
+
+import xml.etree.cElementTree as ET
+
+from segment.image_operation import utils
+
+
+def joint_image(raw_img_path, bbox, lines_list):
+    lines_dir = raw_img_path.replace('.jpg', '_lines')
+    lines_file_list = os.listdir(lines_dir)
+    lines_file_list = sorted([ele.replace('jpg', '')
+                              for ele in lines_file_list if ele.endswith('.jpg')])
+
+    exam_items_bbox = []
+
+    tree = ET.parse(r'./segment/exam_info/000000-template.xml')  # xml tree
+    for index_num, j in enumerate(lines_list):
+        if j[1] == j[0]:
+            continue
+        elif j[1] - j[0] == 1:
+            index_list = lines_file_list[j[0]].split('_')
+            y_low = int(index_list[0])
+            y_high = int(index_list[1])
+            x_low = int(index_list[2])
+            x_high = int(index_list[3])
+        else:
+            index_list0 = lines_file_list[j[0]].split('_')  # [33, 37]
+            index_list1 = lines_file_list[j[1] - 1].split('_')
+            y_low = int(index_list0[0])
+            y_high = int(index_list1[1])
+
+            tmp_x_low_list = [ele.split('_')[2]
+                              for ele in lines_file_list[j[0]:j[1]]]
+            tmp_x_high_list = [ele.split('_')[3]
+                               for ele in lines_file_list[j[0]:j[1]]]
+            x_low = int(min(tmp_x_low_list))
+            x_high = int(max(tmp_x_high_list))
+        exam_bbox = [bbox[2] + x_low, bbox[0] + y_low, bbox[2] + x_high, bbox[0] + y_high]
+
+        tree = utils.create_xml('{:02d}'.format(index_num), tree,
+                                exam_bbox[0], exam_bbox[1], exam_bbox[2], exam_bbox[3])
+        exam_items_bbox.append(exam_bbox)
+    # print(exam_items_bbox)
+    tree.write(raw_img_path.replace('.jpg', '.xml'))
+    return exam_items_bbox

+ 94 - 0
segment/image_operation/split_lines.py

@@ -0,0 +1,94 @@
+# @Author  : lightXu
+# @File    : split_lines.py
+import os
+
+import cv2
+import numpy as np
+
+from segment.image_operation import utils
+
+from django.conf import settings
+
+
+def find_contours(resized_img, ex_x, ex_y):
+    threshed = utils.rgb2binary(resized_img)
+
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (ex_x, ex_y))  # 膨胀系数
+    # morphed = cv2.morphologyEx(threshed, cv2.MORPH_CLOSE, kernel)
+    morphed = cv2.dilate(threshed, kernel, iterations=1)
+
+    _, cnts, hierarchy = cv2.findContours(morphed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    cnt = sorted(cnts, key=cv2.contourArea)[-1]
+    x, y, w, h = cv2.boundingRect(cnt)
+    x = x + int(ex_x * 0.5)
+    w = w - int(ex_x * 0.5)
+    dst = threshed[y:y + h, x:x + w]
+    return dst, (y, y + h, x, x + w), cnts
+
+
+def save_lines_by_index_without_white_line(path, split_img, split_index, resize_radio):
+    img_y = split_img.shape[0]
+    img_x = split_img.shape[1]
+    lines_list = []
+    for i in range(1, len(split_index)):
+        if i % 2 != 1:
+            start0 = int((split_index[i - 1] - 2) / resize_radio)  # 0,1间隔, 交替相减a2-a1, 每行上下的白多一点
+            end0 = int((split_index[i] - 1 + 2) / resize_radio)  # 前一个索引
+            start = start0 if (start0 >= 0) else 0
+            end = end0 if (end0 <= img_y) else img_y
+            line = split_img[start:end, 1:img_x]
+            if len(line) < 1:
+                continue
+
+            _, _, cnts = find_contours(line, 500, 70)  # x轴膨胀,去掉每行的白色, 第二个参数按行膨胀,第三个参数按列膨胀
+            for cnt_id, cnt in enumerate(reversed(cnts)):
+                x, y, w, h = cv2.boundingRect(cnt)
+
+                # print(x, y, w, h)
+                if w * h > 100:
+                    cj_out = line[y:y + h, x:x + w]
+                    # line_list.append(cj_out)
+                    save_path = os.path.join(path,
+                                             '{:04d}_{:04d}_{:04d}_{:04d}_{}.jpg'.format(start, end, x, x+w, cnt_id))
+                    cv2.imencode('.jpg', cj_out)[1].tofile(save_path)
+                    # print(save_path)
+                    filename = os.path.abspath(save_path)
+                    lines_list.append(filename)
+    return lines_list
+
+
+def line_split(path, save_path, tolerance_pix_number):
+    resize_radio = settings.RESIZE_RADIO
+    images = utils.read_img(path)
+    # raw_y = images.shape[0]
+    # raw_x = images.shape[1]
+    # images = images[:raw_y, int(raw_x * 0.05):raw_x - int(raw_x * 0.05)]
+
+    resize_img = utils.resize_by_percent(images, resize_radio)
+    resize_crop_imgs, max_bbox, _ = find_contours(resize_img, 10, 200)  # y轴膨胀,整体去掉白色,去掉扫描后图像边界的黑色线条
+
+    bbox = [int(ele / resize_radio) for ele in max_bbox]
+
+    img_arr = np.asarray(resize_crop_imgs)
+    img_size = img_arr.shape
+    width = img_size[1]
+
+    sum_x_axis = img_arr.sum(axis=1) / width
+    # hei[hei <= 254] = 0  # black
+    sum_x_axis[sum_x_axis > 255 * tolerance_pix_number / width] = 1  # white
+    sum_x_axis[sum_x_axis != 1] = 0
+    sum_x_axis_list = list(sum_x_axis)
+
+    split_index0 = []
+    num = 0
+    for i, ele in enumerate(sum_x_axis_list):
+        num = num % 2
+        if ele == num:
+            # print(i)
+            num = num + 1
+            split_index0.append(i)
+
+    split_img0 = images[bbox[0]:bbox[1], bbox[2]:bbox[3]]
+    lines_list = save_lines_by_index_without_white_line(save_path, split_img0, split_index0, resize_radio)
+    return bbox, lines_list

+ 207 - 0
segment/image_operation/utils.py

@@ -0,0 +1,207 @@
+# @Author  : lightXu
+# @File    : utils.py
+import os
+
+import cv2
+import numpy as np
+import xml.etree.cElementTree as ET
+from PIL import Image
+
+
+def read_img(img_path):
+    try:
+        im = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), -1)
+    except FileNotFoundError as e:
+        raise e
+    return im
+
+
+def write_img(img_to_wwite, save_path):
+    try:
+        cv2.imencode('.jpg', img_to_wwite)[1].tofile(save_path)
+    except FileNotFoundError as e:
+        raise e
+
+
+def crop_region_direct(im, bbox):
+    xmin = bbox[0]
+    ymin = bbox[1]
+    xmax = bbox[2]
+    ymax = bbox[3]
+
+    region = im[ymin:ymax, xmin:xmax]
+    return region
+
+
+def resize_by_percent(im, percent):
+    """
+    :param im:
+    :param percent:
+    :return: resize_img
+
+    interpolation - 插值方法。共有5种:
+    1)INTER_NEAREST - 最近邻插值法
+    2)INTER_LINEAR - 双线性插值法(默认)
+    3)INTER_AREA - 基于局部像素的重采样(resampling using pixel area relation)。
+      对于图像抽取(image decimation)来说,这可能是一个更好的方法。但如果是放大图像时,它和最近邻法的效果类似。
+    4)INTER_CUBIC - 基于4x4像素邻域的3次插值法
+    5)INTER_LANCZOS4 - 基于8x8像素邻域的Lanczos插值
+    """
+
+    height = im.shape[0]
+    width = im.shape[1]
+    new_x = int(width * percent)
+    new_y = int(height * percent)
+
+    res = cv2.resize(im, (new_x, new_y), interpolation=cv2.INTER_AREA)
+
+    return res
+
+
+def resize_by_fixed_size(im, new_x, new_y):
+    """
+    :param new_y: y轴像素
+    :param new_x: x轴像素
+    :param im:
+    :return: resize_img
+
+    interpolation - 插值方法。共有5种:
+    1)INTER_NEAREST - 最近邻插值法
+    2)INTER_LINEAR - 双线性插值法(默认)
+    3)INTER_AREA - 基于局部像素的重采样(resampling using pixel area relation)。
+      对于图像抽取(image decimation)来说,这可能是一个更好的方法。但如果是放大图像时,它和最近邻法的效果类似。
+    4)INTER_CUBIC - 基于4x4像素邻域的3次插值法
+    5)INTER_LANCZOS4 - 基于8x8像素邻域的Lanczos插值
+    """
+    res = cv2.resize(im, (new_x, new_y), interpolation=cv2.INTER_AREA)
+
+    return res
+
+
+def resize_by_radio(im):
+    """
+    :param im:
+    :return: resize_img
+
+    interpolation - 插值方法。共有5种:
+    1)INTER_NEAREST - 最近邻插值法
+    2)INTER_LINEAR - 双线性插值法(默认)
+    3)INTER_AREA - 基于局部像素的重采样(resampling using pixel area relation)。
+      对于图像抽取(image decimation)来说,这可能是一个更好的方法。但如果是放大图像时,它和最近邻法的效果类似。
+    4)INTER_CUBIC - 基于4x4像素邻域的3次插值法
+    5)INTER_LANCZOS4 - 基于8x8像素邻域的Lanczos插值
+    """
+    # res = cv2.resize(im, (new_x, new_y), interpolation=cv2.INTER_AREA)
+
+    longer = 750
+    shorter = 500
+
+    im_shape = im.shape
+    im_size_min = np.min(im_shape[0:2])
+    res = im
+    if im_size_min > 500:
+        im_size_max = np.max(im_shape[0:2])
+        im_scale = float(shorter) / float(im_size_min)
+        # Prevent the biggest axis from being more than MAX_SIZE
+        if np.round(im_scale * im_size_max) > longer:
+            im_scale = float(longer) / float(im_size_max)
+        res = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
+                         interpolation=cv2.INTER_AREA)
+
+    return res
+
+
+def rgb2binary(im):
+    gray_img = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+    _ret, thresh_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+    return thresh_img
+
+
+def create_xml(obj_name, tree, xmin, ymin, xmax, ymax):
+    root = tree.getroot()
+
+    pobject = ET.SubElement(root, 'object', {})
+    pname = ET.SubElement(pobject, 'name')
+    pname.text = obj_name
+    ppose = ET.SubElement(pobject, 'pose')
+    ppose.text = 'Unspecified'
+    ptruncated = ET.SubElement(pobject, 'truncated')
+    ptruncated.text = '0'
+    pdifficult = ET.SubElement(pobject, 'difficult')
+    pdifficult.text = '0'
+    # add bndbox
+    pbndbox = ET.SubElement(pobject, 'bndbox')
+    pxmin = ET.SubElement(pbndbox, 'xmin')
+    pxmin.text = str(xmin)
+
+    pymin = ET.SubElement(pbndbox, 'ymin')
+    pymin.text = str(ymin)
+
+    pxmax = ET.SubElement(pbndbox, 'xmax')
+    pxmax.text = str(xmax)
+
+    pymax = ET.SubElement(pbndbox, 'ymax')
+    pymax.text = str(ymax)
+
+    return tree
+
+
+def preprocess(img, binary_inv=True):
+    dilate = 1
+    blur = 1
+
+    if len(img.shape) >= 3:
+        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    else:
+        gray_img = img
+
+    # # Apply dilation and erosion to remove some noise
+    if dilate != 0:
+        kernel = np.ones((dilate, dilate), np.uint8)
+        img = cv2.dilate(gray_img, kernel, iterations=1)
+        img = cv2.erode(img, kernel, iterations=1)
+
+    # Apply blur to smooth out the edges
+    if blur != 0:
+        img = cv2.GaussianBlur(img, (blur, blur), 0)
+
+    # Apply threshold to get image with only b&w (binarization)
+    if binary_inv:
+        img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+    else:
+        img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
+
+    return img
+
+
+def write_single_img(dst, save_path):
+    try:
+        cv2.imencode('.jpg', dst)[1].tofile(save_path)
+    except FileNotFoundError as e:
+        raise e
+
+
+def png2jpg(png_path):
+    try:
+        im = Image.open(png_path)
+        jpg_path = png_path.replace('.png', '.jpg')
+        bg = Image.new("RGB", im.size, (255, 255, 255))
+        bg.paste(im, im)
+        bg.save(jpg_path)
+        return jpg_path
+    except Exception as e:
+        print("PNG转换JPG 错误", e)
+
+
+def png_read(img_file):
+    raw_img = Image.open(img_file)  # 读取上传的网络图像
+
+    channels = raw_img.split()
+    if len(channels) > 3:
+        img = Image.merge("RGB", (channels[1], channels[2], channels[3]))
+        open_cv_image = np.array(img)
+
+    else:
+        img = raw_img
+        open_cv_image = np.array(img)
+    return open_cv_image

+ 53 - 0
segment/logging.conf

@@ -0,0 +1,53 @@
+[loggers]
+keys=root,test, production
+
+[handlers]
+keys=rotatingFileHandler,testHandler,consoleHandler, productionHandler
+
+[formatters]
+keys=simpleFmt
+
+[logger_root]
+level=DEBUG
+handlers=rotatingFileHandler
+
+[logger_test]
+level=DEBUG
+handlers=testHandler
+qualname=test
+propagate=0
+
+[logger_production]
+level=DEBUG
+handlers=productionHandler
+qualname=production
+propagate=0
+
+[handler_rotatingFileHandler]
+class=handlers.RotatingFileHandler
+level=INFO
+formatter=simpleFmt
+args=("./default_log.log", "a", 20*1024*1024, 10, 'utf-8')
+
+[handler_testHandler]
+class=handlers.RotatingFileHandler
+level=INFO
+formatter=simpleFmt
+args=("./test_log.log", "a", 20*1024*1024, 10, 'utf-8')
+
+[handler_consoleHandler]
+class=StreamHandler
+level=DEBUG
+formatter=simpleFmt
+args=(sys.stdout,)
+
+[handler_productionHandler]
+class=handlers.RotatingFileHandler
+level=INFO
+formatter=simpleFmt
+args=("./log.log", "a", 20*1024*1024, 10, 'utf-8')
+
+
+[formatter_simpleFmt]
+format=%(asctime)s - %(name)s - %(levelname)s - %(message)s - [%(filename)s:%(lineno)s]
+

+ 11 - 0
segment/logging_config.py

@@ -0,0 +1,11 @@
+# @Author  : lightXu
+# @File    : logging_config.py
+import logging
+import logging.config
+
+
+def getLogger(name='root'):
+    """Return the logger called ``name``, loading the logging config on first use.
+
+    The configuration file is read from ``./segment/logging.conf``, i.e. relative
+    to the current working directory.
+
+    :param name: logger name to look up; defaults to ``'root'``.
+    :return: the ``logging.Logger`` registered under ``name``.
+    """
+    CONF_LOG = "./segment/logging.conf"
+    # fileConfig() discards and rebuilds every handler, so run it once per
+    # process instead of on every lookup.
+    if not getattr(getLogger, "_configured", False):
+        # Keep loggers that were created before this first call enabled.
+        logging.config.fileConfig(CONF_LOG, disable_existing_loggers=False)
+        getLogger._configured = True
+
+    return logging.getLogger(name)

+ 25 - 0
segment/migrations/0001_initial.py

@@ -0,0 +1,25 @@
+# Generated by Django 2.1 on 2018-10-09 07:23
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Initial migration of the app: creates the ExamImage model.
+
+    initial = True
+
+    # No dependencies: this is the first migration in the chain.
+    dependencies = [
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='ExamImage',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('img_name', models.CharField(max_length=150, null=True)),
+                ('subject_id', models.IntegerField()),
+                ('subject', models.CharField(default='unknown_subject', max_length=20)),
+                ('upload_date', models.DateField(verbose_name='保存日期')),
+                ('save_path', models.CharField(max_length=150)),
+            ],
+        ),
+    ]

+ 18 - 0
segment/migrations/0002_auto_20181010_1008.py

@@ -0,0 +1,18 @@
+# Generated by Django 2.1 on 2018-10-10 02:08
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+    # Renames ExamImage.img_name to raw_name.
+
+    dependencies = [
+        ('segment', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='examimage',
+            old_name='img_name',
+            new_name='raw_name',
+        ),
+    ]

+ 21 - 0
segment/migrations/0003_ocrtoken.py

@@ -0,0 +1,21 @@
+# Generated by Django 2.1 on 2018-10-25 02:43
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Creates the OcrToken model (upload date plus a nullable access token).
+
+    dependencies = [
+        ('segment', '0002_auto_20181010_1008'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='OcrToken',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('upload_date', models.DateField(verbose_name='注册日期')),
+                ('access_token', models.CharField(max_length=150, null=True)),
+            ],
+        ),
+    ]

+ 18 - 0
segment/migrations/0004_auto_20181025_1329.py

@@ -0,0 +1,18 @@
+# Generated by Django 2.1 on 2018-10-25 05:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Changes OcrToken.upload_date to a DateTimeField.
+
+    dependencies = [
+        ('segment', '0003_ocrtoken'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='ocrtoken',
+            name='upload_date',
+            field=models.DateTimeField(verbose_name='注册日期'),
+        ),
+    ]

+ 18 - 0
segment/migrations/0005_auto_20181025_1332.py

@@ -0,0 +1,18 @@
+# Generated by Django 2.1 on 2018-10-25 05:32
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Adds auto_now=True to OcrToken.upload_date.
+
+    dependencies = [
+        ('segment', '0004_auto_20181025_1329'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='ocrtoken',
+            name='upload_date',
+            field=models.DateTimeField(auto_now=True, verbose_name='注册日期'),
+        ),
+    ]

+ 22 - 0
segment/migrations/0006_auto_20181025_1341.py

@@ -0,0 +1,22 @@
+# Generated by Django 2.1 on 2018-10-25 13:41
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Replaces OcrToken.upload_date with a new update_time field. This is done
+    # as RemoveField + AddField, not as a RenameField.
+
+    dependencies = [
+        ('segment', '0005_auto_20181025_1332'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='ocrtoken',
+            name='upload_date',
+        ),
+        migrations.AddField(
+            model_name='ocrtoken',
+            name='update_time',
+            field=models.DateTimeField(auto_now=True, verbose_name='更新日期'),
+        ),
+    ]

+ 27 - 0
segment/migrations/0007_sheetbigboxes.py

@@ -0,0 +1,27 @@
+# Generated by Django 2.1 on 2019-04-03 15:26
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Creates the SheetBigBoxes model.
+    # NOTE(review): segment/migrations/0007_sheetbigboxes_sheetboxes.py in this
+    # same commit also depends on 0006_auto_20181025_1341 and also creates
+    # SheetBigBoxes. The two files look like conflicting branches of the
+    # migration graph -- confirm which one is meant to be kept.
+
+    dependencies = [
+        ('segment', '0006_auto_20181025_1341'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='SheetBigBoxes',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('update_time', models.DateTimeField(auto_now=True, verbose_name='更新日期')),
+                ('series_number', models.CharField(max_length=100, null=True)),
+                ('raw_name', models.CharField(max_length=100, null=True)),
+                ('save_path', models.CharField(max_length=100, null=True)),
+                ('raw_big_box_path', models.CharField(max_length=100, null=True)),
+                ('small_box_path', models.CharField(max_length=100, null=True)),
+                ('subject_id', models.IntegerField(default=0)),
+                ('subject', models.CharField(default='unknown_subject', max_length=20)),
+            ],
+        ),
+    ]

+ 40 - 0
segment/migrations/0007_sheetbigboxes_sheetboxes.py

@@ -0,0 +1,40 @@
+# Generated by Django 2.1.2 on 2019-09-26 15:40
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Creates both the SheetBigBoxes and the SheetBoxes models in one step.
+    # NOTE(review): 0007_sheetbigboxes.py and 0008_sheetboxes.py in this same
+    # commit create the same two models starting from the same parent (0006).
+    # These look like duplicate branches of the migration graph -- confirm
+    # which set is meant to be kept.
+
+    dependencies = [
+        ('segment', '0006_auto_20181025_1341'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='SheetBigBoxes',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('update_time', models.DateTimeField(auto_now=True, verbose_name='更新日期')),
+                ('series_number', models.CharField(max_length=100, null=True)),
+                ('raw_name', models.CharField(max_length=100, null=True)),
+                ('small_box_path', models.CharField(max_length=100, null=True)),
+                ('subject_id', models.IntegerField(default=0)),
+                ('save_path', models.CharField(max_length=100, null=True)),
+                ('raw_big_box_path', models.CharField(max_length=100, null=True)),
+                ('subject', models.CharField(default='unknown_subject', max_length=20)),
+            ],
+        ),
+        migrations.CreateModel(
+            name='SheetBoxes',
+            fields=[
+                ('update_time', models.DateTimeField(auto_now=True, verbose_name='更新日期')),
+                # paper_id is the primary key, so no automatic id column is declared.
+                ('paper_id', models.CharField(max_length=100, primary_key=True, serialize=False)),
+                ('raw_name', models.CharField(max_length=100, null=True)),
+                ('subject_id', models.IntegerField(default=0)),
+                ('subject', models.CharField(default='unknown_subject', max_length=20)),
+                ('save_path', models.CharField(max_length=100, null=True)),
+                ('xml_box_path', models.CharField(max_length=100, null=True)),
+                ('download_path', models.CharField(default='', max_length=100)),
+            ],
+        ),
+    ]

+ 26 - 0
segment/migrations/0008_sheetboxes.py

@@ -0,0 +1,26 @@
+# Generated by Django 2.1 on 2019-06-25 15:40
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Creates the SheetBoxes model on top of 0007_sheetbigboxes.
+    # NOTE(review): 0007_sheetbigboxes_sheetboxes.py in this same commit also
+    # creates SheetBoxes -- confirm that only one of the two is applied.
+
+    dependencies = [
+        ('segment', '0007_sheetbigboxes'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='SheetBoxes',
+            fields=[
+                ('update_time', models.DateTimeField(auto_now=True, verbose_name='更新日期')),
+                # paper_id is the primary key, so no automatic id column is declared.
+                ('paper_id', models.CharField(max_length=100, primary_key=True, serialize=False)),
+                ('raw_name', models.CharField(max_length=100, null=True)),
+                ('subject_id', models.IntegerField(default=0)),
+                ('subject', models.CharField(default='unknown_subject', max_length=20)),
+                ('save_path', models.CharField(max_length=100, null=True)),
+                ('xml_box_path', models.CharField(max_length=100, null=True)),
+                ('download_path', models.CharField(default='', max_length=100)),
+            ],
+        ),
+    ]

+ 0 - 0
segment/migrations/__init__.py


+ 39 - 0
segment/models.py

@@ -0,0 +1,39 @@
+from django.db import models
+
+
+# Create your models here.
+class ExamImage(models.Model):
+    # Record of one saved exam image.
+    # raw_name: nullable name, up to 150 characters.
+    raw_name = models.CharField(max_length=150, null=True)
+    # subject_id has no default, so it must be supplied when a row is created.
+    subject_id = models.IntegerField()
+    subject = models.CharField(max_length=20, default='unknown_subject')
+    # Date only (no time of day); the verbose name is '保存日期'.
+    upload_date = models.DateField('保存日期')
+    # Required path string, up to 150 characters.
+    save_path = models.CharField(max_length=150, null=False)
+
+
+class OcrToken(models.Model):
+    # Stored OCR access token.
+    # auto_now=True refreshes update_time on every save of the row.
+    update_time = models.DateTimeField('更新日期', auto_now=True)
+    # Nullable token string, up to 150 characters.
+    access_token = models.CharField(max_length=150, null=True)
+
+
+class SheetBigBoxes(models.Model):
+    # Per-sheet record holding a series number, the raw name and several
+    # nullable path strings (all limited to 100 characters).
+    # auto_now=True refreshes update_time on every save of the row.
+    update_time = models.DateTimeField('更新日期', auto_now=True)
+    series_number = models.CharField(max_length=100, null=True)
+    raw_name = models.CharField(max_length=100, null=True)
+    small_box_path = models.CharField(max_length=100, null=True)
+
+    # subject_id defaults to 0 and subject to 'unknown_subject' when not given.
+    subject_id = models.IntegerField(default=0)
+    save_path = models.CharField(max_length=100, null=True)
+    raw_big_box_path = models.CharField(max_length=100, null=True)
+    subject = models.CharField(max_length=20, default='unknown_subject')
+
+
+class SheetBoxes(models.Model):
+    # Per-paper record keyed by paper_id, with nullable path strings.
+    # auto_now=True refreshes update_time on every save of the row.
+    update_time = models.DateTimeField('更新日期', auto_now=True)
+    # paper_id is the primary key, so this model has no automatic id column.
+    paper_id = models.CharField(max_length=100, primary_key=True)
+    raw_name = models.CharField(max_length=100, null=True)
+    subject_id = models.IntegerField(default=0)
+    subject = models.CharField(max_length=20, default='unknown_subject')
+
+    save_path = models.CharField(max_length=100, null=True)
+    xml_box_path = models.CharField(max_length=100, null=True)
+    # Unlike the other paths, download_path defaults to '' and is not nullable.
+    download_path = models.CharField(max_length=100, default='')

+ 43 - 0
segment/ocr/BD_OCR.py

@@ -0,0 +1,43 @@
+from selenium import webdriver
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+import time
+import traceback
+import re
+
+
+def bd_ocr_file(pictures):
+    """Run pictures through the Baidu OCR demo page and collect the text.
+
+    A Chrome browser is driven through the demo page: every picture path is
+    sent to the page's file input and the ``"words"`` values found in the
+    JSON panel are joined with spaces.
+
+    :param pictures: iterable of picture paths handed to the file input.
+    :return: list of recognised strings ('空白' when nothing was recognised),
+        one per picture processed before the first failure.
+    """
+    browser = webdriver.Chrome()
+    texts = []
+    words_pattern = re.compile(r'"words": "(.*)?"')
+    # try/finally guarantees the browser is closed exactly once, including
+    # when the page set-up below raises.
+    try:
+        browser.implicitly_wait(5)
+        browser.maximize_window()
+        browser.get('http://ai.baidu.com/tech/ocr/general')
+        browser.execute_script("window.scrollTo(0, 850)")
+        wait = WebDriverWait(browser, 5)
+        # Wait until the upload element has been loaded.
+        wait.until(EC.presence_of_element_located((By.ID, "demo-photo-upload")))
+
+        for picture in pictures:
+            time.sleep(2)
+            print("开始传文件")
+            try:
+                browser.find_element_by_css_selector('input[type="file"]').send_keys(picture)
+                time.sleep(3)
+                html = browser.find_element_by_id("demo-json").text
+                res = ' '.join(words_pattern.findall(html))
+                if not res:
+                    res = '空白'
+                texts.append(res)
+            except Exception as e:
+                print(e)
+                traceback.print_exc()
+                # Previously the browser was quit here while the loop carried
+                # on, so every remaining picture failed against a dead session
+                # after sleeping. Stop instead; the returned list is the same.
+                break
+    finally:
+        browser.quit()
+    return texts
+
+
+

+ 2 - 0
segment/ocr/__init__.py

@@ -0,0 +1,2 @@
+# @Author  : lightXu
+# @File    : __init__.py.py

+ 147 - 0
segment/ocr/group_pictures.py

@@ -0,0 +1,147 @@
+import re
+import shutil
+
+import glob
+from pprint import pprint
+
+import segment.ocr.luo_ocr.ocr as luo_ocr
+# from pypinyin import lazy_pinyin
+from segment.ocr.split_topic_en import topic_type_line
+
+# def to_pinyin_camel(s):
+#     '''文件123.txt'''
+#     py_ls = lazy_pinyin(s)
+#     py_camel = [py.capitalize() for py in py_ls]
+#     return "".join(py_camel)
+#
+#
+# def rename_filename(filename):
+#     "将文件名转变为拼音"
+#     filename_en = to_pinyin_camel(filename)
+#     try:
+#         shutil.copy(filename, filename_en)
+#     except shutil.SameFileError:
+#         pass
+#     return filename_en
+
+
+# def request_ocr(filename):
+#     '''中文无法上传需要修改成英文'''
+#     url = "http://117.50.17.141/ocr"
+#     data = {}
+#     filename = rename_filename(filename)
+#     files = {"mydata": open(filename, "rb")}
+#     r = requests.post(url, data, files=files)
+#     print(filename)
+#     print(r.json())
+#     return r.json()['text']
+
+
+# Raw strings: \s, \d and \. are regex escapes, not Python string escapes, and
+# a non-raw literal triggers invalid-escape-sequence warnings.
+topic_start = re.compile(r"^\s*(\d+)\s*[\.、::,,]")
+topic_start2 = re.compile(r"^\s*[(<〈《]?(\d+)\)\s*[\.、::,,]?")
+
+
+def is_topic_start(s, subject):
+    """Tell whether line ``s`` starts a new topic.
+
+    A line starts a topic when it begins with a number followed by one of the
+    separator characters; for ``'math'`` a number closed by ``)`` also counts.
+
+    :param s: one line of recognised text.
+    :param subject: one of 'math', 'english', 'chinese', 'physics',
+        'chemistry', 'biology'.
+    :return: ``True`` or ``False``.
+    :raises ValueError: for any other subject.
+    """
+    if subject in ['math', 'english', 'chinese', 'physics', 'chemistry', 'biology']:
+        if topic_start.match(s):
+            return True
+        elif subject == 'math':
+            if topic_start2.match(s):
+                return True
+        return False
+    else:
+        raise ValueError("subject={} is not supported!".format(subject))
+
+
+# -------------------------符合下列条件的则为结束-------------------------
+# Raw strings: \s and \. are regex escapes, not Python string escapes, and a
+# non-raw literal triggers invalid-escape-sequence warnings.
+topic_end = re.compile(r"D\s*[\.、::]")
+topic_end2 = re.compile(r"^\s*G\s*[\.、::]")
+
+
+def is_topic_end(s, subject):
+    """Tell whether line ``s`` ends the current topic.
+
+    A line ends a topic when it contains option ``D`` followed by a separator
+    anywhere; for ``'english'`` a line beginning with option ``G`` also counts.
+
+    :param s: one line of recognised text.
+    :param subject: one of 'math', 'english', 'chinese', 'physics',
+        'chemistry', 'biology'.
+    :return: ``True`` or ``False``.
+    :raises ValueError: for any other subject.
+    """
+    if subject in ['math', 'english', 'chinese', 'physics', 'chemistry', 'biology']:
+        if topic_end.search(s):
+            return True
+        elif subject == 'english':
+            if topic_end2.match(s):
+                return True
+        return False
+    else:
+        raise ValueError("subject={} is not supported!".format(subject))
+
+
+# -------------------------符合下列条件的则为跳过舍去-------------------------
+# Section headings numbered with Chinese numerals, e.g. "一、".
+# Raw strings: \s and \. are regex escapes, not Python string escapes.
+topic_filter = re.compile(r"^\s*[一二三四五六七八九十]+\s*[\.、::]")
+# A line containing any of these substrings is skipped.
+general_filter = ['选择题', '单选题', '多选题',
+                  '填空题', '单空题', '多空题',
+                  '解答题', '简答题', '证明题',
+                  '选做题', '实验题', '第II卷',
+                  '第I卷', '第二卷', ]
+
+# For English, a line is skipped when it contains every word of any one entry.
+english_filter = [["听", "材料"], ["听", "对话"], ["听", "独白"],
+                  ['第一节'], ['第二节'], ['语言知识运用'], ['第II卷'],
+                  ['第二部分'], ['第三部分'], ['第四部分']]
+
+# Parenthesised Chinese-numeral headings, e.g. "(一)".
+chinese_filter = re.compile(r"^\s*[((]\s*[一二三四五六七八九十]\s*[))]\s*[\.、::]?")
+
+
+def contains_all(s, words):
+    """Return True when ``s`` contains every item of at least one entry of ``words``."""
+    return any(all([token in s for token in group]) for group in words)
+
+
+def is_topic_skip(s, subject):
+    """Tell whether line ``s`` is a heading/instruction line that can be dropped.
+
+    English lines are checked against ``english_filter`` only. The other
+    supported subjects are checked against the numbered-heading pattern, the
+    parenthesised heading pattern (Chinese only) and the ``general_filter``
+    substrings.
+
+    :raises ValueError: when ``subject`` is not supported.
+    """
+    if subject == 'english':
+        return contains_all(s, english_filter)
+
+    if subject not in ['math', 'chinese', 'physics', 'chemistry', 'biology']:
+        raise ValueError("subject={} is not supported!".format(subject))
+
+    if topic_filter.match(s):
+        return True
+    if subject == 'chinese' and chinese_filter.match(s):
+        return True
+    return any(topic_type in s for topic_type in general_filter)
+
+
+# ----------------------action----------------------
+def group_pictures(pictures, subject=''):
+    """OCR row pictures and group the rows into topics.
+
+    Pictures are assumed to be row based. Each picture is recognised with
+    ``luo_ocr.ocr_py`` and line breaks are removed from the result.
+
+    :return: ``(texts, groups)`` where ``groups`` is a list of
+        ``[start, end]`` row-index pairs.
+    """
+    texts = []
+    for picture in pictures:
+        recognised = luo_ocr.ocr_py(picture)
+        texts.append(recognised.replace("\r", "").replace("\n", ""))
+
+    groups = []
+    start = 0
+    for row, line in enumerate(texts):
+        if is_topic_start(line, subject):
+            # A new topic begins on this row: close the previous span before it.
+            groups.append([start, row])
+            start = row
+        elif is_topic_end(line, subject):
+            # This row is the last one of the current topic.
+            groups.append([start, row + 1])
+            start = row + 1
+        elif is_topic_skip(line, subject):
+            # Drop this row; keep whatever was collected before it.
+            if row > start:
+                groups.append([start, row])
+            start = row + 1
+
+    total = len(texts)
+    if total > start:
+        groups.append([start, total])
+    return texts, groups

+ 246 - 0
segment/ocr/group_text.py

@@ -0,0 +1,246 @@
+# @Author  : lightXu
+# @File    : group_text.py
+import re
+
+subjects = ['unknown_subject', 'math', 'math_zxhx', 'english',
+            'chinese', 'physics', 'chemistry',
+            'biology', 'politics', 'history', 'geography',
+            'science_comprehensive', 'arts_comprehensive']
+
+
+# ------------------------- lines matching these patterns start a topic ---------------------------
+# Raw strings: \s, \d, \D and \. are regex escapes, not Python string escapes,
+# and a non-raw literal triggers invalid-escape-sequence warnings.
+# general_start = re.compile("^\s*\d+\s*[\.、::]?\D|^\s*\d+\s*[\.、::]?\d{4}]")
+general_start = re.compile(r"^\s*\d+\s*[\.、::]\D|^\s*\d+\s*[\.、::]\d{4}")
+math_start = re.compile(r"^\s*\(\d+\)\s*[\.、::]?")
+chinese_start = re.compile(r"^\s*[((]\s*[一二三四五六七八九十]\s*[))]\s*[\.、::]?")
+write_start = re.compile(r"^\s*\(\d+\)\s*[\.、::]?")
+
+
+def is_topic_start(s, subject):
+    """Tell whether line ``s`` starts a new topic.
+
+    Every subject accepts a leading number followed by a separator. Math also
+    accepts lines containing "本题"/"本小题" or starting with ``(n)``; Chinese
+    also accepts a parenthesised Chinese numeral.
+
+    :param s: one line of recognised text.
+    :param subject: one of the names in ``subjects``.
+    :return: ``True`` or ``False``.
+    :raises ValueError: when ``subject`` is not in ``subjects``.
+    """
+    if subject in subjects:
+        if general_start.match(s):
+            return True
+        if subject == 'math':
+            if "本题" in s or "本小题" in s:
+                return True
+            elif math_start.match(s):
+                return True
+        elif subject == 'chinese':
+            if chinese_start.match(s):
+                return True
+        # elif subject == 'english':
+        #     if '注意' in s or '内容包括' in s:
+        #         if write_start.search(s):
+        #             return False
+        return False
+    else:
+        raise ValueError("subject={} is not supported!".format(subject))
+
+
+# -------------------------符合下列条件的则为结束-------------------------
+# Raw strings: \s and \. are regex escapes, not Python string escapes, and a
+# non-raw literal triggers invalid-escape-sequence warnings.
+general_end = re.compile(r"D\s*[\.、::]")
+# english_end = re.compile("^\s*G\s*[\.、::]")
+english_end = re.compile(r"^\s*[EFG]\s*[\.、::]|^\s*[EFG]\s+")
+chinese_end = re.compile(r"^\s*[EFG]\s*[\.、::]")
+written_expression = re.compile(r'书面表达')
+written_expression1 = re.compile(r'短文改错|翻译句子')
+
+
+def is_topic_end(s, subject):
+    """Tell whether line ``s`` ends the current topic.
+
+    Only English and Chinese have end markers here.
+
+    :param s: one line of recognised text.
+    :param subject: one of the names in ``subjects``.
+    :return: ``True`` for an English line containing option D or a Chinese
+        line starting with option E/F/G; the string ``"G"`` for an English
+        line starting with option E/F/G; ``False`` otherwise.
+    :raises ValueError: when ``subject`` is not in ``subjects``.
+    """
+    if subject in subjects:
+        if subject == 'english':
+            if general_end.search(s):
+                return True
+            if english_end.search(s):
+                return "G"
+
+            # elif written_expression.search(s):
+            #     return '书面表达'
+
+        if subject == 'chinese':
+            if chinese_end.search(s):
+                return True
+        return False
+    else:
+        raise ValueError("subject={} is not supported!".format(subject))
+
+
+# -------------------------符合下列条件的则为跳过舍去-------------------------
+# Section headings numbered with Chinese numerals, e.g. "一、".
+# Raw strings: \s, \d and \. are regex escapes, not Python string escapes.
+general_filter1 = re.compile(r"^\s*[一二三四五六七八九十]+\s*[\.、::]")
+# A line containing any of these substrings is skipped.
+general_filter2 = ['选择题', '单选题', '多选题', '综合题', '答案无效', '题目要求',
+                   '填空题', '单空题', '多空题', '计算题', '演算步骤', '单元测试', '古代诗歌阅读',
+                   '解答题', '简答题', '证明题', '按要求填写下列空格', '单项选择题', '注意事项',
+                   '选做题', '实验题', '第II卷', '第Ⅱ卷', '一律得零分', '证明过程', '现代文阅读',
+                   '第二卷', '答题卡', '试卷满分', '选题人', '最佳选项', '填写结果', '选不全', '文言文阅读',
+                   '答题时间', '分值', '题目要求', '阅读下面文字', '阅读下面短文', '阅读下列短文',
+                   '甲必考题', '必考题', '读一遍', '题卡', '符合题目要求', '规定区域', '符合要求', '阅读下面']
+
+# Page-number footers such as "第3页" / "共8页".
+end_sign = re.compile(r"第\d+页|共\d+页|页\d+第|\d+第|第[((]\d+[))]页|共[((]\d+[))]页|共[((]\d+[))]页$")
+
+# For English, a line is skipped when it contains every word of any one entry.
+english_filter = [["听", "材料"], ["听", "对话"], ["听", "独白"],
+                  ['第二节'], ['语言知识运用'], ['第II卷'], ['录音'], ['作答时间'], ['选项'],
+                  ['第二部分'], ['第三部分'], ['第四部分'], ['第一部分'], ['第一节'], ['阅读下列短文'], ['短文'], ['阅读下面短文'], ['阅读']]
+
+
+def contains_all(s, words):
+    """Return True when ``s`` contains every item of at least one entry of ``words``.
+
+    Note that ``all()`` of an empty sequence is True, so an empty entry in
+    ``words`` always counts as a match.
+    """
+    return any(all([token in s for token in group]) for group in words)
+
+
+def is_topic_skip(s, subject):
+    """Tell whether line ``s`` is a heading/instruction line that can be dropped.
+
+    A line is dropped when it starts with a Chinese-numeral heading, contains a
+    page-number marker or contains any ``general_filter2`` substring. English
+    lines are additionally checked against ``english_filter``.
+
+    :raises ValueError: when ``subject`` is not in ``subjects``.
+    """
+    if subject not in subjects:
+        raise ValueError("subject={} is not supported!".format(subject))
+
+    if general_filter1.match(s) or end_sign.search(s):
+        return True
+    if any(topic_type in s for topic_type in general_filter2):
+        return True
+    if subject == 'english':
+        return contains_all(s, english_filter)
+    return False
+
+
+# -----------------------all_end---------------------
+
+end_words = [["参考", "答案"], ["试题", "答案"], ["试卷", "答案"],
+             ["省", "学年"], ["省", "学期"], ["市", "学年"], ["市", "学期"]]
+
+
+def all_end(s, subject):
+    """Tell whether line ``s`` marks the end of the whole paper.
+
+    Never true for 'chinese' or 'english'. For every other subject the line
+    must contain both words of at least one ``end_words`` pair.
+    """
+    if subject in ["chinese", 'english']:
+        return False
+    return any(all([w in s for w in pair]) for pair in end_words)
+
+
+# ----------------------action----------------------
+def group_pictures1(abcd_texts, subject=''):
+    """Split recognised lines into ``[start, end]`` index groups, one per topic.
+
+    :param abcd_texts: dict whose ``'text'`` entry is the list of lines.
+    :param subject: subject name passed on to the line classifiers.
+    :return: list of ``[start, end]`` pairs; empty spans are removed.
+    :raises ValueError: from the classifiers when ``subject`` is unsupported.
+    """
+    texts = abcd_texts['text']
+    groups = []
+    start = 0
+    text_end = 0
+    pattern = re.compile(r"[一]?[\.。、((::,,]?选择题[\.。、((::,,]|一[\.、((。::,,]?填空题[\.。、((::,,]|[一]?[\.。、((::,,]?单项选择题|[一]?[\.。、((::,,]?单项选择|[一]?[\.。、((::,,]?现代文阅读[\.。、((::,,]|[一]?[\.。、((::,,]?单选题[\.。、((::,,]")
+    pattern1 = re.compile(r'第I卷|第〡卷|第Ⅰ卷|第I卷阅读题|第一部分')
+
+    for i, t in enumerate(texts):
+        if pattern.match(t) or pattern1.match(t):
+            # First-section heading: everything collected so far is preamble.
+            groups.clear()
+            start = i + 1
+            continue
+        if is_topic_start(t, subject):
+            groups.append([start, i])
+            start = i
+            continue
+
+        end_kind = is_topic_end(t, subject)
+        if end_kind:
+            # "G" means the line belongs to the previous topic, so extend that
+            # group. The `groups` check avoids popping from an empty list
+            # right after a section heading has cleared it.
+            if end_kind == "G" and start != 0 and groups:
+                groups[-1] = [groups[-1][0], i + 1]
+            else:
+                groups.append([start, i + 1])
+            start = i + 1
+        elif is_topic_skip(t, subject):
+            if i > start:
+                groups.append([start, i])
+            start = i + 1
+        elif all_end(t, subject):
+            text_end = i
+
+    len_text = len(texts)
+    if len_text > start:
+        if text_end:
+            groups.append([start, text_end])
+        else:
+            groups.append([start, len_text])
+
+    # Build a new list instead of deleting while enumerating, which skipped
+    # the element after every deletion and left adjacent empty spans behind.
+    groups = [span for span in groups if span[0] != span[1]]
+
+    return groups
+
+
+def segment(texts):
+    """Split the lines at the first writing-section heading.
+
+    :return: ``(objective, writing)``. ``objective`` is a dict holding the
+        lines before the first heading; ``writing`` is a dict holding the
+        heading line and everything after it together with its start index,
+        or ``[]`` when no heading is present.
+    """
+    heading = re.compile(r'短文改错|翻译句子|书面表达')
+    hits = [idx for idx, line in enumerate(texts) if heading.search(line)]
+
+    if not hits:
+        return {'start_index': 0, 'text': texts}, []
+
+    cut = min(hits)
+    objective_part = {'start_index': 0, 'text': texts[:cut]}
+    writing_part = {'start_index': cut, 'text': texts[cut:]}
+    return objective_part, writing_part
+
+
+def match_writing_section(texts, subject='english'):
+    """Group the writing part of an English paper by its section headings.
+
+    :param texts: dict with ``'start_index'`` (offset of the first line) and
+        ``'text'`` (the lines).
+    :return: list of ``[start, end]`` pairs, each starting on the line after a
+        heading and ending at the next heading (or at the end of the lines);
+        ``[]`` for any subject other than 'english'.
+    """
+    if subject != 'english':
+        return []
+
+    offset = texts['start_index']
+    lines = texts['text']
+    heading = re.compile(r'短文改错|翻译句子|书面表达')
+
+    # Heading positions plus the end of the text, de-duplicated and ordered.
+    cut_points = {idx for idx, line in enumerate(lines) if heading.search(line)}
+    cut_points.add(len(lines))
+    ordered = sorted(cut_points)
+
+    return [[begin + offset + 1, end + offset]
+            for begin, end in zip(ordered, ordered[1:])]
+
+
+def group_text(all_texts, subject):
+    """Group the recognised lines of a paper into topics.
+
+    English papers are first split into the objective part and the writing
+    part, which are grouped separately and concatenated; every other subject
+    is grouped in one pass.
+    """
+    if subject != 'english':
+        return group_pictures1({'text': all_texts}, subject)
+
+    abcd_sec, writing_sec = segment(all_texts)
+    group_list = group_pictures1(abcd_sec, subject)
+    if len(writing_sec) > 0:
+        group_list = group_list + match_writing_section(writing_sec, subject)
+    return group_list
+
+
+if __name__ == '__main__':
+    # Manual check: group the lines of one recognised English paper.
+    subject = 'english'
+    txt_path = r'G:\write\112.txt'
+    # The with-block closes the file instead of leaking the handle.
+    with open(txt_path, 'r') as txt_file:
+        all_texts = txt_file.readlines()
+    group_list = group_text(all_texts, subject)
+    print(group_list)

+ 0 - 0
segment/ocr/luo_ocr/__init__.py


+ 64 - 0
segment/ocr/luo_ocr/ocr.py

@@ -0,0 +1,64 @@
+from . import preprocess
+from . import sheetocr
+import time
+import os
+import cv2
+
+# sheetpath = r'C:\Users\Administrator\Desktop\sheet'  # 预处理前的试卷目录
+# testpath = r'C:\Users\Administrator\Desktop\test'  # 预处理后的试卷目录
+# resultpath = r'C:\Users\Administrator\Desktop\result'  # 结果生成目录
+
+# Parameter Sets
+langs = {'ce': '-l chi_sim+eng', 'ec': '-l eng+chi_sim', 'c': '-l chi_sim', 'e': '-l eng', 'eq': '-l eng+equ'}  # language options
+psms = {'block': '--psm 6', 'default': '--psm 3'}  # , '_line': ' --psm 7'}  # Page segmentation modes
+# oems = {'legacy': '--oem 0', 'lstm': '--oem 1', 'lstm+legacy': '--oem 2'}  # OCR Engine modes
+langs_py = {'ce': 'chi_sim+eng', 'ec': 'eng+chi_sim', 'c': 'chi_sim', 'e': 'eng', 'eq': 'eng+equ'}  # language options
+psms_py = {'block': '--psm 6', 'default': '--psm 3'}  # , '_line': ' --psm 7'}  # Page segmentation modes
+# Candidate scale / dilate / blur values swept by test_parameters().
+scales = (0, 0.5, 2)
+dilates = (0, 1, 3, 5)
+blurs = (0, 1, 3, 5, 7)
+
+
+# Process a picture with the default best parameters and return the text.
+def ocr_py(picture, lang='ce', psm='block', scale=0, dilate=1, blur=5):
+    """Preprocess ``picture`` and return what ``sheetocr.sheetocr_py`` recognises.
+
+    :param lang: key into ``langs_py``.
+    :param psm: key into ``psms_py``.
+    :param scale, dilate, blur: forwarded to ``preprocess.preprocess``.
+    """
+    image = preprocess.preprocess(picture, scale=scale, dilate=dilate, blur=blur)
+    ocr_options = {'lang': langs_py[lang], 'psm': psms_py[psm]}
+    return sheetocr.sheetocr_py(image, **ocr_options)
+
+
+# Process a picture with the default best parameters and write a text file.
+def ocr(picture, output, lang='ce', psm='block', scale=0, dilate=1, blur=5):
+    """Preprocess ``picture`` and run ``sheetocr.sheetocr`` on the result.
+
+    The preprocessed image is written to a scratch file that is handed to
+    ``sheetocr.sheetocr`` together with ``output`` and removed afterwards.
+
+    :param output: forwarded unchanged to ``sheetocr.sheetocr``.
+    :param lang: key into ``langs``.
+    :param psm: key into ``psms``.
+    :param scale, dilate, blur: forwarded to ``preprocess.preprocess``.
+    :raises IOError: when the scratch image cannot be written.
+    """
+    import tempfile
+
+    image = preprocess.preprocess(picture, scale=scale, dilate=dilate, blur=blur)
+    # cv2.imwrite picks the encoder from the file extension, so the scratch
+    # file needs one; a unique name also keeps concurrent calls apart.
+    fd, tmp_pic = tempfile.mkstemp(suffix='.png')
+    os.close(fd)
+    try:
+        if not cv2.imwrite(tmp_pic, image):
+            raise IOError('could not write temporary image ' + tmp_pic)
+        sheetocr.sheetocr(tmp_pic, output, lang=langs[lang], psm=psms[psm])
+    finally:
+        # Clean up even when writing or OCR fails.
+        os.remove(tmp_pic)
+
+
+# Try every parameter combination to find the best one.
+def test_parameters(picture_path, output=0):
+    """Run OCR on every file under ``picture_path`` for every parameter combination.
+
+    :param picture_path: directory walked recursively for pictures.
+    :param output: ``0`` prints the results; any other value is used as the
+        directory in which one result file per picture and combination is
+        written.
+    """
+    from itertools import product
+
+    start = time.time()
+    for root, dirs, files in os.walk(picture_path):
+        for file in files:
+            picture = os.path.join(root, file)
+            for s, d, b in product(scales, dilates, blurs):
+                tag = 's' + str(s) + 'd' + str(d) + 'b' + str(b)
+                if output == 0:  # print to the screen
+                    print('Parameters:' + tag + '\n')
+                    words = ocr_py(picture, scale=s, dilate=d, blur=b)
+                    print(words)
+                else:  # write to a file under the directory `output`
+                    save = os.path.join(output, file + tag)
+                    words = ocr_py(picture, scale=s, dilate=d, blur=b)
+                    # The file must be opened for writing; mode 'r' failed on
+                    # the missing file and could not be written to anyway.
+                    with open(save, 'w', encoding='UTF-8') as f:
+                        f.write(words)
+    end = time.time()
+    print('running time:', end - start, 's')
+
+
+# test_parameters(sheetpath)
+# print('OCR done!\n')

+ 85 - 0
segment/ocr/luo_ocr/preprocess.py

@@ -0,0 +1,85 @@
+"""
+图像预处理,提高OCR识别率
+1.  图像光照均匀性分析,获取图像光照分布map;
+
+2.  图像边缘分析、得到图像的边缘分布模型,得到总体的边缘度量权值w1;
+
+3.  图像模糊度计算,得到图像模糊权值w2.
+
+4.  对图像进行局部分块处理,利用图像对应分块的map特征、w1、w2的权值,得到图像每个分块的局部二值结果。
+
+5.  对整幅图像局部二值化结果进行空白填充处理防止字符断裂,对分割的游离点进行分析剔除异常噪点
+
+6.  直线剔除。
+
+Scaling To The Right Size
+
+Ensure that the images are scaled to the right size which usually is of at least 300 DPI (Dots Per Inch). Keeping DPI
+lower than 200 will give unclear and incomprehensible results while keeping the DPI above 600 will unnecessarily
+increase the size of the output file without improving the quality of the file. Thus, a DPI of 300 works best for this
+purpose.
+
+Increase Contrast
+
+Low contrast can result in poor OCR. Increase the contrast and density before carrying out the OCR process. This can be
+done in the scanning software itself or in any other image processing software. Increasing the contrast between the
+text/image and its background brings out more clarity in the output.
+
+Binarize Image
+
+This step converts a multicolored image (RGB) to a black and white image. There are several algorithms to convert a
+color image to a monochrome image, ranging from simple thresholding to more sophisticated zonal analysis.
+
+Remove Noise and Scanning Artefacts
+
+Noise can drastically reduce the overall quality of the OCR process. It can be present in the background or foreground
+and can result from poor scanning or the poor original quality of the data.
+
+Deskew
+
+This may also be referred to as rotation. This means de-skewing the image to bring it in the right format and right
+shape. The text should appear horizontal and not tilted in any angle. If the image is skewed to any side, deskew it by
+rotating it in a clockwise or anticlockwise direction.
+
+Layout Analysis (or Zone Analysis)
+
+In order to detect words correctly, it is important to first recognize the zones or the layout (which are also the areas
+of interest). This step detects the paragraphs, tables, columns, captions of the images etc. If the software misses out
+on any zone or layout, words might be cut in half or not detected at all.
+"""
+
+import cv2
+import numpy as np
+from segment.image_operation import utils
+
+
+# Read a picture and produce the preprocessed image.
+def preprocess(picture, scale, dilate, blur, show=False):
+    """Load ``picture`` and return a binarized version of it for OCR.
+
+    :param picture: passed to ``utils.read_img`` to load the image.
+    :param scale: resize factor applied to both axes; ``0`` skips resizing.
+    :param dilate: side length of the square dilate/erode kernel; ``0`` skips
+        the step.
+    :param blur: Gaussian kernel size; ``0`` skips blurring.
+    :param show: when true, display the result in a window and wait for a key.
+    :return: the image after Otsu binary thresholding.
+    """
+    img = utils.read_img(picture)
+    # Rescale the image.
+    if scale != 0:
+        img = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
+
+    # Convert to gray. A single-channel input is used as it is, because
+    # cvtColor(BGR2GRAY) rejects images that have no colour channels.
+    if len(img.shape) >= 3:
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Apply dilation and erosion to remove some noise.
+    if dilate != 0:
+        kernel = np.ones((dilate, dilate), np.uint8)
+        img = cv2.dilate(img, kernel, iterations=1)
+        img = cv2.erode(img, kernel, iterations=1)
+
+    # Apply blur to smooth out the edges.
+    if blur != 0:
+        img = cv2.GaussianBlur(img, (blur, blur), 0)
+
+    # Apply threshold to get an image with only black and white (binarization).
+    img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
+
+    if show:
+        # cv2.namedWindow('image', cv2.WINDOW_NORMAL)
+        cv2.imshow('image', img)
+        cv2.waitKey(0)
+        cv2.destroyAllWindows()
+    return img

+ 67 - 0
segment/ocr/luo_ocr/sheetocr.py

@@ -0,0 +1,67 @@
+'''
+使用Tesseract 对试卷做OCR
+
+Tesseract Usage:
+  tesseract --help | --help-extra | --help-psm | --help-oem | --version
+  tesseract --list-langs [--tessdata-dir PATH]
+  tesseract --print-parameters [options...] [configfile...]
+  tesseract imagename|imagelist|stdin outputbase|stdout [options...] [configfile...]
+
+OCR options:
+  --tessdata-dir PATH   Specify the location of tessdata path.
+  --user-words PATH     Specify the location of user words file.
+  --user-patterns PATH  Specify the location of user patterns file.
+  -l LANG[+LANG]        Specify language(s) used for OCR.
+  -c VAR=VALUE          Set value for config variables.
+                        Multiple -c arguments are allowed.
+  --psm NUM             Specify page segmentation mode.
+  --oem NUM             Specify OCR Engine mode.
+NOTE: These options must occur before any configfile.
+
+Page segmentation modes:
+  0    Orientation and script detection (OSD) only.
+  1    Automatic page segmentation with OSD.
+  2    Automatic page segmentation, but no OSD, or OCR.
+  3    Fully automatic page segmentation, but no OSD. (Default)
+  4    Assume a single column of text of variable sizes.
+  5    Assume a single uniform block of vertically aligned text.
+  6    Assume a single uniform block of text.
+  7    Treat the image as a single text line.
+  8    Treat the image as a single word.
+  9    Treat the image as a single word in a circle.
+ 10    Treat the image as a single character.
+ 11    Sparse text. Find as much text as possible in no particular order.
+ 12    Sparse text with OSD.
+ 13    Raw line. Treat the image as a single text line,
+       bypassing hacks that are Tesseract-specific.
+
+OCR Engine modes:
+  0    Legacy engine only.
+  1    Neural nets LSTM engine only.
+  2    Legacy + LSTM engines.
+  3    Default, based on what is available.
+
+Single options:
+  -h, --help            Show minimal help message.
+  --help-extra          Show extra help for advanced users.
+  --help-psm            Show page segmentation modes.
+  --help-oem            Show OCR Engine modes.
+  -v, --version         Show version information.
+  --list-langs          List available languages for tesseract engine.
+  --print-parameters    Print tesseract parameters.
+
+'''
+import os
+import pytesseract
+
+
+# OCR an image file into a text file; parameters that work well: -l chi_sim+eng --psm 6
+def sheetocr(picture, output, lang, psm):
+    """Run the ``tesseract`` command-line tool on ``picture``.
+
+    The command is passed as an argument list instead of a concatenated shell
+    string, so paths containing spaces or shell metacharacters are handled
+    safely and cannot inject extra commands.
+
+    :param picture: path of the input image
+    :param output: output base name given to tesseract
+    :param lang: language option string, e.g. ``'-l chi_sim+eng'``
+    :param psm: page-segmentation option string, e.g. ``'--psm 6'``
+    :return: None. As before, a failing tesseract run does not raise.
+    """
+    import shlex
+    import subprocess
+    import sys
+
+    # lang / psm are option strings that may hold several tokens, so split them like a shell would.
+    args = ['tesseract', picture, output] + shlex.split(lang) + shlex.split(psm)
+    try:
+        subprocess.call(args)
+    except OSError as exc:
+        # os.system never raised when the binary was missing; keep that best-effort contract.
+        sys.stderr.write('tesseract could not be started: {}\n'.format(exc))
+
+
+# OCR an in-memory image and return the text; parameters that work well: 'chi_sim+eng', '--psm 6'
+def sheetocr_py(img, lang, psm):
+    """Return the text pytesseract recognises in ``img``.
+
+    :param img: image object accepted by ``pytesseract.image_to_string``
+    :param lang: tesseract language code(s)
+    :param psm: extra tesseract config string
+    """
+    return pytesseract.image_to_string(img, lang=lang, config=psm)

+ 144 - 0
segment/ocr/penguin_ocr.py

@@ -0,0 +1,144 @@
+# @Author  : lightXu
+# @File    : penguin_ocr.py
+# @Time    : 2019/6/11 0011 下午 17:52
+import base64
+import hashlib
+import random
+import string
+import time
+from urllib.parse import urlencode
+
+import cv2
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from segment.image_operation.utils import resize_by_percent, write_single_img
+
+# Credentials for the api.ai.qq.com OCR endpoint; APP_KEY is fed to _get_sign to sign each request.
+# NOTE(review): these secrets are hard-coded in source control - consider loading them from
+# settings or the environment instead.
+APP_KEY = 'R2iPkd5J2056YFRw'
+APP_ID = '2117302084'
+
+
+def opecv2base64(img):
+    """Encode an OpenCV image as JPEG and return the bytes as a base64 text string.
+
+    :param img: image array accepted by ``cv2.imencode``
+    :return: base64 ``str`` (no ``b'...'`` wrapper)
+    :raises ValueError: if OpenCV reports that JPEG encoding failed
+    """
+    ok, encoded = cv2.imencode('.jpg', img)
+    if not ok:
+        raise ValueError('failed to encode image as JPEG')
+    # Decode the bytes properly instead of slicing the repr of a bytes object.
+    return base64.b64encode(encoded).decode('ascii')
+
+
+def get_base64_size(base64_str):
+    """Estimate the decoded size in bytes of a base64 string.
+
+    The estimate is three quarters of the string length (padding is not
+    subtracted) and is returned as a float.
+    """
+    encoded_len = len(base64_str)
+    return float(encoded_len - encoded_len / 4)  # byte
+
+
+def opecv2base64_stand(raw_image, mem_size, default_size=1):  # keep the payload under 1M
+    """Return the image as base64 JPEG, shrinking it first when it is too large.
+
+    :param raw_image: image array (its ``shape[0]``/``shape[1]`` are read as height/width)
+    :param mem_size: size of the image in bytes, as measured by the caller
+    :param default_size: size limit in units of 1000 * 1000 bytes
+    :return: base64 string of the (possibly resized) image
+
+    When ``mem_size`` exceeds the limit, the image is scaled down by the larger of
+    the memory ratio and ``longest_side / 1200``.
+    """
+    limit_bytes = default_size * 1000 * 1000
+    m_ratio = mem_size / limit_bytes
+    if m_ratio > 1.0:
+        height, width = raw_image.shape[0], raw_image.shape[1]
+        s_ratio = max(height, width) / 1200
+        ratio = max(m_ratio, s_ratio)
+        image_resize = resize_by_percent(raw_image, 1 / ratio)
+        # The previous version also dumped the resized image to a hard-coded
+        # C:\Users\Administrator\Desktop path; that debugging leftover is removed.
+        return opecv2base64(image_resize)
+    return opecv2base64(raw_image)
+
+
+def _get_sign(params, app_key):
+    """Compute the request signature.
+
+    The parameters are sorted by key, ``app_key`` is appended as the last pair,
+    the pairs are URL-encoded, and the uppercase hex MD5 digest of that text is
+    returned.
+    """
+    pairs = sorted(params.items(), key=lambda pair: pair[0])
+    pairs.append(('app_key', app_key))
+    digest = hashlib.md5(urlencode(pairs).encode())
+    return digest.hexdigest().upper()
+
+
+def please_retry(response, url, data, headers):
+    """Re-send the request while the API answers HTTP 200 with a non-'ok' message.
+
+    The previous loop never advanced its counter and never inspected the new
+    response, so a single non-'ok' answer made it spin forever. The retry is
+    now bounded to three extra requests.
+
+    :param response: the response to the first attempt
+    :param url: endpoint to re-post to
+    :param data: form payload
+    :param headers: request headers
+    :return: the last response received (the original one when no retry was needed)
+    """
+    max_retries = 3
+    for _ in range(max_retries):
+        if response.status_code != 200:
+            # As before, only an HTTP 200 carrying a non-'ok' message triggers a retry.
+            break
+        resp = response.json()
+        if 'ok' == resp.get('msg'):
+            break
+        print(resp)
+        response = requests.post(url, data=data, headers=headers, timeout=15)
+    return response
+
+
+def get_ocr_english_text_raw_format(img, size):
+    """Send an image to the api.ai.qq.com general OCR endpoint and return the parsed JSON.
+
+    :param img: OpenCV image
+    :param size: size of the image in bytes (used to decide whether to shrink it)
+    :return: the decoded JSON response whose ``msg`` is ``'ok'``
+    :raises ValueError: when no 'ok' answer was obtained after three attempts; the
+        message carries the reason for the last failure
+
+    Compared with the previous version: the session is closed when done, every
+    request that is sent is also checked (the old loop fired a fourth request
+    whose result was ignored), and the error reports what actually went wrong
+    instead of always claiming there was "no response".
+    """
+    url = 'https://api.ai.qq.com/fcgi-bin/ocr/ocr_generalocr'
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+    image_base64 = opecv2base64_stand(img, size)  # base64-encoded image data
+    nonce_str = ''.join(random.sample(string.ascii_letters + string.digits, 10))
+    data = {
+        'app_id': APP_ID,
+        'image': image_base64,
+        'time_stamp': str(int(time.time())),
+        'nonce_str': nonce_str,
+    }
+    data['sign'] = _get_sign(data, APP_KEY)
+
+    # Transport-level retries for 5xx answers; the loop below retries API-level failures.
+    retries = Retry(total=3,
+                    backoff_factor=0.1,
+                    status_forcelist=[500, 502, 503, 504])
+
+    failure = 'no attempt made'
+    with requests.Session() as s:
+        s.mount('https://', HTTPAdapter(max_retries=retries))
+        for _ in range(0, 3):
+            response = s.post(url, data=data, headers=headers, timeout=15)
+            if response.status_code != 200:
+                failure = 'response[{}]'.format(response.status_code)
+                continue
+            try:
+                resp = response.json()
+            except ValueError:
+                failure = 'response body is not valid JSON'
+                continue
+            if 'ok' == resp.get('msg'):
+                return resp
+            failure = 'ocr error {}: {}'.format(resp.get('ret'), resp.get('msg'))
+
+    raise ValueError('ocr failed, retried three times: {}'.format(failure))
+
+
+def ocr_format(resp):
+    """Flatten an OCR response into one string per recognised item.
+
+    For every entry of ``resp['data']['item_list']`` the ``character`` values of
+    its ``words`` are concatenated, with empty characters replaced by a space,
+    and leading whitespace is stripped from the result.
+    """
+    return [
+        ''.join(
+            ' ' if cell['character'] == '' else cell['character']
+            for cell in item['words']
+        ).lstrip()
+        for item in resp['data']['item_list']
+    ]
+
+
+def get_ocr_english_text(image, size):
+    """OCR ``image`` and return the recognised text as a list of strings, one per item."""
+    return ocr_format(get_ocr_english_text_raw_format(image, size))
+

+ 36 - 0
segment/ocr/split_topic_en.py

@@ -0,0 +1,36 @@
+import os
+
+# Maps a tuple of keywords to a topic type name, loaded from type_config.txt.
+# Each config line has the form "<type>:<keyword>[,<keyword>...]"; lines starting with "#" are ignored.
+inf_words_dict = dict()
+# Resolve the config next to this module so the import does not depend on the working directory.
+_TYPE_CONFIG_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "type_config.txt")
+with open(_TYPE_CONFIG_PATH, "r", encoding="utf-8") as f:
+    for line in f:
+        line = line.strip()
+        if not line or line.startswith("#"):
+            continue
+        # Accept full-width punctuation (U+FF1A colon, U+FF0C comma) as separators too;
+        # the previous replace() calls mapped ":" to ":" and "," to ",", i.e. did nothing.
+        line = line.replace("\uff1a", ":").replace("\uff0c", ",")
+        key, sep, val = line.partition(":")
+        if not sep:
+            continue  # malformed line without a separator
+        key = key.strip()
+        # Drop empty keywords: an empty string is contained in every line and would match everything.
+        val = tuple(v.strip() for v in val.split(",") if v.strip())
+        if not key or not val:
+            continue
+        inf_words_dict[val] = key
+
+# 答案冒号 = "答案:"
+# 解析冒号 = "解析:"
+
+
+def could_skip_line(line):
+    '''Answer lines and explanation lines are excluded from type inference.'''
+    skip_prefixes = ("答案:", "解析:")
+    return line.startswith(skip_prefixes)
+
+
+def contains_all(s, words):
+    """Return True when every entry of ``words`` occurs in ``s`` (True for an empty ``words``)."""
+    missing = [w for w in words if w not in s]
+    return not missing
+
+
+def topic_type_line(line):
+    """Tell whether ``line`` looks like a topic-type heading.
+
+    Answer/explanation lines never qualify; otherwise the line qualifies when it
+    contains every keyword of at least one entry in ``inf_words_dict``.
+    """
+    if could_skip_line(line):
+        return False
+    return any(contains_all(line, keywords) for keywords in inf_words_dict)
+
+
+
+

+ 165 - 0
segment/ocr/type_config.txt

@@ -0,0 +1,165 @@
+考试听力:听力
+考试听力:听,录音
+考试听力:听,材料
+考试听力:听,对话
+考试听力:听,短文
+考试听力: 听,独白
+考试听力:听力理解
+#考试听力:listening
+考试听力:hear, recording
+考试听力:hear, recordings
+考试听力:hear, material
+考试听力:hear, materials
+考试听力:hear, conversation
+考试听力:hear, conversations
+考试听力:hear, passage
+考试听力:hear, passages
+考试听力:hear, monologue
+考试听力:hear, monologues
+考试听力:listening, comprehension
+单项填空: 单项填空
+单项填空: 单项选择
+单项填空: 单选
+完形填空: 完形填空
+完形填空: 完型填空
+#完形填空: cloze
+完形填空: cloze test
+阅读理解: 阅读理解
+阅读理解: reading comprehension
+七选五: 七选五
+七选五: 七个选项
+七选五: 两项,多余
+七选五: 两项,多余选项
+七选五: 两项,多于选项
+七选五: 两个,多余
+七选五: 两个,多余选项
+七选五: 两个,多于选项
+七选五: seven options
+七选五: two items, surplus
+语法填空: 语法填空
+语法填空: 短文, 适当形式
+语法填空:短文, 正确形式
+语法填空:材料, 适当形式
+语法填空:材料, 正确形式
+语法填空:passage, proper form
+语法填空:passage, correct form
+语法填空:material, proper form
+语法填空:material, correct form
+语法填空:grammar and vocabulary
+语法填空:vocabulary and grammar
+选词填空: 选词填空
+选词填空: proper word, box
+任务型阅读: 任务型阅读
+任务型阅读: 任务型读写
+任务型阅读: task-based reading
+任务型阅读: task-based writing
+阅读表达: 阅读表达
+阅读表达: 阅读, 表达
+阅读表达: reading expression
+短文改错: 短文改错
+短文改错:单句改错
+#短文改错: 改,短文,错误
+#短文改错: 改,句子,错误
+#短文改错: 多余的词, 划掉
+#短文改错:错误, 修改
+#短文改错:改正, 错误
+#短文改错: text, correction
+#短文改错: essay, correction
+#短文改错:sentence, correction
+#短文改错:sentences, correction
+#短文改错: change, error
+#短文改错: change, errors
+#短文改错: change, mistake
+#短文改错: change, mistakes
+#短文改错:correct, mistake
+#短文改错:correct, mistakes
+#短文改错:correct, error
+#短文改错:correct, errors
+短文改错:modify, mistake
+短文改错:modify, mistakes
+短文改错:modify, errors
+短文改错:modify, error
+单词拼写: 单词拼写
+#单词拼写: 首字母
+单词拼写: 首字母, 汉语
+单词拼写: 首字母, 内容
+#单词拼写: 单词, 中文
+单词拼写: word spelling
+单词拼写: initial letter
+单词拼写: initial letter, chinese
+单词拼写: initial letter, english
+单词拼写: initial letter, content
+课文填空: 课文填空
+课文填空: 课文, 填空
+课文填空: 课文, 填, 内容
+课文填空: 课文, 内容, 完成, 句子
+#课文填空: recitation
+课文填空: complete, sentence
+课文填空: complete, sentences
+#句子翻译:翻译
+句子翻译: 句子,翻译
+句子翻译: 短文,翻译
+句子翻译:汉, 译, 英
+句子翻译:英, 译, 汉
+#句子翻译:translation
+句子翻译:sentence translation
+句子翻译:sentences translation
+句子翻译:English, Chinese, translation
+句子翻译:Chinese, English, translation
+句子翻译:English to Chinese
+句子翻译:Chinese to English
+句子翻译:Chinese, translate, English
+句子翻译:English, translate, Chinese
+书面表达: 书面表达
+书面表达: 应用文
+书面表达: 情景作文
+#书面表达: 写一篇短文
+#书面表达:写一封信
+#书面表达:回信
+#书面表达:申请信
+书面表达: 写作
+书面表达: 作文
+#书面表达: 开头, 总词数
+#书面表达: 100, 字
+#书面表达:120, 字
+#书面表达:150, 字
+#书面表达:180, 字
+#书面表达:200, 字
+#书面表达: 100, 词
+#书面表达:120, 词
+#书面表达:150, 词
+#书面表达:180, 词
+#书面表达:200, 词
+#书面表达: 开头, 写好
+#书面表达: 开头, 写出
+#书面表达:词, 左右
+#书面表达:文章, 通顺
+#书面表达:文章, 连贯
+书面表达:writing
+书面表达:guided writing
+#书面表达:summary writing
+#书面表达:practical writing
+#书面表达:composition
+#书面表达:situational composition
+#书面表达:write, essay
+#书面表达:write, letter
+#书面表达:application letter
+#书面表达:100 words
+#书面表达:120 words
+#书面表达:150 words
+#书面表达:180 words
+#书面表达:200 words
+完成句子: 完成句子
+完成句子: 完成, 句子
+完成句子: 完整句子
+完成句子: 完成, 各句
+完成句子: finish, sentence
+完成句子: finish, sentences
+完成句子: complete, sentence
+完成句子: complete, sentences
+信息匹配:信息匹配
+句型转换:句型转换
+句型转换:转换句型
+单词辨音: 单词辨音
+单词辨音:单词,音标
+单词辨音:单词,读音

+ 799 - 0
segment/server.py

@@ -0,0 +1,799 @@
+import base64
+import glob
+import os
+import time
+import uuid
+import shutil
+import xml.etree.cElementTree as ET
+from urllib import parse, request
+
+import cv2
+import numpy as np
+import pypinyin
+import requests
+from PIL import Image
+from django.conf import settings
+from pdf2image import convert_from_path
+
+import segment.logging_config as logging
+from segment.image_operation.exam_segment import get_page_text
+from segment.image_operation.pre_segment import segment2parts
+from segment.image_operation.segment import joint_image
+from segment.image_operation.split_lines import line_split
+from segment.image_operation.utils import create_xml, resize_by_percent
+from segment.image_operation.utils import write_single_img
+from segment.models import OcrToken
+from segment.ocr.group_pictures import group_pictures
+from segment.ocr.group_text import group_text
+from segment.ocr.penguin_ocr import get_ocr_english_text
+
+# Module-level logger. `logging` here is the project's segment.logging_config module (see the
+# import above), and the logger name is taken from settings.LOGGING_TYPE.
+logger = logging.getLogger(settings.LOGGING_TYPE)
+
+
+def convert_pil_to_jpeg(raw_img):
+    """Convert a PIL image into an RGB image that can be written as JPEG.
+
+    :param raw_img: PIL image in any mode
+    :return: ``(img, open_cv_image)`` - the RGB PIL image and ``np.array(img)``.
+        No channel reordering is done here.
+
+    RGBA images are flattened onto a white background using their alpha channel.
+    Every other non-RGB mode is converted with ``convert('RGB')``; for 'L' this
+    matches the previous channel-merge. Modes such as 'P', 'LA' or '1' used to be
+    passed through untouched, and several of them cannot be written as JPEG.
+    """
+    if raw_img.mode == 'RGB':
+        img = raw_img
+    elif raw_img.mode == 'RGBA':
+        img = Image.new("RGB", raw_img.size, (255, 255, 255))
+        img.paste(raw_img, mask=raw_img.split()[3])  # 3 is the alpha channel
+    else:
+        img = raw_img.convert('RGB')
+    open_cv_image = np.array(img)
+    return img, open_cv_image
+
+
+def opencv2base64(img):
+    """Encode an OpenCV image as JPEG and return the bytes as a base64 text string.
+
+    :param img: image array accepted by ``cv2.imencode``
+    :return: base64 ``str`` (no ``b'...'`` wrapper)
+    :raises ValueError: if OpenCV reports that JPEG encoding failed
+    """
+    ok, encoded = cv2.imencode('.jpg', img)
+    if not ok:
+        raise ValueError('failed to encode image as JPEG')
+    # Decode the bytes properly instead of slicing the repr of a bytes object.
+    return base64.b64encode(encoded).decode('ascii')
+
+
+def get_dir_next_index_name(path, file_type):
+    """Return the next free numeric file index in a directory.
+
+    Files in ``path`` whose names end with ``file_type`` and whose remaining stem
+    is an integer are considered; the result is the largest such integer plus
+    one, or 1 when there is none.
+
+    The indices are compared as numbers. The previous version took ``max()`` over
+    the name strings, so '9' beat '10' and indices repeated after the ninth file.
+    Names with a non-numeric stem are ignored instead of raising.
+    """
+    indices = []
+    for name in os.listdir(path):
+        if not name.endswith(file_type):
+            continue
+        stem = name[:len(name) - len(file_type)]
+        try:
+            indices.append(int(stem))
+        except ValueError:
+            continue
+    if not indices:
+        return 1
+    return max(indices) + 1
+
+
+def save_raw_image(subject, datetime, img_file, analysis_type):
+    """Store an uploaded image as JPEG and split it into parts.
+
+    :param subject: subject name, used as a directory level
+    :param datetime: date string, used as a directory level
+    :param img_file: uploaded file object exposing ``.name`` and readable by ``Image.open``
+    :param analysis_type: top directory level under MEDIA_ROOT / MEDIA_URL
+    :return: ``(save_path, parts_list, url_path)`` where ``parts_list`` is the
+        result of ``segment2parts``
+    """
+    # New file name: original stem + short random suffix, always with a .jpg extension.
+    raw_name = img_file.name.rpartition('.')[0]
+    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
+
+    raw_img = Image.open(img_file)  # read the uploaded image
+    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
+    # exist_ok avoids the exists()/makedirs() race between concurrent uploads.
+    os.makedirs(save_dir, exist_ok=True)
+    save_path = os.path.join(save_dir, file_name)
+
+    channels = raw_img.split()
+    if len(channels) >= 3:
+        img = Image.merge("RGB", (channels[0], channels[1], channels[2]))
+        # Reverse the channel axis before handing the array to the segmentation code.
+        cv_input = np.array(img)[:, :, ::-1].copy()
+    else:
+        img = raw_img
+        cv_input = np.array(img)
+    parts_list = segment2parts(cv_input, save_path)
+
+    # Errors from save() propagate to the caller unchanged.
+    img.save(save_path)
+
+    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
+    return save_path, parts_list, url_path
+
+
+def save_raw_image_without_segment(subject, datetime, img_file, analysis_type):
+    """Store an uploaded image as JPEG, plus a ``_small.jpg`` copy, without segmenting it.
+
+    :param subject: subject name, used as a directory level
+    :param datetime: date string, used as a directory level
+    :param img_file: uploaded file object exposing ``.name`` and readable by ``Image.open``
+    :param analysis_type: top directory level under MEDIA_ROOT / MEDIA_URL
+    :return: ``(save_path, open_cv_image, url_path)``
+    """
+    # New file name: original stem + short random suffix, always with a .jpg extension.
+    raw_name = img_file.name.rpartition('.')[0]
+    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
+
+    raw_img = Image.open(img_file)  # read the uploaded image
+    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
+    # exist_ok avoids the exists()/makedirs() race between concurrent uploads.
+    os.makedirs(save_dir, exist_ok=True)
+    save_path = os.path.join(save_dir, file_name)
+
+    pil_img, open_cv_image = convert_pil_to_jpeg(raw_img)
+    # Errors from save()/copy() propagate to the caller unchanged.
+    pil_img.save(save_path)
+    # The "small" variant is currently a plain copy of the saved file.
+    shutil.copy(save_path, save_path.replace('.jpg', '_small.jpg'))
+
+    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
+    return save_path, open_cv_image, url_path
+
+
+def save_raw_image_without_segment_formula(subject, datetime, img_file, analysis_type):
+    """Store an uploaded image unchanged, keeping its original extension.
+
+    :param subject: subject name, used as a directory level
+    :param datetime: date string, used as a directory level
+    :param img_file: uploaded file object exposing ``.name`` and readable by ``Image.open``
+    :param analysis_type: top directory level under MEDIA_ROOT / MEDIA_URL
+    :return: ``(save_path, url_path, raw_img)`` - note the order differs from the
+        other ``save_raw_image*`` helpers
+    """
+    # New file name: original stem + short random suffix + original extension.
+    raw_name, _, ext = img_file.name.rpartition('.')
+    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], ext)
+
+    raw_img = Image.open(img_file)  # read the uploaded image
+    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
+    # exist_ok avoids the exists()/makedirs() race between concurrent uploads.
+    os.makedirs(save_dir, exist_ok=True)
+    save_path = os.path.join(save_dir, file_name)
+
+    # No channel conversion or resizing is done here (an earlier experiment that
+    # merged channels and halved the image for transparent PNGs was disabled).
+    # Errors from save() propagate to the caller unchanged.
+    raw_img.save(save_path)
+
+    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
+    return save_path, url_path, raw_img
+
+
+def save_raw_image_in_jpeg(subject, datetime, img_file, analysis_type):
+    """Store an uploaded image as an RGB JPEG.
+
+    :param subject: subject name, used as a directory level
+    :param datetime: date string, used as a directory level
+    :param img_file: uploaded file object exposing ``.name`` and readable by ``Image.open``
+    :param analysis_type: top directory level under MEDIA_ROOT / MEDIA_URL
+    :return: ``(save_path, url_path, open_cv_image)``
+    """
+    # New file name: original stem + short random suffix, always with a .jpg extension.
+    raw_name = img_file.name.rpartition('.')[0]
+    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
+
+    raw_img = Image.open(img_file)  # read the uploaded image
+    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
+    # exist_ok avoids the exists()/makedirs() race between concurrent uploads.
+    os.makedirs(save_dir, exist_ok=True)
+    save_path = os.path.join(save_dir, file_name)
+
+    # The mode handling used to be duplicated inline here; use the shared helper.
+    img, open_cv_image = convert_pil_to_jpeg(raw_img)
+
+    # Errors from save() propagate to the caller unchanged.
+    img.save(save_path)
+
+    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
+    return save_path, url_path, open_cv_image
+
+
+def ocr_login():
+    """Return an OCR access token, reusing the stored one while it is still fresh.
+
+    The newest ``OcrToken`` row is reused when it is younger than
+    ``settings.OCR_TOKEN_UPDATE_DATE`` days. Otherwise, or when no token has been
+    stored yet, a new token is requested from the OAuth endpoint, saved, and returned.
+
+    :return: access token string
+    """
+    import json
+
+    def login():
+        """Request a fresh token, persist it, and return it."""
+        query = parse.urlencode({
+            'grant_type': 'client_credentials',
+            'client_id': settings.OCR_CLIENT_ID,
+            'client_secret': settings.OCR_CLIENT_SECRET,
+        })
+        header_dict = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}
+        url = 'https://aip.baidubce.com/oauth/2.0/token'
+        req = request.Request(url='{}{}{}'.format(url, '?', query), headers=header_dict)
+        with request.urlopen(req) as res:
+            payload = res.read()
+        # Parse the body as JSON; never eval() data that came over the network.
+        token = json.loads(payload.decode(encoding='utf-8'))['access_token']
+        OcrToken(access_token=token).save()
+        return token
+
+    try:
+        latest = OcrToken.objects.latest('update_time')
+    except OcrToken.DoesNotExist:
+        # First run: nothing stored yet.
+        return login()
+
+    stamp = latest.update_time
+    if getattr(stamp, 'tzinfo', None) is not None:
+        # Aware datetime: mktime() would read its fields as local time.
+        issued_at = stamp.timestamp()
+    else:
+        issued_at = time.mktime(stamp.timetuple())
+    max_age_seconds = settings.OCR_TOKEN_UPDATE_DATE * 24 * 60 * 60
+
+    if (issued_at + max_age_seconds) > time.time():
+        return latest.access_token
+    return login()
+
+
+def get_exam_bbox_by_tesseract(img_raw_name, img_path, subject):
+    """Split a page image into lines, group the lines into questions, and return their boxes.
+
+    :param img_raw_name: name reported back in the result
+    :param img_path: path of the page image; line images go to a sibling ``*_lines`` directory
+    :param subject: subject passed to ``group_pictures``
+    :return: ``(status, info)`` where ``status`` is 1 on success and 0 if any stage
+        failed, and ``info`` holds ``img_name``, ``coordinate`` and, on failure, ``error``
+
+    The early exit after a failed line split used to return the bare ``info``
+    dict while every other path returned ``(status, info)``. It now returns the
+    same tuple shape; the dict keeps its ``is_success`` key.
+    """
+    error_info = ''
+    status = 1
+    text = []
+
+    lines_save_dir = img_path.replace('.jpg', '_lines')
+
+    img_path = os.path.abspath(img_path)
+    lines_save_dir = os.path.abspath(lines_save_dir)
+    os.makedirs(lines_save_dir, exist_ok=True)
+    start_time = time.time()
+    try:
+        bbox, lines_abs_path_list = line_split(img_path, lines_save_dir, settings.TOLERANCE_PIX_NUMBER)  # split lines
+    except Exception as e:
+        logger.error('line_split failed: {}'.format(e), exc_info=True)
+        status = 0
+        error_info = str(e)
+
+        info = {'is_success': status, 'img_name': img_raw_name, 'coordinate': text, 'error': error_info}
+        return status, info
+
+    time1 = time.time()
+    logger.info('lines_segment, cost: {}'.format(time1 - start_time))
+    exam_group = []
+
+    try:
+        _, exam_group = group_pictures(lines_abs_path_list, subject)
+        logger.info('exam_group info : {}'.format(exam_group))
+    except (SystemExit, KeyboardInterrupt):
+        raise
+    except Exception as e:
+        logger.error('ocr failed: {}'.format(e), exc_info=True)
+        status = 0
+        error_info = error_info + str(e)
+
+    time2 = time.time()
+    logger.info('exam_grouped, cost: {}'.format(time2 - time1))
+
+    # Even if grouping failed, still try to build coordinates from the (empty) grouping.
+    try:
+        text = joint_image(img_path, bbox, exam_group)
+    except (SystemExit, KeyboardInterrupt):
+        raise
+    except Exception as e:
+        logger.error('generate coordinate info failed: {}'.format(e), exc_info=True)
+        status = 0
+        error_info = error_info + str(e)
+
+    info = {'img_name': img_raw_name, 'coordinate': text}
+    if error_info:
+        info['error'] = error_info
+
+    logger.info('{} done'.format(img_raw_name))
+    return status, info
+
+
+def get_ocr_text(access_token, img, subject=None):
+    """Run basic OCR on an image and return the recognised text lines.
+
+    :param access_token: OCR API access token
+    :param img: image payload sent as the ``image`` form field (sent as-is; no
+        encoding is applied here)
+    :param subject: accepted for compatibility; it does not affect the request
+    :return: list with the ``words`` value of every result (empty when the
+        response has no ``words_result``)
+    :raises Exception: when the API reports an error. An 'internal error' is
+        retried once against the 'general' endpoint first.
+    """
+    query = parse.urlencode({'access_token': access_token})
+
+    url = '{}{}{}{}{}'.format(settings.OCR_URL, settings.OCR_ACCURACY, '_basic', '?', query)
+    url_general = '{}{}{}{}{}'.format(settings.OCR_URL, 'general', '_basic', '?', query)
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    data = {
+        'image': img,
+        'detect_direction': 'true',
+        'language_type': 'CHN_ENG',
+    }
+
+    # The old `if subject == 'english'` branches sent exactly the same request, so they are merged.
+    resp = requests.post(url, data=data, headers=headers).json()
+    error_msg = resp.get('error_msg')
+    if error_msg and 'internal error' in error_msg:
+        resp = requests.post(url_general, data=data, headers=headers).json()
+        error_msg = resp.get('error_msg')
+    if error_msg:
+        raise Exception("ocr {}!".format(error_msg))
+
+    # A response without words_result yields an empty list instead of a TypeError.
+    words_result = resp.get('words_result') or []
+    return [word.get('words') for word in words_result]
+
+
+def get_ocr_text_and_coordinate_in_raw_format(access_token, img):
+    """Run OCR with per-character ('small') granularity and return the raw JSON response.
+
+    :param access_token: OCR API access token
+    :param img: image payload sent as the ``image`` form field
+    :return: the decoded JSON response
+    :raises Exception: when the API reports an error. An 'internal error' is
+        retried once against the 'general' endpoint first.
+    """
+    query = parse.urlencode({'access_token': access_token})
+    url = '{}{}{}{}'.format(settings.OCR_BOX_URL, settings.OCR_ACCURACY, '?', query)
+    url_general = '{}{}{}{}'.format(settings.OCR_BOX_URL, 'general', '?', query)
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    data = {
+        'image_type': 'base64',
+        'group_id': 'group001',
+        'user_id': 'usr001',
+        'image': img,
+        'detect_direction': 'true',
+        'recognize_granularity': 'small',
+    }
+
+    resp = requests.post(url, data=data, headers=headers).json()
+    error_msg = resp.get('error_msg')
+    if error_msg and 'internal error' in error_msg:
+        # Fall back to the general endpoint once.
+        resp = requests.post(url_general, data=data, headers=headers).json()
+        error_msg = resp.get('error_msg')
+    if error_msg:
+        raise Exception("ocr {}!".format(error_msg))
+
+    return resp
+
+
+def get_ocr_text_and_coordinate(access_token, img):
+    """Run OCR and return the text lines together with their bounding-box corners.
+
+    :param access_token: OCR API access token
+    :param img: image payload sent as the ``image`` form field
+    :return: ``(text_list, matrix_lt, matrix_rb)`` - the recognised lines and the
+        left-top / right-bottom corner matrices produced by ``resolve_json``
+    :raises Exception: when the API reports an error. An 'internal error' is
+        retried once against the 'general' endpoint first.
+    """
+    query = parse.urlencode({'access_token': access_token})
+    url = '{}{}{}{}'.format(settings.OCR_BOX_URL, settings.OCR_ACCURACY, '?', query)
+    url_general = '{}{}{}{}'.format(settings.OCR_BOX_URL, 'general', '?', query)
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    data = {
+        'image_type': 'base64',
+        'group_id': 'group001',
+        'user_id': 'usr001',
+        'image': img,
+        'detect_direction': 'true',
+    }
+
+    resp = requests.post(url, data=data, headers=headers).json()
+    error_msg = resp.get('error_msg')
+    if error_msg and 'internal error' in error_msg:
+        # Fall back to the general endpoint once.
+        resp = requests.post(url_general, data=data, headers=headers).json()
+        error_msg = resp.get('error_msg')
+    if error_msg:
+        raise Exception("ocr {}!".format(error_msg))
+
+    words_result = resp.get('words_result')
+    text_list = [entry.get('words') for entry in words_result]
+    matrix_lt, matrix_rb = resolve_json(words_result)
+    return text_list, matrix_lt, matrix_rb
+
+
+def get_ocr_text_and_coordinate_formula(img, access_token, base64=False):
+    """Run per-character OCR (Chinese + English) and return the ``words_result`` list.
+
+    :param img: OpenCV image, or an already base64-encoded payload when ``base64`` is true
+    :param access_token: OCR API access token
+    :param base64: set to True when ``img`` is already base64-encoded
+    :return: the ``words_result`` value of the response
+    :raises Exception: when the API reports an error. An 'internal error' is
+        retried once against the 'general' endpoint first.
+    """
+    query = parse.urlencode({'access_token': access_token})
+    url = '{}{}{}{}'.format(settings.OCR_BOX_URL, settings.OCR_ACCURACY, '?', query)
+    url_general = '{}{}{}{}'.format(settings.OCR_BOX_URL, 'general', '?', query)
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    image = img if base64 else opencv2base64(img)
+
+    data = {
+        'image_type': 'base64',
+        'group_id': 'group001',
+        'user_id': 'usr001',
+        'image': image,
+        'detect_direction': 'true',
+        'recognize_granularity': 'small',
+        'language_type': 'CHN_ENG',
+    }
+
+    resp = requests.post(url, data=data, headers=headers).json()
+    error_msg = resp.get('error_msg')
+    if error_msg and 'internal error' in error_msg:
+        # Fall back to the general endpoint once.
+        resp = requests.post(url_general, data=data, headers=headers).json()
+        error_msg = resp.get('error_msg')
+    if error_msg:
+        raise Exception("ocr {}!".format(error_msg))
+
+    return resp.get('words_result')
+
+
+def resolve_json(words_result):
+    """Extract box corners from OCR results.
+
+    :param words_result: iterable of result dicts; entries without a ``location``
+        key are skipped
+    :return: ``(matrix_lt, matrix_rb)`` - two (n, 2) arrays holding the
+        ``[left, top]`` and ``[left + width, top + height]`` corner of every box
+
+    An input with no located entry yields two empty (0, 2) arrays; the previous
+    version raised IndexError there. The rows are collected in one list and
+    converted once instead of re-stacking the matrix for every box.
+    """
+    rows = [
+        [item['location'].get('left'), item['location'].get('top'),
+         item['location'].get('width'), item['location'].get('height')]
+        for item in words_result if 'location' in item
+    ]
+    if not rows:
+        return np.empty((0, 2), dtype=int), np.empty((0, 2), dtype=int)
+
+    matrix = np.array(rows)
+    matrix_lt = matrix[:, 0:2].copy()
+    matrix_rb = matrix_lt + matrix[:, 2:4]
+    return matrix_lt, matrix_rb
+
+
+def group_to_coordinate(group_list, matrix_lt, matrix_rb):
+    """Merge the boxes of each group into one enclosing box.
+
+    :param group_list: sequence of ``(start, end)`` row ranges; only those with
+        ``start < end`` are used
+    :param matrix_lt: (n, 2) array of left-top corners
+    :param matrix_rb: (n, 2) array of right-bottom corners
+    :return: list of ``[left, top, right, bottom]`` lists, one per used group
+
+    With no usable group the result is ``[]``. The previous version sliced its
+    one-dimensional seed row and returned ``[0, 0, 0]`` in that case.
+    """
+    boxes = []
+    for element in group_list:
+        start, end = element[0], element[1]
+        if start < end:
+            rb = matrix_rb[start:end].max(axis=0)
+            lt = matrix_lt[start:end].min(axis=0)
+            boxes.append(np.hstack([lt, rb]))
+    if not boxes:
+        return []
+    return np.vstack(boxes).tolist()
+
+
+def get_exam_box(img_raw_name, img_list, save_path, subject, access_token):
+    """OCR every image part, group the text into questions, and return the question boxes.
+
+    :param img_raw_name: name reported back in the result and in log messages
+    :param img_list: parts as dicts with ``x_bias``, ``y_bias`` and ``img_part``
+    :param save_path: path of the saved .jpg; the .txt backup and .xml output sit next to it
+    :param subject: subject passed to ``group_text``
+    :param access_token: OCR API access token
+    :return: ``(status, info)`` - ``status`` is 1 on success and 0 on failure;
+        ``info`` holds ``img_name``, ``coordinate`` and, on failure, ``error``
+    """
+    status = 1
+    error_info = ''
+    box_list = []
+    words_list_all = []
+    group_list_all = []
+    try:
+        for img_part in img_list:
+            x_bias = img_part['x_bias']
+            y_bias = img_part['y_bias']
+            part_payload = img_part['img_part']
+
+            words_list, matrix_lt, matrix_rb = get_ocr_text_and_coordinate(access_token, part_payload)
+
+            # Shift the part-local corners by the part's bias.
+            shifted_lt = matrix_lt + np.asarray([x_bias, y_bias])
+            shifted_rb = matrix_rb + np.asarray([x_bias, y_bias])
+
+            group_list = group_text(words_list, subject)
+            box_list = box_list + group_to_coordinate(group_list, shifted_lt, shifted_rb)
+
+            # Separator line between parts in the text backup.
+            words_list.append('********************************')
+            words_list_all.extend(words_list)
+            group_list_all.append(group_list)
+
+        # Best-effort text backup: a failure here is logged but does not fail the request.
+        try:
+            txt_backup_path = save_path.replace('.jpg', '.txt')
+            backup_lines = [line + ',\n' for line in words_list_all]
+            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
+                writer.write('subject:' + subject + '\n')
+                writer.write('[\n')
+                writer.writelines(backup_lines)
+                writer.write(']\n')
+                writer.write(str(group_list_all))
+            logger.info('{}试卷: {} 文本信息保存成功'.format(subject, img_raw_name))
+        except Exception as e:
+            logger.error('{}试卷: {} 文本信息保存失败: {}'.format(subject, img_raw_name, e), exc_info=True)
+
+        # Record the box coordinates in an xml file built from the template.
+        tree = ET.parse(r'./segment/exam_info/000000-template.xml')  # xml tree
+        for index_num, exam_bbox in enumerate(box_list):
+            tree = create_xml('{:02d}'.format(index_num), tree,
+                              exam_bbox[0], exam_bbox[1], exam_bbox[2], exam_bbox[3])
+        tree.write(save_path.replace('.jpg', '.xml'))
+
+    except Exception as e:
+        logger.error('{}试卷: {} 坐标生成失败: {}'.format(subject, img_raw_name, e), exc_info=True)
+        status = 0
+        error_info = error_info + str(e)
+
+    if error_info:
+        info = {'img_name': img_raw_name, 'coordinate': box_list, 'error': error_info}
+    else:
+        info = {'img_name': img_raw_name, 'coordinate': box_list}
+    logger.info('{} done'.format(img_raw_name))
+    return status, info
+
+
+def get_exam_ocr(img_raw_name, img_list, save_path, subject, access_token):
+    """OCR every image part and return the concatenated text lines.
+
+    :param img_raw_name: name reported back in the result and in log messages
+    :param img_list: parts as dicts with an ``img_part`` payload
+    :param save_path: path of the saved .jpg; a .txt backup is written next to it
+    :param subject: subject passed to ``get_ocr_text`` and written to the backup
+    :param access_token: OCR API access token
+    :return: ``(status, info)`` - ``status`` is 0 when no text at all was recognised;
+        ``info`` holds ``img_name``, ``text`` (each line ending in a newline on
+        success) and, if any part failed, ``error``
+    """
+    status = 1
+    error_info = ''
+    words_list = []
+
+    for img_part in img_list:
+        img = img_part['img_part']
+        try:
+            part_words_list = get_ocr_text(access_token, img, subject)
+        except Exception as e:
+            # A failing part contributes no text; its error is accumulated and reported.
+            part_words_list = []
+            error_info = error_info + str(e)
+        words_list = words_list + part_words_list
+
+    if len(words_list) < 1:
+        # No exception is active here, so exc_info would only append "NoneType: None".
+        logger.error('{}试卷: {} OCR解析失败: {}'.format(subject, img_raw_name, error_info))
+        status = 0
+    else:
+        # Best-effort text backup: a failure here is logged but does not change the status.
+        try:
+            txt_backup_path = save_path.replace('.jpg', '.txt')
+            words_list = [line + '\n' for line in words_list]
+            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
+                writer.writelines('subject:' + subject + '\n')
+                writer.writelines('[\n')
+                writer.writelines(words_list)
+                writer.writelines(']\n')
+
+            logger.info('{}试卷: {} 文本信息保存成功'.format(subject, img_raw_name))
+        except Exception as e:
+            logger.error('{}试卷: {} 文本信息保存失败: {}'.format(subject, img_raw_name, e), exc_info=True)
+
+    info = {'img_name': img_raw_name, 'text': words_list}
+    if error_info:
+        info['error'] = error_info
+    logger.info('{} done'.format(img_raw_name))
+    return status, info
+
+
+def get_exam_ocr_single(img_raw_name, img, save_path, subject, access_token):
+    status = 1
+    error_info = ''
+    words_list = []
+
+    try:
+        part_words_list = get_ocr_text(access_token, img)
+    except Exception as e:
+        part_words_list = []
+        error_info = error_info + str(e)
+    words_list = words_list + part_words_list
+
+    if len(words_list) < 1:
+        logger.error('{}试卷: {} OCR解析失败: {}'.format(subject, img_raw_name, error_info), exc_info=True)
+        status = 0
+
+    else:
+        try:
+            txt_backup_path = save_path.replace('.jpg', '.txt')
+            words_list = [line + ',\n' for line in words_list]
+            # # words_list.append(group_list)
+            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
+                writer.writelines('subject:' + subject + '\n')
+                writer.writelines('[\n')
+                writer.writelines(words_list)
+                writer.writelines(']\n')
+
+            logger.info('{}试卷: {} 文本信息保存成功'.format(subject, img_raw_name))
+        except Exception as e:
+            logger.error('{}试卷: {} 文本信息保存失败: {}'.format(subject, img_raw_name, e), exc_info=True)
+
+    info = {'img_name': img_raw_name, 'text': words_list}
+    if error_info:
+        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
+    logger.info('{} done'.format(img_raw_name))
+    return status, info
+
+
+def get_segment_by_ocr_once(opencv_img, token, subject, save_path, img_raw_name):
+    img = opencv2base64(opencv_img)
+    resp = get_ocr_text_and_coordinate_in_raw_format(token, img)
+    if len(opencv_img.shape) == 3:
+        opencv_img = cv2.cvtColor(opencv_img, cv2.COLOR_BGR2GRAY)
+    test_list = get_page_text(resp['words_result'], opencv_img)
+
+    status = 1
+    error_info = ''
+    box_list = []
+    words_list_all = []
+    group_list_all = []
+    try:
+        for one_page_text in test_list:
+            words_list = [word.get('words') for word in one_page_text]
+            matrix_lt, matrix_rb = resolve_json(one_page_text)
+
+            group_list = group_text(words_list, subject)
+            part_box_list = group_to_coordinate(group_list, matrix_lt, matrix_rb)
+            box_list = box_list + part_box_list
+
+            words_list.append('********************************')
+            words_list_all = words_list_all + words_list
+            group_list_all.append(group_list)
+        try:
+            txt_backup_path = save_path.replace('.jpg', '.txt')
+            words_list = [line + '\n' for line in words_list_all]
+            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
+                writer.writelines('subject:' + subject + '\n')
+                writer.writelines('[\n')
+                writer.writelines(words_list)
+                writer.writelines(']\n')
+                writer.writelines(str(group_list_all))
+            logger.info('{}试卷: {} 文本信息保存成功'.format(subject, img_raw_name))
+        except Exception as e:
+            logger.error('{}试卷: {} 文本信息保存失败: {}'.format(subject, img_raw_name, e), exc_info=True)
+
+        # 记录xml坐标信息
+        tree = ET.parse(r'./segment/exam_info/000000-template.xml')  # xml tree
+        for index_num, exam_bbox in enumerate(box_list):
+            tree = create_xml('{:02d}'.format(index_num), tree,
+                              exam_bbox[0], exam_bbox[1], exam_bbox[2], exam_bbox[3])
+        # print(exam_items_bbox)
+        tree.write(save_path.replace('.jpg', '.xml'))
+
+    except Exception as e:
+        logger.error('{}试卷: {} 坐标生成失败: {}'.format(subject, img_raw_name, e), exc_info=True)
+        status = 0
+        error_info = error_info + str(e)
+
+    info = {'img_name': img_raw_name, 'coordinate': box_list}
+    if error_info:
+        info = {'img_name': img_raw_name, 'coordinate': box_list, 'error': error_info}
+    logger.info('{} done'.format(img_raw_name))
+    return status, info
+
+
+# opencv_img, token, subject, save_path, img_raw_name
+def get_exam_ocr_once(opencv_img, token, subject, save_path, img_raw_name):
+    img = opencv2base64(opencv_img)
+    resp = get_ocr_text_and_coordinate_in_raw_format(token, img)
+    if len(opencv_img.shape) == 3:
+        opencv_img = cv2.cvtColor(opencv_img, cv2.COLOR_BGR2GRAY)
+    test_list = get_page_text(resp['words_result'], opencv_img)
+
+    words_list = []
+    for one_page_raw_text in test_list:
+        one_page_words_list = [word.get('words') for word in one_page_raw_text]
+        words_list = words_list + one_page_words_list
+
+    status = 1
+    error_info = ''
+
+    if len(words_list) < 1:
+        logger.error('{}试卷: {} OCR解析失败: {}'.format(subject, img_raw_name, error_info), exc_info=True)
+        status = 0
+
+    else:
+        try:
+            txt_backup_path = save_path.replace('.jpg', '.txt')
+            words_list = [line + '\n' for line in words_list]
+            # # words_list.append(group_list)
+            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
+                writer.writelines('subject:' + subject + '\n')
+                writer.writelines('[\n')
+                writer.writelines(words_list)
+                writer.writelines(']\n')
+
+            logger.info('{}试卷: {} 文本信息保存成功'.format(subject, img_raw_name))
+        except Exception as e:
+            logger.error('{}试卷: {} 文本信息保存失败: {}'.format(subject, img_raw_name, e), exc_info=True)
+
+    info = {'img_name': img_raw_name, 'text': words_list}
+    if error_info:
+        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
+    logger.info('{} done'.format(img_raw_name))
+    return status, info
+
+
+def save_pdf_image(pdf_file, subject, time_str):
+    name = pdf_file.name[:-4]
+    ext0 = pdf_file.name.split('.')[-1]
+
+    raw_name0 = ''.join([''.join(i) for i in pypinyin.pinyin(name, style=pypinyin.NORMAL)])
+    save_dir0 = os.sep.join(
+        [settings.MEDIA_ROOT, 'ocr', subject, time_str, raw_name0 + '_{}'.format(uuid.uuid4().hex[:10])])
+    if not os.path.exists(save_dir0):
+        os.makedirs(save_dir0)
+    pdf_path = os.sep.join([save_dir0, raw_name0 + '.' + ext0])
+    with open(pdf_path, 'wb') as pdfFileObj:
+        for chunk in pdf_file.chunks():
+            pdfFileObj.write(chunk)
+    images_list = convert_from_path(pdf_path, dpi=200, output_folder=save_dir0,
+                                    output_file='image',
+                                    first_page=None, last_page=None, fmt='JPEG')
+    upload_img_path_list = glob.glob(os.sep.join([save_dir0, '*.jpg']))
+    try:
+        images_list = [cv2.cvtColor(np.asarray(ele), cv2.COLOR_RGB2BGR) for ele in images_list]
+    except Exception:
+        images_list = [np.asarray(ele) for ele in images_list]
+    return upload_img_path_list, images_list
+
+
+def save_raw_image_without_segment_pdf(subject, datetime, raw_name, img_file, analysis_type):
+    # 随机生成新的图片名,自定义路径。
+    file_name = '{}_{}.{}'.format(raw_name, uuid.uuid4().hex[:10], 'jpg')
+    raw_img = Image.open(img_file)  # 读取上传的网络图像
+    save_dir = os.path.join(settings.MEDIA_ROOT, analysis_type, subject, datetime)
+
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    save_path = os.path.join(save_dir, file_name)
+
+    channels = raw_img.split()
+    if len(channels) > 3:
+        img = Image.merge("RGB", (channels[1], channels[2], channels[3]))
+        open_cv_image = np.array(img)
+        resize_img = resize_by_percent(open_cv_image, 0.5)
+
+    else:
+        img = raw_img
+        open_cv_image = np.array(img)
+        resize_img = resize_by_percent(open_cv_image, 0.5)
+
+    try:
+        img.save(save_path)
+        # write_single_img(resize_img, save_path.replace('.jpg', '_small.jpg'))
+    except Exception as e:
+        raise e
+
+    url_path = os.path.join(settings.MEDIA_URL, analysis_type, subject, datetime, file_name).replace('\\', '/')
+    return save_path, url_path, open_cv_image
+
+
+def get_exam_ocr_by_penguin(img_raw_name, raw_image, size, save_path, subject):
+    status = 1
+    error_info = ''
+    words_list = []
+
+    try:
+        words_list = get_ocr_english_text(raw_image, size)
+    except Exception as e:
+        error_info = error_info + str(e)
+
+    if len(words_list) < 1:
+        logger.error('{}试卷: {} OCR解析失败: {}'.format(subject, img_raw_name, error_info), exc_info=True)
+        status = 0
+
+    else:
+        try:
+            txt_backup_path = save_path.replace('.jpg', '.txt')
+            words_list = [line + '\n' for line in words_list]
+            # # words_list.append(group_list)
+            with open(txt_backup_path, 'w', encoding='utf-8') as writer:
+                writer.writelines('subject:' + subject + '\n')
+                writer.writelines('[\n')
+                writer.writelines(words_list)
+                writer.writelines(']\n')
+
+            logger.info('{}试卷: {} 文本信息保存成功'.format(subject, img_raw_name))
+        except Exception as e:
+            logger.error('{}试卷: {} 文本信息保存失败: {}'.format(subject, img_raw_name, e), exc_info=True)
+
+    info = {'img_name': img_raw_name, 'text': words_list}
+    if error_info:
+        info = {'img_name': img_raw_name, 'text': words_list, 'error': error_info}
+    logger.info('{} done'.format(img_raw_name))
+    return status, info

+ 3 - 0
segment/sheet_resolve/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py.py
+# @Time    : 2018/12/19 0019 下午 14:10

+ 3 - 0
segment/sheet_resolve/analysis/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py.py
+# @Time    : 2018/11/21 0021 下午 16:01

+ 3 - 0
segment/sheet_resolve/analysis/anchor/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py.py
+# @Time    : 2019/9/16 0016 下午 14:37

+ 644 - 0
segment/sheet_resolve/analysis/anchor/marker_detection.py

@@ -0,0 +1,644 @@
+import glob
+import os
+import math
+from .util import *
+import ctypes
+import time
+import sys
+import numpy as np
+try:
+    temp = ctypes.windll.LoadLibrary('opencv_ffmpeg410_64.dll')
+except:
+    pass
+
+
+def rotate_by_anchor(image, method='connected', debug=0):
+    #   寻找试卷大定位点并据此旋转纠偏
+
+    shift_threshold = 80   # 50    # 最上方两个定位点纵坐标相差阈值
+    pie = 3.14159
+    # height, width = image.shape[:2]
+    height, width = image.shape[:2]
+    # h_ratio = (0.1, 0.9)
+    area_threshold = 0.28   # 0.25
+    anchors_len_threshold = 2
+    # shape_para = {'height': (80, 10), 'w2h': (3, 1.1), 'area': (6000, 500), 'area_ratio': 0.5}
+
+    binary = pre_process_for_anchors(image, debug=0)
+    # h0 = int(binary.shape[0] * h_ratio[0])
+    # h1 = int(binary.shape[0] * h_ratio[1])
+    # binary[h0:h1, :] = 0
+    binary = extract_feature(binary, method=4, debug=0)
+    boxes = find_boxes(binary, method=method, debug=0)
+    markers = find_marker_by_shape(boxes, debug=0)
+    # 按面积从大到小排列
+    markers.sort(reverse=True, key=lambda x: x[-1])
+    anchors = []
+    for i in range(len(markers)):
+        anchors = []
+        anchors.append(markers[i])
+        for j in range(i+1, len(markers)):
+            if (anchors[0][-1] - markers[j][-1]) / anchors[0][-1] <= area_threshold:
+                anchors.append(markers[j])
+        if len(anchors) >= anchors_len_threshold:
+            break
+
+    anchors.sort(key=lambda x: x[4][1])
+    top_anchors = []
+    bottom_anchors = []
+    for a in anchors:
+        if a[4][1] - anchors[0][4][1] < shift_threshold:
+            top_anchors.append(a)
+        else:
+            bottom_anchors.append(a)
+
+    # draw_box(image, top_anchors, (0, 255, 255))
+    # draw_box(image, bottom_anchors, (255, 0, 255))
+    # plt.figure(figsize=(15, 10))
+    # plt.title(method)
+    # plt.imshow(image)
+    # plt.show()
+
+    #   旋转纠偏
+    if len(top_anchors) >= 2:
+        mean_y = sum([x[4][1] for x in top_anchors]) / len(top_anchors)
+        top_anchors.sort(key=lambda x: abs(x[4][1] - mean_y))
+        angle = 180 / pie * math.atan((top_anchors[1][4][1] - top_anchors[0][4][1]) / (top_anchors[1][4][0] -
+                                                                                       top_anchors[0][4][0]))
+    elif len(bottom_anchors) >= 2:
+        mean_y = sum([x[4][1] for x in bottom_anchors]) / len(bottom_anchors)
+        bottom_anchors.sort(key=lambda x: abs(x[4][1] - mean_y))
+        angle = 180 / pie * math.atan((bottom_anchors[1][4][1] - bottom_anchors[0][4][1]) / (bottom_anchors[1][4][0] -
+                                                                                             bottom_anchors[0][4][0]))
+    else:
+        return image, False
+    # try:
+    #     angle = 180 / pie * math.atan((top_anchors[1][4][1] - top_anchors[0][4][1]) / (top_anchors[1][4][0] -
+    #                                                                                    top_anchors[0][4][0]))
+    # except IndexError:
+    #     print('Not Enough top_anchors! Proceed any way!')
+    #     return image, 0
+    (cx, cy) = (width // 2, height // 2)
+    mat = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
+    # compute the new bounding dimensions of the image
+    cos = np.abs(mat[0, 0])
+    sin = np.abs(mat[0, 1])
+    new_width = int((height * sin) + (width * cos))
+    new_height = int((height * cos) + (width * sin))
+    # adjust the rotation matrix to take into account translation
+    mat[0, 2] += (new_width / 2) - cx
+    mat[1, 2] += (new_height / 2) - cy
+
+    rot_image = cv2.warpAffine(image, mat, (new_width, new_height), borderValue=(255, 255, 255))
+
+    if debug == 1:
+        draw_box(image, top_anchors, (0, 255, 255))
+        draw_box(image, bottom_anchors, (255, 0, 255))
+        plt.figure(figsize=(15, 10))
+        plt.title(method)
+        plt.imshow(image)
+        plt.show()
+
+    return rot_image, True
+
+
+def detect_anchor_by_position(anchors, markers, image, method, debug=0):
+    #   按默认位置寻找第一个及最后一个定位点
+    position_threshold = 300
+    h_ratio = (0.1, 0.9)
+    shift_shreshold = 50
+
+    height, width = image.shape[:2]
+    first_anchor, last_anchor = [], []
+    if method == 't':
+        top_left_flag, top_right_flag = height*h_ratio[0] + position_threshold, height*h_ratio[0] - position_threshold
+
+        if len(anchors) == 0:
+            for m in markers:
+                if m[4][0] < position_threshold and m[4][1] + m[4][0] < top_left_flag:
+                    top_left_flag = m[4][1] + m[4][0]
+                    first_anchor = m
+                elif width - m[4][0] < position_threshold and m[4][1] - m[4][0] < top_right_flag:
+                    top_right_flag = m[4][1] - m[4][0]
+                    last_anchor = m
+            if len(first_anchor) > 0:
+                anchors.append(first_anchor)
+            if len(last_anchor) > 0:
+                anchors.append(last_anchor)
+        else:
+            if anchors[0][4][0] > position_threshold:
+                for m in markers:
+                    if m[4][0] < position_threshold and m[4][1] + m[4][0] < top_left_flag:
+                        top_left_flag = m[4][1] + m[4][0]
+                        first_anchor = m
+                if len(first_anchor) > 0:
+                    anchors.insert(0, first_anchor)
+            if width - anchors[-1][4][0] > position_threshold:
+                for m in markers:
+                    if width - m[4][0] < position_threshold and m[4][1] - m[4][0] < top_right_flag:
+                        top_right_flag = m[4][1] - m[4][0]
+                        last_anchor = m
+                if len(last_anchor) > 0:
+                    anchors.append(last_anchor)
+    if method == 'b':
+        bottom_left_flag, bottom_right_flag = height*h_ratio[1]-position_threshold, height*h_ratio[1]+position_threshold
+
+        if len(anchors) == 0:
+            for m in markers:
+                if m[4][0] < position_threshold and m[4][1] - m[4][0] > bottom_left_flag:
+                    bottom_left_flag = m[4][1] - m[4][0]
+                    first_anchor = m
+                elif width - m[4][0] < position_threshold and m[4][1] + m[4][0] > bottom_right_flag:
+                    bottom_right_flag = m[4][1] + m[4][0]
+                    last_anchor = m
+            if len(first_anchor) > 0:
+                anchors.append(first_anchor)
+            if len(last_anchor) > 0:
+                anchors.append(last_anchor)
+        else:
+            if anchors[0][4][0] > position_threshold:
+                for m in markers:
+                    if m[4][0] < position_threshold and m[4][1] - m[4][0] > bottom_left_flag:
+                        bottom_left_flag = m[4][1] - m[4][0]
+                        first_anchor = m
+                if len(first_anchor) > 0:
+                    anchors.insert(0, first_anchor)
+            if width - anchors[-1][4][0] > position_threshold:
+                for m in markers:
+                    if width - m[4][0] < position_threshold and m[4][1] + m[4][0] > bottom_right_flag:
+                        bottom_right_flag = m[4][1] + m[4][0]
+                        last_anchor = m
+                if len(last_anchor) > 0:
+                    anchors.append(last_anchor)
+
+    if debug == 1:
+        draw_box(image, [first_anchor, last_anchor], debug=1)
+        plt.figure(figsize=(15, 10))
+        plt.title('anchor by position')
+        plt.imshow(image)
+        plt.show()
+
+
+def detect_anchor_public(image, method='connected', debug=0):
+    #   寻找第三方试卷最上方及最下方的定位点
+    shift_threshold = 50  # 80
+    height, width = image.shape[:2]
+    h0, h1 = 0.1, 0.9
+    pos = (0.1, 0.3, 0.5, 0.6, 0.8, 0.9)
+    pos_threshold = 0.1
+    area_threshold = 0.28  # 0.25   # 大定位点面积差阈值
+    anchors_len_threshold = 2
+    shape_para = {'height': (80, 10), 'w2h': (3, 0.6), 'area': (6000, 500), 'area_ratio': 0.9}
+    blur_size = 3
+    sigma = 5
+    shift_edge = 2  # 对定位点位置做微调偏移
+
+    binary = pre_process_for_anchors(image, debug=0, blur_size=blur_size, sigma=sigma)
+    binary = extract_feature(binary, method=4, debug=0)
+    boxes = find_boxes(binary, method=method, debug=0)
+    markers = find_marker_by_shape(boxes, shape_para=shape_para, debug=0)
+    marker_list = collect_markers_by_position(markers, method='h', shift_threshold=shift_threshold)
+
+    if len(marker_list) == 0:
+        anchors = []
+    elif len(marker_list) == 1:
+        anchors = marker_list[0]
+    else:
+        anchors = marker_list[0]
+        anchors.extend(marker_list[-1])
+    anchors = [[a[0]-shift_edge, a[1]-shift_edge, a[2]-shift_edge-1, a[3]-shift_edge-1] for a in anchors]
+
+    if debug == 1:
+        # print(anchors)
+        draw_box(image, anchors, (0, 255, 255), debug=1)
+        plt.figure(figsize=(15, 10))
+        plt.title(method)
+        plt.imshow(image, cmap='gray')
+        plt.show()
+    elif debug == 2:
+        markers.sort(reverse=True, key=lambda x: x[4][1])
+        draw_box(image, markers, debug=1)
+        plt.figure(figsize=(15, 10))
+        plt.title('by shape')
+        plt.imshow(image, cmap='gray')
+        plt.show()
+
+    return anchors
+
+
+def detect_anchor(image, method='connected', debug=0):
+    #   寻找试卷最上方及最下方的定位点
+
+    shift_threshold = 50    # 80
+    height, width = image.shape[:2]
+    h0, h1 = 0.1, 0.9
+    pos = (0.1, 0.3, 0.5, 0.6, 0.8, 0.9)
+    pos_threshold = 0.1
+    area_threshold = 0.28   # 0.25   # 大定位点面积差阈值
+    anchors_len_threshold = 2
+
+    binary = pre_process_for_anchors(image, debug=0)
+    binary = extract_feature(binary, method=4, debug=0)
+    boxes = find_boxes(binary, method=method, debug=0)
+    markers = find_marker_by_shape(boxes, debug=0)
+    marker_list = collect_markers_by_position(markers, method='h', shift_threshold=shift_threshold)
+    # print(len(marker_list[0]))
+    # print(len(marker_list[-1]))
+    if len(marker_list) == 0:
+        return []
+
+    #   如果上方定位点不多于下方定位点,上下左右翻转答题卡
+    if len(marker_list[0]) < len(marker_list[-1]) and len(marker_list[-1]) >= 6:
+        image = cv2.flip(image, -1)
+        binary = cv2.flip(binary, -1)
+        boxes = find_boxes(binary, method=method, debug=0)
+        markers = find_marker_by_shape(boxes, debug=0)
+        marker_list = collect_markers_by_position(markers, method='h', shift_threshold=shift_threshold)
+
+    # for m in marker_list:
+    #     m.sort(key=lambda x: x[4][0])
+    top_anchors = []
+    bottom_anchors = []
+    version_points = []
+    top_y = np.mean(np.array([m[4][1] for m in marker_list[0]]))
+    bottom_y = np.mean(np.array([m[4][1] for m in marker_list[-1]]))
+    top_index, bottom_index = 0, -1
+
+    try:
+        bottom_anchors.append(min([m for m in marker_list[bottom_index] if m[4][0] / width < pos[0]],
+                                  key=lambda x: abs(x[4][1] - bottom_y)))
+    except ValueError:
+        pass
+    try:
+        bottom_anchors.append(min([m for m in marker_list[bottom_index] if m[4][0] / width > pos[-1]],
+                                  key=lambda x: abs(x[4][1] - bottom_y)))
+    except ValueError:
+        pass
+
+    top_list = [m for m in marker_list[top_index] if m[4][0] / width < pos[0]]
+    if len(top_list) > 0:
+        top_anchors.append(min(top_list, key=lambda x: abs(x[4][1] - top_y)))
+    top_list = [m for m in marker_list[top_index] if abs(m[4][0] / width - pos[1]) < pos_threshold]
+    if len(top_list) > 0:
+        top_anchors.append(min(top_list, key=lambda x: abs(x[4][1] - top_y)))
+        top_list = [m for m in marker_list[top_index] if abs(m[4][0] / width - pos[3]) < pos_threshold]
+        if len(top_list) > 0:
+            top_anchors.append(min(top_list, key=lambda x: abs(x[4][1] - top_y)))
+    else:
+        top_list = [m for m in marker_list[top_index] if abs(m[4][0] / width - pos[2]) < pos_threshold]
+        if len(top_list) > 0:
+            top_anchors.append(min(top_list, key=lambda x: abs(x[4][1] - top_y)))
+        else:
+            top_list = [m for m in marker_list[top_index] if abs(m[4][0] / width - pos[3]) < pos_threshold]
+            if len(top_list) > 0:
+                top_anchors.append(min(top_list, key=lambda x: abs(x[4][1] - top_y)))
+
+    try:
+        top_anchors.append(min([m for m in marker_list[top_index] if m[4][0] / width > pos[-1]],
+                               key=lambda x: abs(x[4][1] - top_y) - x[4][0]))
+    except ValueError:
+        pass
+    # for m in marker_list[-1]:
+    #     if m[4][0] / width < pos[0]:
+    #         bottom_anchors[0] = m
+    #     elif m[4][0] / width > pos[-1]:
+    #         bottom_anchors[1] = m
+    # for m in marker_list[0]:
+    #     if m[4][0] / width > pos[-1]:
+    #         top_anchors[-1] = m
+    #     elif abs(m[4][0] / width - pos[2]) < pos_threshold:
+    #         top_anchors[1] = m
+    #         top_anchors.pop(2)
+    #     elif abs(m[4][0] / width - pos[1]) < pos_threshold:
+    #         top_anchors[1] = m
+    #     elif abs(m[4][0] / width - pos[3]) < pos_threshold:
+    #         top_anchors[2] = m
+
+    # for i in range(1, len(marker_list[0])+1):
+    #     if marker_list[0][len(marker_list[0])-i][4][0] / width < pos[0]:
+    #         top_anchors[0] = marker_list[0][-i]
+    #         break
+
+
+    # # 按面积从大到小排列
+    # markers.sort(reverse=True, key=lambda x: x[-1])
+    # flag = 0
+    # anchors = []
+    # for i in range(len(markers)):
+    #     anchors = []
+    #     anchors.append(markers[i])
+    #     for j in range(i+1, len(markers)):
+    #         if (anchors[0][-1] - markers[j][-1]) / anchors[0][-1] <= area_threshold:
+    #             anchors.append(markers[j])
+    #             flag = j
+    #     if len(anchors) >= anchors_len_threshold:
+    #         break
+    # if len(anchors) == 0:
+    #     return [[], [], []]
+    # anchors.sort(key=lambda x: x[4][1])
+    # # print('anchors\n')
+    # # print(anchors)
+    # top_anchors = []
+    # bottom_anchors = []
+    # for a in anchors:
+    #     if a[4][1] - anchors[0][4][1] < shift_threshold:
+    #         top_anchors.append(a)
+    #     elif anchors[-1][4][1] - a[4][1] < shift_threshold:
+    #         bottom_anchors.append(a)
+    # top_anchors.sort(key=lambda x: x[4][0])
+    # bottom_anchors.sort(key=lambda x: x[4][0])
+    # detect_anchor_by_position(top_anchors, markers, image, method='t', debug=0)
+    # detect_anchor_by_position(bottom_anchors, markers, image, method='b', debug=0)
+    #
+    # version_points = []
+    # for i in range(flag + 1, len(markers)):
+    #     if abs(markers[i][4][1] - anchors[0][4][1]) <= shift_threshold and markers[i][4][0] > width / 2:
+    #         version_points.append(markers[i])
+
+    if debug == 1:
+        draw_box(image, top_anchors, (0, 255, 255), debug=1)
+        draw_box(image, bottom_anchors, (255, 0, 255), debug=1)
+        draw_box(image, version_points, (255, 255, 0), debug=1)
+        plt.figure(figsize=(15, 10))
+        plt.title(method)
+        plt.imshow(image)
+        plt.show()
+    elif debug == 2:
+        markers.sort(reverse=True, key=lambda x: x[4][1])
+        draw_box(image, markers, debug=1)
+        plt.figure(figsize=(15, 10))
+        plt.title('by shape')
+        plt.imshow(image)
+        plt.show()
+    elif debug == 3:
+        colors = ((0, 255, 255), (255, 0, 255), (255, 255, 0))
+        if len(marker_list) > 0:
+            c = 0
+            for p in marker_list:
+                draw_box(image, p, color=colors[c])
+                c = (c + 1) % 3
+            plt.figure(figsize=(15, 10))
+            plt.title('marker list')
+            plt.imshow(image)
+            plt.show()
+
+    return [[top_anchors, bottom_anchors, version_points], image]
+
+
+def detect_problem_marker(image, anchors, method='connected', column_num=2, debug=0):
+    #   寻找题目定位点
+    double_page_width_ratio = 0.42      # 默认双栏宽度比例
+    three_page_width_ratio = 0.29       # 默认三栏宽度比例
+    double_page_separation = 250        # 默认双栏栏间间距
+    three_page_separation = 100         # 默认三栏栏间间距
+    horizontal_threshold = 100           # 单栏宽度比例阈值
+    shift_threshold = 80    # 50
+    col_threshold = 100
+    blank1 = 20
+    blank2 = 100
+    # shape_para = {'height': (80, 10), 'w2h': (3, 0.5), 'area': (6000, 500), 'area_ratio': 0.5}
+    colors = ((0, 255, 255), (255, 0, 255), (255, 255, 0))
+    height, width = image.shape[:2]
+    remove_iteration = 1                # 去除异常值循环次数
+
+    #   确定单栏宽度及每栏定位
+    page_width, column_num, column_pos = find_column(anchors, width, column_num, debug=0)
+
+    #   清除答题卡上下方定位点
+    top_anchors, bottom_anchors = anchors[:2]
+    if len(top_anchors) > 0:
+        blank_top = max(top_anchors, key=lambda x: x[3])[3] + blank1
+    else:
+        blank_top = blank2
+    if len(bottom_anchors) > 0:
+        blank_bottom = min(bottom_anchors, key=lambda x: x[1])[1] - blank1
+    else:
+        blank_bottom = height - blank2
+    binary = pre_process(image, blank_top, blank_bottom, debug=0)
+
+    #   寻找所有可能的题目定位点
+    binary = extract_feature(binary, method=3, debug=0)
+    boxes = find_boxes(binary, method=method, debug=0)
+    marker_candidates = find_marker_by_shape(boxes, debug=0)
+    marker_candidates.sort(key=lambda x: x[4][0])
+
+    # #   寻找配对点
+    # pair_list = []
+    # pair_list_indexes = []
+    # for i in range(len(marker_candidates)):
+    #     if i not in pair_list_indexes:
+    #         p, p_index = find_pair(marker_candidates[i], marker_candidates, page_width, horizontal_threshold)
+    #         if p_index >= 0:
+    #             pair_list.append([marker_candidates[i], p])
+    #             pair_list_indexes.append(p_index)
+    # if len(pair_list) > 0:
+    #     pair_list.sort(key=lambda x: x[0][4][0])
+    #
+    # #   按栏寻找题目定位点
+    # if column_num == 2:
+    #     problem_markers = [[], []]      # 题目定位点分两栏排列
+    #     for p in pair_list:
+    #         if abs(p[0][4][0] - column_pos[0]) < shift_threshold:
+    #             problem_markers[0].extend(p)
+    #         elif abs(p[0][4][0] - column_pos[1]) < shift_threshold:
+    #             problem_markers[1].extend(p)
+    #
+    # elif column_num == 3:
+    #     problem_markers = [[], [], []]  # 题目定位点分三栏排列
+    #     for p in pair_list:
+    #         if abs(p[0][4][0] - column_pos[0]) < shift_threshold:
+    #             problem_markers[0].extend(p)
+    #         elif abs(p[0][4][0] - column_pos[1]) < shift_threshold:
+    #             problem_markers[2].extend(p)
+    #         elif abs(p[0][4][0] - column_pos[0] - page_width - three_page_separation) < shift_threshold:
+    #             problem_markers[1].extend(p)
+    #
+    # #   剔除异常mark
+    # for i in range(len(problem_markers)):
+    #     problem_markers[i] = remove_abnormal_marker(problem_markers[i], debug=0)
+
+    #   将定位点按垂直位置排列
+    marker_list = collect_markers_by_position(marker_candidates, method='v', debug=0)
+    if column_num == 2:
+        problem_markers = [[], []]
+    elif column_num == 3:
+        problem_markers = [[], [], []]
+    if len(marker_list) == 0:
+        return problem_markers
+
+    #   将题目定位点配对
+    # for col in marker_list:
+    #     col.sort(key=lambda x: x[4][1])
+    mid_left_col_pos, mid_right_col_pos = column_pos[0] + page_width, column_pos[1] - page_width
+    mid_left_col_indexes, mid_right_col_indexes = [], []
+    for col_index, col in enumerate(marker_list):
+        if abs(col[0][4][0] - column_pos[0]) < col_threshold:
+            pair_list_index = find_pair_list(col, marker_list, page_width, col_threshold)[1]
+            if pair_list_index >= 0:
+                for c in col:
+                    pair, pair_index, distance = find_pair(c, marker_list[pair_list_index], page_width, col_threshold)
+                    if pair_index >= 0:
+                        problem_markers[0].extend([c, pair])
+                        mid_left_col_indexes.append(pair_list_index)
+                        # print('distance', col_index, distance)
+                        if pair[4][0] < mid_left_col_pos:
+                            mid_left_col_pos = pair[4][0]
+        elif abs(col[0][4][0] - column_pos[1]) < col_threshold:
+            pair_list_index = find_pair_list(col, marker_list, page_width, col_threshold)[1]
+            if pair_list_index >= 0:
+                for c in col:
+                    pair, pair_index, distance = find_pair(c, marker_list[pair_list_index], page_width, col_threshold)
+                    if pair_index >= 0:
+                        problem_markers[-1].extend([c, pair])
+                        mid_right_col_indexes.append(col_index)
+                        # print('distance', col_index, distance)
+                        if c[4][0] - page_width > mid_right_col_pos:
+                            mid_right_col_pos = c[4][0] - page_width
+    mid_left_col_indexes = set(mid_left_col_indexes)
+    mid_right_col_indexes = set(mid_right_col_indexes)
+    # print(mid_left_col_indexes, mid_left_col_pos)
+    # print(mid_right_col_indexes, mid_right_col_pos)
+    if column_num == 3:
+        for col_index, col in enumerate(marker_list):
+            if mid_left_col_pos < col[0][4][0] < mid_right_col_pos and (col_index not in mid_left_col_indexes):
+                pair_list_index = find_pair_list(col, marker_list, page_width, col_threshold)[1]
+                if pair_list_index not in mid_right_col_indexes:
+                    for c in col:
+                        pair, pair_index, distance = find_pair(c, marker_list[pair_list_index], page_width,
+                                                               col_threshold)
+                        if pair_index >= 0:
+                            problem_markers[1].extend([c, pair])
+                            # print('distance', col_index, distance)
+
+    for index, p in enumerate(problem_markers):
+        temp = [[p[2*i], p[2*i+1]] for i in range(len(p)//2)]
+        temp.sort(key=lambda x: x[0][4][1])
+        problem_markers[index] = []
+        for t in temp:
+            problem_markers[index].extend(t)
+    #   剔除异常mark
+    for i in range(len(problem_markers)):
+        for j in range(remove_iteration):
+            problem_markers[i], page_width = remove_abnormal_marker(problem_markers[i], page_width, debug=0)
+    problem_markers = check_with_anchor(problem_markers, top_anchors, page_width, column_num)
+
+    # pair_list = find_pair_list(marker_list[0], marker_list, page_width)
+    # if pair_list:
+    #     page_width = abs(marker_list[0][0][4][0] - pair_list[0][4][0])
+    # abscissa_flag = 0
+    # for prob in marker_list:
+    #     if prob[0][4][0] >= abscissa_flag:
+    #         pair_list = find_pair_list(prob, marker_list, page_width)
+    #         if pair_list:
+    #             for p in prob:
+    #                 pair = find_pair(p, pair_list, page_width)
+    #                 if pair:
+    #                     problem_markers.extend(pair)
+    #                     abscissa_flag = pair[1][4][0] + shift_threshold
+
+    if debug == 1:
+        print(page_width, column_num, column_pos)
+        c = 0
+        for p in problem_markers:
+            print('column:', c)
+            draw_box(image, p, color=colors[c], debug=1)
+            c = (c + 1) % 3
+            t = [(p[2*i+2][4][0]-p[2*i][4][0])/(p[2*i+2][4][1]-p[2*i][4][1]) for i in range(len(p)//2-1)]
+            print('slope', t)
+        # draw_box(image, problem_markers)
+        plt.figure(figsize=(12, 8))
+        plt.title('problem markers')
+        plt.imshow(image)
+        plt.show()
+    elif debug == 2:
+        print(page_width, column_num, column_pos)
+        if len(marker_list) > 0:
+            c = 0
+            for p in marker_list:
+                draw_box(image, p, color=colors[c], debug=1)
+                c = (c + 1) % 3
+            plt.figure(figsize=(15, 10))
+            plt.title('marker list')
+            plt.imshow(image)
+            plt.show()
+
+        # c = 0
+        # for p in pair_list:
+        #     draw_box(image, p, color=colors[c])
+        #     c = (c + 1) % 3
+        # plt.figure(figsize=(15, 10))
+        # plt.title('pair list')
+        # plt.imshow(image)
+        # plt.show()
+    elif debug == 3:
+        draw_box(image, marker_candidates)
+        plt.figure(figsize=(15, 10))
+        plt.title('marker candidates')
+        plt.imshow(image)
+        plt.show()
+    elif debug == 4:
+        draw_box(image, boxes, debug=1)
+        plt.figure(figsize=(15, 10))
+        plt.title('boxes')
+        plt.imshow(image)
+        plt.show()
+
+    return problem_markers
+
+
+def main(img_file, method='connected', debug=0):
+    """Detect the page anchors and the problem markers of one sheet image.
+
+    The image is loaded with ``read_single_img``, passed through
+    ``rotate_by_anchor`` and ``detect_anchor``, and the resulting image and
+    anchors are handed to ``detect_problem_marker``.
+
+    :param img_file: path of the image file to analyse.
+    :param method: extraction method forwarded unchanged to the detection helpers.
+    :param debug: when ``1``, draw the anchors and markers on the rotated image
+        and display it with matplotlib.
+    :return: tuple ``(anchors, problem_markers)``.
+    """
+    palette = ((0, 255, 255), (255, 0, 255), (255, 255, 0))
+    source_image = read_single_img(img_file)
+
+    # The second value returned by rotate_by_anchor is not acted upon here.
+    rot_image, _flag = rotate_by_anchor(source_image, method=method, debug=0)
+    anchors, rot_image = detect_anchor(rot_image, method=method, debug=0)
+
+    # Locate the per-problem markers on the rotated sheet.
+    problem_markers = detect_problem_marker(rot_image, anchors, method=method, debug=0)
+
+    if debug == 1:
+        # The first three anchor groups each get their own colour.
+        for group_index, group_color in enumerate(palette):
+            draw_box(rot_image, anchors[group_index], color=group_color)
+        for marker_group in problem_markers:
+            draw_box(rot_image, marker_group)
+        plt.figure(figsize=(12, 8))
+        plt.title('markers')
+        plt.imshow(rot_image)
+        plt.show()
+
+    return anchors, problem_markers
+
+
+def find_anchor(image, method='connected'):
+    """Detect anchors in ``image`` and describe each one as a labelled box.
+
+    :param image: image forwarded to ``detect_anchor_public``.
+    :param method: extraction method forwarded to ``detect_anchor_public``.
+    :return: list of dicts ``{'class_name': 'anchor_point', 'bounding_box': {...}}``
+        where the bounding box holds ``xmin``/``ymin``/``xmax``/``ymax`` taken
+        from the first four entries of each detected anchor.
+    """
+    corner_keys = ('xmin', 'ymin', 'xmax', 'ymax')
+    return [
+        {'class_name': 'anchor_point',
+         # int(str(...)) is kept as in the original conversion of the coordinates.
+         'bounding_box': {key: int(str(anchor[position])) for position, key in enumerate(corner_keys)}}
+        for anchor in detect_anchor_public(image, method=method, debug=0)
+    ]

+ 624 - 0
segment/sheet_resolve/analysis/anchor/util.py

@@ -0,0 +1,624 @@
+import cv2
+import matplotlib.pylab as plt
+import numpy as np
+
+
+def read_single_img(img_path):
+    """Read an image file into an array.
+
+    The raw bytes are loaded with ``numpy.fromfile`` and decoded with
+    ``cv2.imdecode`` using ``cv2.IMREAD_UNCHANGED`` (the value ``-1``), so the
+    channel count and bit depth stored in the file are kept.
+    NOTE(review): this is presumably done instead of ``cv2.imread`` so that
+    paths with non-ASCII characters work -- confirm.
+
+    :param img_path: path of the image file.
+    :return: the decoded image; per the OpenCV documentation ``imdecode``
+        yields an empty result (``None`` in Python) for undecodable data.
+    :raises FileNotFoundError: propagated from ``numpy.fromfile`` when the path
+        does not exist.
+    """
+    return cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), cv2.IMREAD_UNCHANGED)
+
+
+def pre_process(image, blank_top=20, blank_bottom=-20, blur_size=5, sigma=5, debug=0, blank_size=20):
+    """Return the inverted, Otsu-thresholded binary version of ``image``.
+
+    The margins of the grayscale image are painted white, the image is
+    inverted, smoothed with a Gaussian blur and binarised with Otsu's method.
+
+    :param image: 2-D grayscale or 3-D colour array. The input is never modified.
+    :param blank_top: number of rows blanked at the top.
+    :param blank_bottom: (negative) row index from which the bottom is blanked.
+    :param blur_size: side of the square Gaussian kernel.
+    :param sigma: Gaussian sigma.
+    :param debug: ``1`` shows the grayscale and binary images with matplotlib.
+    :param blank_size: number of columns blanked on the left and right
+        (previously hard-coded to 20).
+    :return: binary image in which originally dark pixels are 255.
+    :raises ValueError: if ``image`` is neither 2-D nor 3-D.
+    """
+    if image.ndim == 3:
+        # NOTE(review): images decoded by OpenCV are BGR; RGB2GRAY then swaps
+        # the red/blue weights. Kept as-is to preserve existing results.
+        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+    elif image.ndim == 2:
+        # Copy so that blanking the margins does not alter the caller's array.
+        gray = image.copy()
+    else:
+        raise ValueError('image must be a 2-D or 3-D array, got ndim={}'.format(image.ndim))
+
+    # Blank the page margins.
+    gray[0:blank_top, :] = 255
+    gray[blank_bottom:, :] = 255
+    gray[:, 0:blank_size] = 255
+    gray[:, -blank_size:] = 255
+
+    pre = 255 - gray
+    pre = cv2.GaussianBlur(pre, (blur_size, blur_size), sigma)
+    binary = cv2.threshold(pre, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)[1]
+
+    if debug == 1:
+        plt.figure(figsize=(15, 10))
+        plt.subplot(211)
+        plt.title('gray')
+        plt.imshow(gray, cmap='gray')
+        plt.subplot(212)
+        plt.title('binary')
+        plt.imshow(255 - binary, cmap='gray')
+        plt.show()
+
+    return binary
+
+
+def pre_process_for_anchors(image, blank_top=20, blank_bottom=-20, blur_size=5, sigma=10, blank_size=20, debug=0):
+    """Binarise only the top and bottom bands of the page.
+
+    Works like ``pre_process`` but additionally paints the rows between 10 %
+    and 90 % of the image height white, so only content near the top and
+    bottom edges survives. Returns the inverted Otsu-thresholded image.
+
+    :param image: 2-D grayscale or 3-D colour array (a 2-D input is copied).
+    :param blank_top: number of rows blanked at the top.
+    :param blank_bottom: (negative) row index from which the bottom is blanked.
+    :param blur_size: side of the square Gaussian kernel.
+    :param sigma: Gaussian sigma.
+    :param blank_size: number of columns blanked on the left and right.
+    :param debug: ``1`` shows the grayscale and binary images with matplotlib.
+    """
+    rows = image.shape[0]
+    band_start = int(rows * 0.1)
+    band_end = int(rows * 0.9)
+
+    if image.ndim == 3:
+        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+    elif image.ndim == 2:
+        gray = image.copy()
+
+    # Margins first, then the middle band of the page.
+    white_regions = (
+        (slice(0, blank_top), slice(None)),
+        (slice(blank_bottom, None), slice(None)),
+        (slice(None), slice(0, blank_size)),
+        (slice(None), slice(-blank_size, None)),
+        (slice(band_start, band_end), slice(None)),
+    )
+    for region in white_regions:
+        gray[region] = 255
+
+    blurred = cv2.GaussianBlur(255 - gray, (blur_size, blur_size), sigma)
+    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)
+
+    if debug == 1:
+        plt.figure(figsize=(15, 10))
+        for slot, caption, picture in ((211, 'gray', gray), (212, 'binary', 255 - binary)):
+            plt.subplot(slot)
+            plt.title(caption)
+            plt.imshow(picture, cmap='gray')
+        plt.show()
+
+    return binary
+
+
+def extract_feature(binary, method=4, ker_size1=2, ker_size2=10, debug=0):
+    """Clean up a binary image with a fixed sequence of morphological operations.
+
+    ``method`` selects the sequence:
+      1 -- close with the 5x1 kernel, then open with the wide kernel;
+      2 -- open with the 5x1 kernel, then close with the 3x3 kernel;
+      3 -- open with the tall kernel, then open with the wide kernel;
+      4 -- close with the 3x3 kernel, then the two openings of method 3;
+      anything else returns ``binary`` unchanged.
+
+    :param binary: binary image to process.
+    :param ker_size1: short side of the two elongated kernels.
+    :param ker_size2: long side of the two elongated kernels.
+    :param debug: ``1`` shows the image before and after with matplotlib.
+    """
+    # OpenCV takes ksize as (width, height).
+    square_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
+    strip_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 1))
+    # ker_size1 wide, ker_size2 tall (this one was called "horizontal_kernel").
+    tall_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (ker_size1, ker_size2))
+    # ker_size2 wide, ker_size1 tall (this one was called "vertical_kernel").
+    wide_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (ker_size2, ker_size1))
+
+    pipelines = {
+        1: ((cv2.MORPH_CLOSE, strip_kernel), (cv2.MORPH_OPEN, wide_kernel)),
+        2: ((cv2.MORPH_OPEN, strip_kernel), (cv2.MORPH_CLOSE, square_kernel)),
+        3: ((cv2.MORPH_OPEN, tall_kernel), (cv2.MORPH_OPEN, wide_kernel)),
+        4: ((cv2.MORPH_CLOSE, square_kernel), (cv2.MORPH_OPEN, tall_kernel), (cv2.MORPH_OPEN, wide_kernel)),
+    }
+
+    ret = binary
+    for operation, element in pipelines.get(method, ()):
+        ret = cv2.morphologyEx(ret, operation, element)
+
+    if debug == 1:
+        plt.figure(figsize=(15, 10))
+        plt.subplot(211)
+        plt.title('before feature extraction')
+        plt.imshow(255 - binary, cmap='gray')
+        plt.subplot(212)
+        plt.title('after feature extraction')
+        plt.imshow(255 - ret, cmap='gray')
+        plt.show()
+
+    return ret
+
+
+def draw_contour(binary):
+    """Return the bounding boxes of the external contours of ``binary``.
+
+    :param binary: binary image passed to ``cv2.findContours``.
+    :return: list of ``[xmin, ymin, xmax, ymax, [cx, cy], area]`` where the
+        centroid is the centre of the bounding rectangle and ``area`` is the
+        area of the bounding rectangle (``w * h``), not of the contour.
+    """
+    found = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    # OpenCV 3 returns (image, contours, hierarchy) while OpenCV 2 and 4 return
+    # (contours, hierarchy): the contours are always the second-to-last item,
+    # so no parsing of cv2.__version__ is needed.
+    contours = found[-2]
+
+    boxes = []
+    for contour in contours:
+        xmin, ymin, w, h = cv2.boundingRect(contour)
+        boxes.append([xmin, ymin, xmin + w, ymin + h, [xmin + w // 2, ymin + h // 2], w*h])
+
+    return boxes
+
+
+def draw_connected_component(binary):
+    """Return one box per 8-connected foreground component of ``binary``.
+
+    :param binary: binary image passed to ``cv2.connectedComponentsWithStats``.
+    :return: list of ``[xmin, ymin, xmax, ymax, [cx, cy], area]`` with the
+        centroid truncated to integers and ``area`` taken from ``CC_STAT_AREA``.
+    """
+    label_count, _label_map, stats, centroids = cv2.connectedComponentsWithStats(binary, connectivity=8)
+
+    boxes = []
+    # Label 0 is skipped; OpenCV documents it as the background label.
+    for label in range(1, label_count):
+        left = stats[label, cv2.CC_STAT_LEFT]
+        top = stats[label, cv2.CC_STAT_TOP]
+        right = stats[label, cv2.CC_STAT_WIDTH] + left
+        bottom = stats[label, cv2.CC_STAT_HEIGHT] + top
+        pixel_count = stats[label, cv2.CC_STAT_AREA]
+        center = [int(centroids[label][0]), int(centroids[label][1])]
+        boxes.append([left, top, right, bottom, center, pixel_count])
+    return boxes
+
+
+def find_boxes(binary, method='connected', debug=0):
+    """Extract candidate boxes from a binary image.
+
+    :param binary: binary image to analyse.
+    :param method: ``'contour'`` uses ``draw_contour``; ``'connected'`` uses
+        ``draw_connected_component``.
+    :param debug: ``1`` sorts the boxes by centroid y and prints their geometry.
+    :return: list of ``[xmin, ymin, xmax, ymax, [cx, cy], area]`` boxes.
+    :raises ValueError: if ``method`` is not one of the two supported values
+        (previously this surfaced as an ``UnboundLocalError``).
+    """
+    if method == 'contour':
+        boxes = draw_contour(binary)
+    elif method == 'connected':
+        boxes = draw_connected_component(binary)
+    else:
+        raise ValueError("unknown method {!r}; expected 'contour' or 'connected'".format(method))
+
+    if debug == 1:
+        boxes.sort(key=lambda x: x[4][1])
+        for box in boxes:
+            width = box[2] - box[0]
+            height = box[3] - box[1]
+            w_to_h = width / height
+            area = box[-1]
+            centroid = box[-2]
+            area_ratio = area / (width * height)
+            print('width:{}, height:{}, centroid:{}, w_to_h:{}, area:{}, area ratio:{}'.
+                  format(width, height, centroid, w_to_h, area, area_ratio))
+    return boxes
+
+
+def find_marker_by_shape(boxes,
+                         shape_para={'height': (80, 10), 'w2h': (3, 0.6), 'area': (6000, 500), 'area_ratio': 0.5},
+                         debug=0):
+    """Select the boxes whose shape looks like a marker.
+
+    A box ``[xmin, ymin, xmax, ymax, centroid, area]`` is accepted when its
+    area lies inside the ``'area'`` range and either
+      * it fills at least 96 % of its bounding rectangle, or
+      * its height and width/height ratio lie inside the ``'height'`` and
+        ``'w2h'`` ranges and it fills at least ``'area_ratio'`` of its
+        bounding rectangle.
+
+    :param boxes: iterable of boxes.
+    :param shape_para: dict with ``(max, min)`` tuples for ``'height'``,
+        ``'w2h'`` and ``'area'`` plus the scalar ``'area_ratio'``. It is only
+        read, never modified.
+    :param debug: ``1`` sorts the result by area (descending) and prints it;
+        ``2`` appends *every* input box to the result before printing, which
+        changes the returned list.
+    :return: list of accepted boxes.
+    """
+    solid_fill_ratio = 0.96
+    height_max, height_min = shape_para['height']
+    ratio_max, ratio_min = shape_para['w2h']
+    area_max, area_min = shape_para['area']
+    fill_min = shape_para['area_ratio']
+
+    def _report(entry):
+        # Print the geometry of one box.
+        width = entry[2] - entry[0]
+        height = entry[3] - entry[1]
+        w_to_h = width / height
+        area = entry[-1]
+        centroid = entry[-2]
+        area_ratio = area / (width * height)
+        print('width:{}, height:{}, centroid:{}, w_to_h:{}, area:{}, area ratio:{}'.
+              format(width, height, centroid, w_to_h, area, area_ratio))
+
+    markers = []
+    for box in boxes:
+        box_w = box[2] - box[0]
+        box_h = box[3] - box[1]
+        box_area = box[-1]
+
+        # Nearly solid rectangle of acceptable size.
+        if box_area >= solid_fill_ratio*box_w*box_h and area_min <= box_area <= area_max:
+            markers.append(box)
+            continue
+        # Otherwise every shape constraint has to hold.
+        if not height_min <= box_h <= height_max:
+            continue
+        if not ratio_min <= box_w/box_h <= ratio_max:
+            continue
+        if area_min <= box_area <= area_max and box_area >= fill_min*box_w*box_h:
+            markers.append(box)
+
+    if debug == 1:
+        markers.sort(reverse=True, key=lambda x: x[-1])
+        for entry in markers:
+            _report(entry)
+    elif debug == 2:
+        markers.extend(boxes)
+        for entry in markers:
+            _report(entry)
+
+    return markers
+
+
+def find_box_list_by_position(box, box_list, method='h', shift_threshold=30, slope_threshold=0.2, area_threshold=0.28):
+    """Append ``box`` to the closest group in ``box_list`` or start a new group.
+
+    Each group is compared through its *last* box:
+      * ``'h'`` -- difference of centroid y, must be below ``shift_threshold``;
+      * ``'v'`` -- difference of centroid x, must be below ``shift_threshold``
+        and below ``slope_threshold`` times the difference of centroid y;
+      * ``'s'`` -- relative area difference, must be below ``area_threshold``.
+    The group with the smallest difference wins; when none qualifies the box
+    forms a new group. With any other ``method`` a non-empty ``box_list`` is
+    left untouched.
+
+    :param box: ``[xmin, ymin, xmax, ymax, [cx, cy], area]``.
+    :param box_list: list of groups (lists of boxes); modified in place.
+    :return: the same ``box_list`` object.
+    """
+    if len(box_list) == 0:
+        box_list.append([box])
+        return box_list
+    if method not in ('h', 'v', 's'):
+        return box_list
+
+    chosen = -1
+    best = area_threshold if method == 's' else shift_threshold
+    for position, group in enumerate(box_list):
+        tail = group[-1]
+        if method == 'h':
+            gap = abs(box[4][1] - tail[4][1])
+        elif method == 'v':
+            gap = abs(box[4][0] - tail[4][0])
+            # Reject candidates that drift sideways too fast for their vertical distance.
+            if not gap < abs(box[4][1] - tail[4][1]) * slope_threshold:
+                continue
+        else:
+            gap = abs((box[-1] - tail[-1]) / tail[-1])
+        if gap < best:
+            best = gap
+            chosen = position
+
+    if chosen >= 0:
+        box_list[chosen].append(box)
+    else:
+        box_list.append([box])
+    return box_list
+
+
+def collect_markers_by_position(boxes, method='h', shift_threshold=30, slope_threshold=0.2, area_threshold=0.28, debug=0):
+    """Group boxes that are close to each other according to ``method``.
+
+    * ``'h'`` -- boxes are visited in order of centroid x and grouped by
+      similar centroid y; groups are ordered by the y of their first box.
+    * ``'v'`` -- boxes are visited in order of centroid y and grouped by
+      similar centroid x; groups are ordered by the x of their first box.
+    * ``'s'`` -- boxes are visited by decreasing area and grouped by similar
+      area; groups are ordered by decreasing area of their first box.
+    Any other ``method`` yields an empty list.
+
+    :param boxes: list of ``[xmin, ymin, xmax, ymax, [cx, cy], area]``; it is
+        sorted in place.
+    :param debug: ``1`` prints the slope between consecutive boxes of each
+        group (``'h'`` and ``'v'`` only).
+    :return: list of groups, each a list of boxes.
+    """
+    # method -> (ordering of the boxes, ordering of the groups, descending?)
+    orderings = {
+        'h': (lambda b: b[4][0], lambda g: g[0][4][1], False),
+        'v': (lambda b: b[4][1], lambda g: g[0][4][0], False),
+        's': (lambda b: b[-1], lambda g: g[0][-1], True),
+    }
+
+    box_list = []
+    if method in orderings:
+        box_key, group_key, descending = orderings[method]
+        boxes.sort(key=box_key, reverse=descending)
+        for candidate in boxes:
+            box_list = find_box_list_by_position(candidate, box_list, method=method,
+                                                 shift_threshold=shift_threshold,
+                                                 slope_threshold=slope_threshold,
+                                                 area_threshold=area_threshold)
+        box_list.sort(key=group_key, reverse=descending)
+
+    if debug == 1:
+        print('box list slope')
+        if method == 'h':
+            for group in box_list:
+                for current, following in zip(group, group[1:]):
+                    print((following[4][1] - current[4][1])/(following[4][0] - current[4][0]))
+        elif method == 'v':
+            for group in box_list:
+                for current, following in zip(group, group[1:]):
+                    print((following[4][0] - current[4][0]) / (following[4][1] - current[4][1]))
+
+    return box_list
+
+
+def check_with_anchor(problem_markers, top_anchors, page_width, column_num):
+    """Drop marker pairs that lie too far from the columns implied by the top anchors.
+
+    The check only runs when there are exactly ``column_num + 1`` top anchors.
+    The expected left edge of every column is derived from the anchor x
+    positions; a (left, right) marker pair is discarded when its left marker is
+    more than 100 px from that edge or its right marker is more than 100 px
+    from that edge plus ``page_width``.
+
+    :param problem_markers: one flat list per column, laid out as
+        ``[left0, right0, left1, right1, ...]``; entries are replaced in place.
+    :param top_anchors: anchor boxes with the centroid at index 4.
+    :param page_width: width of one column.
+    :param column_num: number of columns.
+    :return: the same ``problem_markers`` object.
+    """
+    min_shift = 100
+
+    if len(top_anchors) != column_num + 1:
+        return problem_markers
+
+    column_pos = [top_anchors[0][4][0]]
+    first_gap = top_anchors[1][4][0] - top_anchors[0][4][0]
+    last_gap = top_anchors[-1][4][0] - top_anchors[-2][4][0]
+    if first_gap < last_gap:
+        column_pos.extend(top_anchors[k][4][0] - page_width for k in range(2, column_num+1))
+    else:
+        column_pos.extend(top_anchors[k][4][0] for k in range(1, column_num))
+
+    for column, markers in enumerate(problem_markers):
+        rejected = []
+        for pair_no in range(len(markers)//2):
+            left_pos, right_pos = 2*pair_no, 2*pair_no+1
+            left_ok_and_right_ok = \
+                abs(markers[left_pos][4][0]-column_pos[column]) <= min_shift and \
+                abs(markers[right_pos][4][0]-column_pos[column]-page_width) <= min_shift
+            if not left_ok_and_right_ok:
+                rejected.extend([left_pos, right_pos])
+        problem_markers[column] = [marker for position, marker in enumerate(markers)
+                                   if position not in rejected]
+
+    return problem_markers
+
+
+def remove_abnormal_marker(markers, page_width, debug=0):
+    """Remove abnormal marker pairs from ``markers``.
+
+    ``markers`` is a flat list ``[left0, right0, left1, right1, ...]`` whose
+    entries carry their centroid ``[x, y]`` at index 4. Three filters run in
+    sequence, each rebuilding ``markers`` without the rejected indices:
+
+    1. pairs whose right markers (almost) coincide -- only the pair closest to
+       an ideal pair (horizontal distance ``page_width``, no vertical shift)
+       is kept;
+    2. with at least three pairs, pairs that make the left or right column of
+       markers deviate sideways (|dx/dy| above ``min_slope``);
+    3. with at least two pairs, statistical outliers (more than ``max_std``
+       standard deviations) in left x, vertical shift or pair distance; with a
+       single pair, the pair is dropped when it is too far from an ideal pair.
+
+    :param markers: flat list of paired marker boxes.
+    :param page_width: expected horizontal distance between the two markers of a pair.
+    :param debug: ``1`` prints the statistics of filter 3, ``2`` prints the page widths.
+    :return: ``(markers, new_page_width)`` where ``new_page_width`` is the x
+        distance of the first remaining pair, or ``page_width`` if none is left.
+    """
+    error = 10              # max Manhattan distance for two right markers to count as the same
+    max_std = 3             # outlier limit, in standard deviations
+    min_std = 0.1           # below this spread the corresponding outlier test is skipped
+    min_area_ratio = 0.9    # not used by the active code
+    min_distance = 60       # max deviation tolerated for a single remaining pair
+    min_slope = 0.2         # |dx/dy| above this marks a segment as deviating
+    distance_list = []
+    remove_list = []
+
+    # Filter 1: resolve pairs that share (nearly) the same right marker.
+    for i in range(len(markers)//2-1):
+        min_flag = i
+        # Deviation of pair i from an ideal pair: |dx - page_width| + |dy|.
+        distance_flag = abs(markers[2*i+1][4][0] - markers[2*i][4][0] - page_width) + abs(markers[2*i+1][4][1] -
+                                                                                          markers[2*i][4][1])
+        for j in range(i+1, len(markers)//2):
+            if abs(markers[2*i+1][4][0] - markers[2*j+1][4][0]) + abs(markers[2*i+1][4][1] - markers[2*j+1][4][1]) \
+                    < error:
+                if distance_flag < abs(markers[2*j+1][4][0] - markers[2*j][4][0] - page_width) + \
+                        abs(markers[2*j+1][4][1] - markers[2*j][4][1]):
+                    # Pair j is worse than the best seen so far: drop it.
+                    remove_list.extend([2*j, 2*j+1])
+                else:
+                    # Pair j is at least as good: it becomes the reference and the old one is dropped.
+                    distance_flag = abs(markers[2*j+1][4][0] - markers[2*j][4][0] - page_width) + \
+                                    abs(markers[2*j+1][4][1] - markers[2*j][4][1])
+                    remove_list.extend([2*min_flag, 2*min_flag+1])
+                    min_flag = j
+    markers = [markers[i] for i in range(len(markers)) if i not in remove_list]
+
+    # Filter 2: sideways deviation between consecutive pairs (needs >= 3 pairs).
+    remove_list = []
+    if len(markers) >= 6:
+        # |dx/dy| between consecutive left markers, then between consecutive right markers.
+        # NOTE(review): two consecutive pairs with the same centroid y make the divisor
+        # zero -- confirm that callers never pass such pairs.
+        left_slope_list = np.asarray([abs((markers[2 * i][4][0] - markers[2 * i + 2][4][0])
+                                          / (markers[2 * i][4][1] - markers[2 * i + 2][4][1]))
+                                      for i in range(len(markers)//2-1)])
+        right_slope_list = np.asarray([abs((markers[2 * i + 1][4][0] - markers[2 * i + 3][4][0]) /
+                                           (markers[2 * i + 1][4][1] - markers[2 * i + 3][4][1]))
+                                       for i in range(len(markers) // 2 - 1)])
+        # From here on the lists are boolean: True where segment i -> i+1 deviates.
+        left_slope_list = left_slope_list > min_slope
+        right_slope_list = right_slope_list > min_slope
+        for i in range(len(left_slope_list)):
+            if left_slope_list[i]:
+                if i == len(left_slope_list) - 1:
+                    # Last segment deviates but the previous one does not: drop the last pair.
+                    if not left_slope_list[i-1]:
+                        remove_list.extend([2*(i+1), 2*(i+1)+1])
+                elif left_slope_list[i+1]:
+                    # Both segments around pair i+1 deviate: drop pair i+1.
+                    remove_list.extend([2*(i+1), 2*(i+1)+1])
+                elif not left_slope_list[i+1]:
+                    # Only this segment deviates: drop pair i.
+                    remove_list.extend([2*i, 2*i+1])
+        # Same rules applied to the right markers.
+        for i in range(len(right_slope_list)):
+            if right_slope_list[i]:
+                if i == len(right_slope_list) - 1:
+                    if not right_slope_list[i-1]:
+                        remove_list.extend([2*(i+1), 2*(i+1)+1])
+                elif right_slope_list[i+1]:
+                    remove_list.extend([2*(i+1), 2*(i+1)+1])
+                elif not right_slope_list[i+1]:
+                    remove_list.extend([2*i, 2*i+1])
+
+        markers = [markers[i] for i in range(len(markers)) if i not in set(remove_list)]
+
+    # Filter 3: statistical outliers (or a plausibility check for a single pair).
+    remove_list = []
+    if len(markers) >= 2:
+        left_x_list = np.asarray([markers[2*i][4][0] for i in range(len(markers)//2)])
+        left_y_list = np.asarray([markers[2*i][4][1] for i in range(len(markers)//2)])
+        right_x_list = np.asarray([markers[2*i+1][4][0] for i in range(len(markers)//2)])
+        right_y_list = np.asarray([markers[2*i+1][4][1] for i in range(len(markers)//2)])
+        distance_list = right_x_list - left_x_list   # horizontal distance inside each pair
+        shift_list = right_y_list - left_y_list      # vertical offset inside each pair
+
+        left_x_mean = left_x_list.mean()
+        distance_mean = distance_list.mean()
+        shift_mean = shift_list.mean()
+        left_x_std = left_x_list.std()
+        distance_std = distance_list.std()
+        shift_std = shift_list.std()
+        if len(markers) >= 4:
+            for i in range(len(markers)//2):
+                # A pair is dropped on the first statistic in which it is an outlier.
+                if left_x_std > min_std and abs(left_x_list[i] - left_x_mean) / left_x_std > max_std:
+                    remove_list.extend([2*i, 2*i+1])
+                elif shift_std > min_std and abs(shift_list[i] - shift_mean) / shift_std > max_std:
+                    remove_list.extend([2 * i, 2 * i + 1])
+                elif distance_std > min_std and abs(distance_list[i] - distance_mean) / distance_std > max_std:
+                    remove_list.extend([2 * i, 2 * i + 1])
+        elif len(markers) == 2:
+            # area_ratio_list = np.asarray([m[-1]/((m[2]-m[0])*(m[3]-m[1])) for m in markers])
+            # Single pair: the arrays hold one element, so the comparison yields one truth value.
+            if abs(distance_list-page_width) + abs(shift_list) > min_distance:
+                remove_list.extend([0, 1])
+    markers = [markers[i] for i in range(len(markers)) if i not in remove_list]
+
+    # The first remaining pair defines the updated page width.
+    if len(markers) >= 2:
+        new_page_width = markers[1][4][0] - markers[0][4][0]
+    else:
+        new_page_width = page_width
+
+    if debug == 1:
+        print(len(markers))
+        # The statistics printed below were computed before the last filtering step.
+        if len(markers) >= 4:
+            print('left', left_x_mean, left_x_std)
+            print(left_x_list)
+            for i, x in enumerate(left_x_list):
+                delta = abs(x - left_x_mean) / left_x_std
+                print(delta, left_y_list[i])
+            print('shift', shift_mean, shift_std)
+            print(shift_list)
+            for i, shift in enumerate(shift_list):
+                delta = abs(shift - shift_mean) / shift_std
+                print(delta, left_y_list[i])
+            print('distance', distance_mean, distance_std)
+            print(distance_list)
+            for i, distance in enumerate(distance_list):
+                delta = abs(distance - distance_mean) / distance_std
+                print(delta, left_y_list[i])
+            print('total')
+            for i in range(len(left_x_list)):
+                delta = abs(left_x_list[i] - left_x_mean) / left_x_std + abs(shift_list[i] - shift_mean) / shift_std \
+                        + abs(distance_list[i] - distance_mean) / distance_std
+                d = abs(left_x_list[i] - left_x_mean) + abs(shift_list[i] - shift_mean) + \
+                    abs(distance_list[i] - distance_mean)
+                print(delta, d, left_y_list[i])
+    if debug == 2:
+        if len(markers) >= 2:
+            print('page width', page_width, 'new page width:', new_page_width,
+                  'distant difference:', abs(new_page_width - page_width) + abs(markers[1][4][1]-markers[0][4][1]))
+
+    return markers, new_page_width
+
+
+def draw_box(image, boxes, color=(0, 255, 0), debug=0):
+    """Draw every non-empty box of ``boxes`` on ``image`` as a 5-px rectangle.
+
+    :param image: image drawn on in place with ``cv2.rectangle``.
+    :param boxes: iterable of boxes whose first four entries are
+        ``xmin, ymin, xmax, ymax``; empty boxes are skipped.
+    :param color: rectangle colour.
+    :param debug: ``1`` additionally prints the geometry of each box that has
+        at least four entries (more detail when it has more than four).
+    """
+    for box in boxes:
+        if len(box) == 0:
+            continue
+        cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), color, 5)
+
+    if debug != 1:
+        return
+
+    for box in boxes:
+        field_count = len(box)
+        if field_count < 4:
+            continue
+        width = box[2] - box[0]
+        height = box[3] - box[1]
+        w_to_h = width / height
+        if field_count == 4:
+            print('width:{}, height:{}, w_to_h:{}, top_left:{}, bottom_right:{},'.
+                  format(width, height, w_to_h, box[0:2], box[2:4]))
+        else:
+            position_ratio = box[4][0] / image.shape[1]
+            area = box[-1]
+            centroid = box[4]
+            area_ratio = area / (width * height)
+            print('width:{}, height:{}, centroid:{}, position ratio:{}, w_to_h:{}, area:{}, area ratio:{}'.
+                  format(width, height, centroid, position_ratio, w_to_h, area, area_ratio))
+
+
+def find_pair(marker, boxes, page_width, threshold=100):
+    """Find the box that best pairs with ``marker`` one ``page_width`` away.
+
+    With a positive ``page_width`` the partner is searched to the right of the
+    marker, with a negative one to the left. The cost of a candidate is
+    ``|dy| + |dx - page_width|`` between the centroids (index 4 of a box); the
+    candidate with the smallest cost strictly below ``threshold`` wins.
+
+    :return: ``(box, index, cost)`` of the best candidate, or
+        ``([], -1, threshold)`` when no candidate qualifies.
+    """
+    best_index = -1
+    best_cost = threshold
+    for position, candidate in enumerate(boxes):
+        cost = abs(marker[4][1] - candidate[4][1]) + abs(candidate[4][0] - marker[4][0] - page_width)
+        # best_cost never exceeds threshold, so one comparison covers both limits.
+        if cost < best_cost:
+            best_cost = cost
+            best_index = position
+
+    if best_index < 0:
+        return [], best_index, best_cost
+    return boxes[best_index], best_index, best_cost
+
+
+def find_pair_list(marker_list, all_list, page_width, horizontal_threshold=100, debug=0):
+    """Find the list in ``all_list`` that pairs best with ``marker_list``.
+
+    Every list of ``all_list`` is scored by how many markers of
+    ``marker_list`` have a partner in it (see ``find_pair``); ties are broken
+    by the smaller mean pairing cost.
+
+    :param marker_list: markers to pair.
+    :param all_list: candidate lists of boxes.
+    :param page_width: expected signed x distance between partners.
+    :param horizontal_threshold: pairing threshold forwarded to ``find_pair``.
+    :param debug: ``1`` prints the measured page width when a list was found.
+    :return: ``(pair_list, index, mean_cost)``. When no list contains any
+        partner the result is ``([], -1, horizontal_threshold)``; earlier
+        versions returned ``all_list[-1]`` in that case (and raised
+        ``IndexError`` for an empty ``all_list``).
+    """
+    max_count = 0
+    index_flag = -1
+    min_distance = horizontal_threshold
+    for index, candidates in enumerate(all_list):
+        count = 0
+        distance = 0
+        for m in marker_list:
+            # One lookup per marker; its index and cost are both needed.
+            _, pair_index, pair_distance = find_pair(m, candidates, page_width, horizontal_threshold)
+            if pair_index >= 0:
+                count += 1
+                distance += pair_distance
+        if count > max_count:
+            max_count = count
+            index_flag = index
+            min_distance = distance / count
+        elif count == max_count and count > 0:
+            distance /= count
+            if distance < min_distance:
+                min_distance = distance
+                index_flag = index
+    if debug == 1:
+        if index_flag >= 0:
+            print('page width:', abs(marker_list[0][4][0] - all_list[index_flag][0][4][0]), 'anchor width:', page_width)
+
+    # Do not let the "not found" index -1 silently select the last list.
+    pair_list = all_list[index_flag] if index_flag >= 0 else []
+    return pair_list, index_flag, min_distance
+
+
+def find_column(anchors, width, column_num=2, debug=0):
+    """Determine the column count, the column width and the first/last column positions.
+
+    The distance between each pair of neighbouring top anchors -- as is, halved
+    and divided by three -- is compared with the default two-column width
+    (42 % of ``width``) and three-column width (29 % of ``width``). The first
+    candidate within 80 px of a default sets ``column_num``; the smallest
+    accepted candidate becomes the column width. Without a usable anchor
+    distance the default width for ``column_num`` is used.
+
+    :param anchors: sequence whose first two items are the top and bottom
+        anchor lists (boxes with the centroid at index 4).
+    :param width: image width in pixels.
+    :param column_num: column count assumed when the anchors do not decide it.
+    :param debug: ``1`` prints the anchors and the result.
+    :return: ``(page_width, column_num, column_pos)``.
+    """
+    double_page_width_ratio = 0.42  # default width ratio of one column on a two-column page
+    three_page_width_ratio = 0.29  # default width ratio of one column on a three-column page
+    double_page_separation = 250  # default gap between the columns of a two-column page
+    three_page_separation = 100  # default gap between the columns of a three-column page
+    horizontal_threshold = 80  # tolerance when matching a measured width against a default
+
+    top_anchors, bottom_anchors = anchors[:2]
+    double_target = width * double_page_width_ratio
+    three_target = width * three_page_width_ratio
+
+    page_width = width
+    if len(top_anchors) >= 2:
+        for left_anchor, right_anchor in zip(top_anchors, top_anchors[1:]):
+            gap = right_anchor[4][0] - left_anchor[4][0]
+            # Try the gap as one, two and three column widths; the first match wins.
+            for candidate in (gap, gap // 2, gap // 3):
+                if abs(candidate - double_target) < horizontal_threshold:
+                    matched_columns = 2
+                elif abs(candidate - three_target) < horizontal_threshold:
+                    matched_columns = 3
+                else:
+                    continue
+                column_num = matched_columns
+                if candidate < page_width:
+                    page_width = candidate
+                break
+
+    if page_width == width:
+        # No suitable anchor distance was found: fall back to the default width.
+        if column_num == 2:
+            page_width = int(width * double_page_width_ratio)
+        elif column_num == 3:
+            page_width = int(width * three_page_width_ratio)
+
+    # Positions of the first and of the last column.
+    column_pos = []
+    if len(top_anchors) >= 1:
+        first_x = top_anchors[0][4][0]
+        last_x = top_anchors[-1][4][0]
+        for step in range(4):
+            if first_x - (step + 1) * page_width < 0:
+                column_pos.append(first_x - step * page_width)
+                break
+        for step in range(4):
+            if last_x + (step + 1) * page_width > width:
+                column_pos.append(last_x + (step - 1) * page_width)
+                break
+    elif len(bottom_anchors) == 2:
+        column_pos = [bottom_anchors[0][4][0], bottom_anchors[-1][4][0] - page_width]
+    elif column_num == 2:
+        column_pos = [(width - double_page_separation) // 2 - page_width, (width + double_page_separation) // 2]
+    elif column_num == 3:
+        column_pos = [width // 2 - three_page_separation - page_width * 3 // 2,
+                      width // 2 + three_page_separation + page_width // 2]
+
+    if debug == 1:
+        print('top anchors')
+        for anchor in top_anchors:
+            print(anchor[4])
+        print('bottom anchors')
+        for anchor in bottom_anchors:
+            print(anchor[4])
+        print('page width:', page_width, 'column number:', column_num, 'column position:', column_pos)
+
+    return page_width, column_num, column_pos

+ 3 - 0
segment/sheet_resolve/analysis/choice/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py
+# @Time    : 2018/11/21 0021 下午 16:01

+ 95 - 0
segment/sheet_resolve/analysis/choice/analysis_choice.py

@@ -0,0 +1,95 @@
+# @Author  : lightXu
+# @File    : analysis_choice.py
+import time
+
+import numpy as np
+
+from segment.sheet_resolve.lib.model.test import im_detect
+from segment.sheet_resolve.lib.model.nms_wrapper import nms
+from segment.sheet_resolve.lib.utils.timer import Timer
+from segment.sheet_resolve.tools import utils
+
+
+def analysis_single_image_with_regions(analysis_type, classes, sess, net,
+                                       im, conf_thresh, mns_thresh,
+                                       coordinate_bias_dict):
+    """Run the detector on one image and collect the confident boxes per class.
+
+    The image is resized with ``utils.img_resize`` before detection; every kept
+    box is scaled back with the returned ratio, shifted by the optional
+    per-class bias and clamped into the original image.
+
+    :param analysis_type: forwarded to ``utils.img_resize`` and ``im_detect``
+    :param classes: class names; index 0 (background) is skipped
+    :param sess: forwarded to ``im_detect``
+    :param net: forwarded to ``im_detect``
+    :param im: image array; ``im.shape[0]`` / ``im.shape[1]`` bound the boxes
+    :param conf_thresh: minimum score for a detection to be reported
+    :param mns_thresh: threshold handed to ``nms``
+    :param coordinate_bias_dict: ``{class_name: {'xmin_bias', 'ymin_bias',
+        'xmax_bias', 'ymax_bias'}}``; classes without an entry get no bias
+    :return: ``(content_list, analysis_cls_list)`` with one region dict and
+        one class name per reported detection
+    """
+    original_shape = im.shape
+
+    # Time the resize + forward pass only.
+    stopwatch = Timer()
+    stopwatch.tic()
+    im, radio = utils.img_resize(analysis_type, im)
+    scores, boxes = im_detect(analysis_type, sess, net, im)
+    stopwatch.toc()
+    print('Detection took {:.3f}s for {:d} object proposals'.format(stopwatch.total_time, boxes.shape[0]))
+
+    content_list = []
+    analysis_cls_list = []
+    # start=1 because the background class at index 0 is skipped
+    for class_index, class_name in enumerate(classes[1:], start=1):
+        class_boxes = boxes[:, 4 * class_index:4 * (class_index + 1)]
+        class_scores = scores[:, class_index]
+        candidates = np.hstack((class_boxes,
+                                class_scores[:, np.newaxis])).astype(np.float32)
+        candidates = candidates[nms(candidates, mns_thresh), :]
+        confident = np.where(candidates[:, -1] >= conf_thresh)[0]
+        if len(confident) == 0:
+            continue
+
+        if class_name in list(coordinate_bias_dict.keys()):
+            class_bias = coordinate_bias_dict[class_name]
+            xmin_bias = class_bias['xmin_bias']
+            ymin_bias = class_bias['ymin_bias']
+            xmax_bias = class_bias['xmax_bias']
+            ymax_bias = class_bias['ymax_bias']
+        else:
+            xmin_bias = ymin_bias = xmax_bias = ymax_bias = 0
+
+        for row in confident:
+            bbox = candidates[row, :4]
+            score = '{:.4f}'.format(candidates[row, -1])
+
+            # back to original-image coordinates, then apply the bias
+            xmin = int(int(bbox[0]) * radio[0]) + xmin_bias
+            ymin = int(int(bbox[1]) * radio[1]) + ymin_bias
+            xmax = int(int(bbox[2]) * radio[0]) + xmax_bias
+            ymax = int(int(bbox[3]) * radio[1]) + ymax_bias
+
+            # clamp into the image (lower bound 1, upper bound size - 1)
+            if not xmin > 0:
+                xmin = 1
+            if not ymin > 0:
+                ymin = 1
+            if not xmax < original_shape[1]:
+                xmax = original_shape[1] - 1
+            if not ymax < original_shape[0]:
+                ymax = original_shape[0] - 1
+            x_center = int(xmin + (xmax - xmin) / 2)
+            y_center = int(ymin + (ymax - ymin) / 2)
+
+            content_list.append({
+                "class_name": class_name,
+                "bounding_box": {"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax,
+                                 'x_center': x_center, 'y_center': y_center},
+                "score": score,
+                'x_center': x_center,
+                'y_center': y_center,
+            })
+            analysis_cls_list.append(class_name)
+
+    return content_list, analysis_cls_list
+
+
+def get_single_image_sheet_regions(analysis_type, im, classes,
+                                   sess, net, conf_thresh, mns_thresh,
+                                   coordinate_bias_dict):
+    """Detect the regions of one image and wrap them in the result dict.
+
+    Delegates to ``analysis_single_image_with_regions`` and prints the elapsed
+    wall-clock time.
+
+    :return: ``{"img_name": 'choice_m', 'analysis_type': ..., "regions": [...]}``
+    """
+    started_at = time.time()
+
+    regions, _ = analysis_single_image_with_regions(analysis_type, classes,
+                                                    sess, net,
+                                                    im, conf_thresh, mns_thresh,
+                                                    coordinate_bias_dict)
+
+    result = {
+        "img_name": 'choice_m',
+        'analysis_type': analysis_type,
+        "regions": regions,
+    }
+
+    print(time.time() - started_at)
+
+    return result

+ 490 - 0
segment/sheet_resolve/analysis/choice/choice_box.py

@@ -0,0 +1,490 @@
+# @Author  : lightXu
+# @File    : choice_box.py
+# @Time    : 2018/11/22 0022 下午 16:01
+import re
+import time
+import xml.etree.cElementTree as ET
+
+import cv2
+import numpy as np
+
+from segment.sheet_resolve.analysis.choice.choice_m_row_column import get_choice_m_row_and_col
+from segment.sheet_resolve.tools import utils
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate
+
+
+def get_interval(word_result_list):
+    """Estimate the horizontal gap between option brackets from OCR output.
+
+    Two patterns are measured on the concatenated OCR text: ``][`` (gap between
+    two option boxes) and ``[A``..``[D`` (gap between a bracket and its letter).
+    For every match whose first and last characters sit on the same text line,
+    the distance between the right edge of the first character and the left
+    edge of the last one is averaged. The result is
+    ``(avg1 + avg2) * 2 // 3``.
+
+    The text index is used as an index into the location list, which assumes
+    every ``ele['char']`` is exactly one character long.
+
+    :param word_result_list: list of dicts with a ``'chars'`` list; each char
+        is ``{'char': str, 'location': {'left', 'top', 'width', 'height'}}``
+    :return: the estimated interval
+    :raises ZeroDivisionError: when one of the patterns yields no measurable
+        gap (unchanged from the previous behaviour, callers fall back on it)
+    """
+    chars = [ele for chars_dict in word_result_list for ele in chars_dict['chars']]
+    all_char_str = ''.join(ele['char'] for ele in chars)
+    location = [ele['location'] for ele in chars]
+
+    pattern1 = re.compile(r"\]\[")
+    pattern2 = re.compile(r"\[[ABCD]")
+
+    def intervel(pattern):
+        sum_intervel = 0
+        size = 0
+        for match in pattern.finditer(all_char_str):
+            first = match.start()
+            # Last character *inside* the match. The previous code used
+            # match.end() for the same-line test, i.e. the character after the
+            # match, which is out of range when the match ends the text.
+            last = match.end() - 1
+            left_x = location[first]['left'] + location[first]['width']
+            right_x = location[last]['left']
+            same_line = abs(location[first]['top'] - location[last]['top']) < location[first]['height']
+            if same_line and right_x - left_x > 0:
+                sum_intervel += right_x - left_x
+                size += 1
+
+        return sum_intervel // size
+
+    intervel_width1 = intervel(pattern1)
+    intervel_width2 = intervel(pattern2)
+
+    return (intervel_width1 + intervel_width2) * 2 // 3
+
+
+def preprocess(image0, xe, ye):
+    """Binarise a BGR image and dilate it so neighbouring strokes merge.
+
+    Steps: grayscale conversion, inverted Otsu threshold (ink becomes white),
+    then one dilation with a ``ye`` x ``xe`` rectangular kernel.
+
+    :param image0: BGR image accepted by ``cv2.cvtColor(..., COLOR_BGR2GRAY)``
+    :param xe: kernel width (dilation along x)
+    :param ye: kernel height (dilation along y)
+    :return: the dilated binary image
+    """
+    gray = cv2.cvtColor(image0, cv2.COLOR_BGR2GRAY)
+
+    # Otsu picks the threshold itself, the 0 passed here is ignored
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+    structuring_element = np.ones((ye, xe), np.uint8)  # rows = y, columns = x
+    return cv2.dilate(binary, structuring_element, iterations=1)
+
+
+def contours(image):
+    """Return the bounding boxes of the external contours of a binary image.
+
+    :param image: single-channel binary image passed to ``cv2.findContours``
+    :return: list of ``(xmin, ymin, xmax, ymax)`` tuples, in the reverse of
+        the order OpenCV reports the contours
+    """
+    # findContours returns (image, contours, hierarchy) on OpenCV 3.x but
+    # (contours, hierarchy) on 2.x / 4.x; the contour list is always the
+    # second-to-last item.
+    cnts = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
+
+    bboxes = []
+    for cnt in reversed(cnts):
+        x, y, w, h = cv2.boundingRect(cnt)
+        bboxes.append((x, y, x + w, y + h))
+
+    return bboxes
+
+
+def box_coordinates(img):
+    """Split a binary image into a grid of boxes using its projections.
+
+    Rows and columns whose pixel sum exceeds 255 count as "ink". The indices
+    where the projection switches between blank and ink give the break points
+    on each axis; every (row band, column band) pair becomes one box.
+
+    :param img: 2-D binary image (ink = 255)
+    :return: list of ``[xmin, ymin, xmax, ymax]``, row bands outermost
+    """
+    pixels = np.asarray(img)
+
+    def switch_points(arr, tolerance_number, axis):
+        projection = arr.sum(axis=axis)
+        # ink when the sum is above the tolerance (a sum of exactly 1 also
+        # counts, as it did before); everything else is blank
+        ink = (projection > 255 * tolerance_number) | (projection == 1)
+        flags = [1 if flag else 0 for flag in ink]
+        flags.append(0)  # closes a band that runs to the image edge
+
+        points = []
+        wanted = 1  # alternate: next ink start, then next blank start
+        for position, flag in enumerate(flags):
+            if flag == wanted:
+                points.append(position)
+                wanted = 1 - wanted
+        return points
+
+    row_points = switch_points(pixels, 1, axis=1)
+    col_points = switch_points(pixels, 1, axis=0)
+
+    return [[col_points[c], row_points[r], col_points[c + 1], row_points[r + 1]]
+            for r in range(0, len(row_points), 2)
+            for c in range(0, len(col_points), 2)]
+
+
+def get_choice_box_coordinate(word_result_list, choice_img, cv_box_list, choice_bbox_list):
+    """Attach a question number and the option blocks to every detected region.
+
+    1. Collect the OCR characters that contain a digit and merge neighbouring
+       digits on the same line into multi-digit numbers.
+    2. For every region in ``choice_bbox_list['regions']`` look for a number
+       lying inside it; when found, the region is cut just after the number
+       (to the right for wide regions, below for tall ones).
+    3. Group the cv boxes contained in the (cut) region with
+       ``utils.box_by_x_intervel`` and store them under ``'abcd'``.
+
+    Side effect: the ``bounding_box`` dicts inside ``choice_bbox_list`` are
+    modified in place when a number is found.
+
+    :param word_result_list: OCR result, list of ``{'chars': [{'char',
+        'location': {'left', 'top', 'width', 'height'}}]}``
+    :param choice_img: unused, kept for interface compatibility
+    :param cv_box_list: candidate boxes ``[xmin, ymin, xmax, ymax]``
+    :param choice_bbox_list: ``{'regions': [{'bounding_box': {...}}, ...]}``
+    :return: list of ``{'number', 'location', 'abcd'}``; ``number`` is 99 when
+        no number was matched, ``abcd`` is ``[]`` when the region holds no box
+    """
+    digital_model = re.compile(r'\d')
+    all_digital_list = [ele
+                        for chars_dict in word_result_list
+                        for ele in chars_dict['chars']
+                        if digital_model.search(ele['char'])]
+
+    new_all_digital_list = []
+    i = 1
+    while i <= len(all_digital_list):
+        pre_one = all_digital_list[i - 1]
+        if i == len(all_digital_list):
+            new_all_digital_list.append(pre_one)
+            break
+        rear_one = all_digital_list[i]
+        # vertical offset smaller than one character height
+        condition1 = abs(pre_one['location']['top'] - rear_one['location']['top']) < pre_one['location'][
+            'height']
+        # next character starts within twice the width of the current one
+        condition2 = pre_one['location']['left'] + 2 * pre_one['location']['width'] > rear_one['location'][
+            'left']
+        if condition1 and condition2:
+            new_char = pre_one['char'] + rear_one['char']
+            new_location = {'left': pre_one['location']['left'],
+                            'top': min(pre_one['location']['top'], rear_one['location']['top']),
+                            'width': rear_one['location']['left'] + rear_one['location']['width'] -
+                                     pre_one['location']['left'],
+                            'height': max(pre_one['location']['height'], rear_one['location']['height'])}
+            new_all_digital_list.append({'char': new_char, 'location': new_location})
+            i = i + 2  # both characters consumed
+        else:
+            new_all_digital_list.append(pre_one)
+            i = i + 1
+
+    content_list = list()
+    for index, box in enumerate(choice_bbox_list['regions']):  # match detector boxes with numbers
+        box = box['bounding_box']
+        box_coordinate = (box['xmin'], box['ymin'], box['xmax'], box['ymax'])
+        horizontal = box['xmax'] - box['xmin'] >= box['ymax'] - box['ymin']
+        vertical = not horizontal
+        content_list.append({'number': 99, 'location': box_coordinate})
+        for digital in new_all_digital_list:
+            digital_coordiante = (digital['location']['left'], digital['location']['top'],
+                                  digital['location']['left'] + digital['location']['width'],
+                                  digital['location']['top'] + digital['location']['height'])
+
+            if utils.decide_coordinate_contains(digital_coordiante, box_coordinate):
+                if horizontal:
+                    box['xmin'] = digital['location']['left'] + digital['location']['width'] + 1  # cut after the number
+                if vertical:
+                    box['ymin'] = digital['location']['top'] + digital['location']['height'] + 1
+
+                box_coordinate = (box['xmin'], box['ymin'], box['xmax'], box['ymax'])
+                content_list[index]['number'] = digital['char']
+                content_list[index]['location'] = box_coordinate
+                break
+
+    for box in content_list:
+        box_coordinate = (box['location'][0], box['location'][1], box['location'][2], box['location'][3])
+        # boxes outside every detected region are dropped
+        mtx = [cv_box for cv_box in cv_box_list
+               if utils.decide_coordinate_contains(cv_box, box_coordinate)]
+
+        if not mtx:
+            # Nothing inside this region: the slicing below would fail on an
+            # empty array, so report an empty option list instead.
+            box['abcd'] = []
+            continue
+
+        matrix = np.asarray(sorted(mtx))
+        dif = matrix[1:, 0] - matrix[:-1, 2]  # left of the next box minus right of the previous one
+        dif[dif < 0] = 0
+        # NOTE(review): with a single box `dif` is empty and the mean is NaN;
+        # it is passed through unchanged, confirm box_by_x_intervel copes.
+        dif_length = np.mean(dif)  # boxes closer than the mean gap are merged
+        box['abcd'] = utils.box_by_x_intervel(matrix, dif_length)
+
+    return content_list
+
+
+def choice(left, top, image, choice_bbox_list, xml_path):
+    """Label every option block of a choice area and record it in the XML file.
+
+    The area is OCR-ed, binarised and split into boxes; the boxes are matched
+    to the detected regions by ``get_choice_box_coordinate``. Every block
+    larger than 400 px^2 gets the name ``<number>_<letter>`` and is written to
+    the XML tree at ``xml_path`` (coordinates shifted by ``left`` / ``top``).
+
+    When no number was matched for a region (number 99) lettering starts at
+    ``'_'``, otherwise at ``'A'``.
+
+    :param left: x offset of ``image`` inside the full sheet
+    :param top: y offset of ``image`` inside the full sheet
+    :param image: BGR image of the choice area
+    :param choice_bbox_list: ``{'regions': [...]}`` from the detector
+    :param xml_path: annotation file, parsed and rewritten in place
+    :return: list of ``{'number': name, 'region': [xmin, ymin, xmax, ymax]}``;
+        empty (and the XML untouched) when there is no region
+    """
+    # The letters after 'M' used to be out of order ('...LMTUNOPQRSVW...').
+    a_z = '_ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    t1 = time.time()
+    word_result_list0 = get_ocr_text_and_coordinate(image, ocr_accuracy='accurate', language_type='ENG')
+    t2 = time.time()
+    print('choice ocr time cost: ', t2 - t1)
+
+    intervel_x = 3
+    img = preprocess(image, intervel_x, 3)
+    cv_box_list0 = box_coordinates(img)
+
+    content_list = get_choice_box_coordinate(word_result_list0, image, cv_box_list0, choice_bbox_list)
+    if not content_list:
+        return []
+
+    tree = ET.parse(xml_path)  # xml tree
+
+    # The orientation of the first region decides the ordering for all of them.
+    w = content_list[0]['location'][2] - content_list[0]['location'][0]
+    h = content_list[0]['location'][3] - content_list[0]['location'][1]
+    horizontal = w >= h
+
+    def labelled_regions(number, sorted_abcd_list, bias):
+        """Yield (name, region) for every block above the minimum area."""
+        ii = 0
+        for choice_bbox in sorted_abcd_list:
+            area = (choice_bbox[2] - choice_bbox[0]) * (choice_bbox[3] - choice_bbox[1])
+            if area > 400:
+                name = '{:02d}_{}'.format(number, a_z[ii + bias])
+                region = [choice_bbox[0] + left, choice_bbox[1] + top,
+                          choice_bbox[2] + left, choice_bbox[3] + top]
+                yield name, region
+                ii += 1
+
+    json_list = []
+    for entry in content_list:
+        number = int(entry['number'])
+        bias = 0 if number == 99 else 1
+        if horizontal:
+            ordered = sorted(entry['abcd'])  # left to right
+        else:
+            ordered = sorted(entry['abcd'], key=lambda x: (x[1], x[0]))  # top to bottom
+
+        for name, region in labelled_regions(number, ordered, bias):
+            tree = utils.create_xml(name, tree, region[0], region[1], region[2], region[3])
+            json_list.append({'number': name, 'region': region})
+
+    tree.write(xml_path)
+    return json_list
+
+
+def _numbers_in_slots(digital_list, axis, origin, span, single_size, count, outer_limit):
+    """Assign OCR numbers to the rows (axis=1) or columns (axis=0) of a choice_m.
+
+    The box side of length ``span`` starting at ``origin`` is divided into
+    ``count`` slots of ``single_size`` separated by an even gap. A number goes
+    to the slot containing its centre along ``axis``, provided its centre on
+    the other axis is below ``outer_limit`` (i.e. it lies outside the box).
+    Slots without a number stay at -1.
+    """
+    # a single slot has no gap; the old expression divided by zero here
+    mean_interval = (span - single_size * count) / (count - 1) if count > 1 else 0
+    bounds = [origin - mean_interval / 2 + (single_size + mean_interval) * k
+              for k in range(count + 1)]
+
+    other = 1 - axis
+    numbers = [-1] * count
+    for slot in range(count):
+        start, end = bounds[slot], bounds[slot + 1]
+        for digital in digital_list:
+            coordinate = digital['coordinate']
+            along = (coordinate[axis + 2] - coordinate[axis]) / 2 + coordinate[axis]
+            across = (coordinate[other + 2] - coordinate[other]) / 2 + coordinate[other]
+            if start <= along <= end and across < outer_limit:
+                numbers[slot] = int(digital['char'])
+    return numbers
+
+
+def get_number_by_enlarge_choice_m(image, choice_m_region_list, xml_path):
+    """Find the question numbers printed next to every choice_m region.
+
+    Each region is analysed by ``get_choice_m_row_and_col`` and enlarged by
+    half its size towards the top-left, where the numbers are expected. The
+    union of the enlarged regions is OCR-ed once; the digits found in an
+    enlarged region but outside the region itself decide the layout:
+
+    * ``'180'`` - numbers stacked vertically, one per row
+    * ``'90'``  - numbers laid out horizontally, one per column
+    * ``'0'``   - no usable numbers (``number`` and ``default_points`` = [-1])
+
+    Finally every region with a non-'0' direction is given the majority
+    direction.
+
+    :param image: full sheet image
+    :param choice_m_region_list: detector regions, each with a ``bounding_box``
+        dict (``xmin``, ``ymin``, ``xmax``, ``ymax``)
+    :param xml_path: unused, kept for interface compatibility
+    :return: list of the dicts produced by ``get_choice_m_row_and_col``
+        extended with ``direction``, ``number``, ``option``, ``default_points``;
+        regions for which it returned nothing are left out
+    """
+    a_z = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+
+    choice_m_dict_list = []  # choice_m region with same index
+    choice_m_enlarge = []
+    left, top, right, bottom = 9999, 9999, 0, 0
+    for region in choice_m_region_list:
+        box = region['bounding_box']
+        m_left, m_top = box['xmin'], box['ymin']
+        width, height = box['xmax'] - box['xmin'], box['ymax'] - box['ymin']
+
+        single_choice_m = utils.crop_region_direct(image, (m_left, m_top, box['xmax'], box['ymax']))
+        row_col_dict = get_choice_m_row_and_col(m_left, m_top, single_choice_m)
+        if not row_col_dict:
+            # No usable detection inside this region; it has no bounding_box
+            # to work with, so it is left out instead of failing later.
+            continue
+        choice_m_dict_list.append(row_col_dict)
+
+        # enlarged choice_m, clamped so a region near the border cannot
+        # produce a negative crop origin
+        box_coordinate_enlarge = (max(0, m_left - int(width / 2)), max(0, m_top - int(height / 2)),
+                                  box['xmax'], box['ymax'])
+        choice_m_enlarge.append(box_coordinate_enlarge)
+        left = min(left, box_coordinate_enlarge[0])
+        top = min(top, box_coordinate_enlarge[1])
+        right = max(right, box_coordinate_enlarge[2])
+        bottom = max(bottom, box_coordinate_enlarge[3])
+
+    if not choice_m_dict_list:
+        return []
+
+    choice_whole_region = utils.crop_region_direct(image, (left, top, right, bottom))
+
+    choice_region_text = get_ocr_text_and_coordinate(choice_whole_region)
+    pattern = re.compile(r'\d')
+    all_digital_list = [ele
+                        for chars_dict in choice_region_text
+                        for ele in chars_dict['chars']
+                        if pattern.search(ele['char'])]
+
+    combined_digital_list = utils.combine_char(all_digital_list)
+    direction_list = []
+    for index, enlarge_box in enumerate(choice_m_enlarge):
+        digital_list = []
+        xmin, ymin, xmax, ymax = 9999, 9999, 0, 0
+
+        choice_m_dict = choice_m_dict_list[index]
+        m_box = choice_m_dict['bounding_box']
+        choice_m_dict_box = (m_box['xmin'], m_box['ymin'], m_box['xmax'], m_box['ymax'])
+
+        for digital_box in combined_digital_list:
+            location = digital_box['location']
+            # OCR coordinates are relative to the cropped union region
+            digital_coordinate = (location['left'] + left,
+                                  location['top'] + top,
+                                  location['left'] + location['width'] + left,
+                                  location['top'] + location['height'] + top)
+            digital_box.update({'coordinate': digital_coordinate})
+            if (utils.decide_coordinate_contains(digital_coordinate, enlarge_box)) and not \
+                    (utils.decide_coordinate_contains(digital_coordinate, choice_m_dict_box)):
+                digital_list.append(digital_box)
+                xmin = min(xmin, digital_coordinate[0])
+                ymin = min(ymin, digital_coordinate[1])
+                xmax = max(xmax, digital_coordinate[2])
+                ymax = max(ymax, digital_coordinate[3])
+
+        digital_list_coordinate = (xmin, ymin, xmax, ymax)
+
+        direction = utils.decide_choice_m_left_top(digital_list_coordinate, choice_m_dict_box)
+        if int(direction):
+            choice_m_dict['direction'] = direction
+            direction_list.append(direction)
+
+            if direction == '180':  # numbers stacked vertically, left of the box
+                std_num_length = choice_m_dict['rows']
+                option_count = choice_m_dict['cols']
+                digital_list.sort(key=lambda k: k.get('coordinate')[1])
+                number_list = _numbers_in_slots(digital_list, 1,
+                                                m_box['ymin'], m_box['ymax'] - m_box['ymin'],
+                                                choice_m_dict['single_height'], std_num_length,
+                                                m_box['xmin'])
+            elif direction == '90':  # numbers in a row, above the box
+                std_num_length = choice_m_dict['cols']
+                # NOTE(review): the option letters are derived from `cols` in
+                # both layouts (unchanged); confirm whether `rows` is meant here.
+                option_count = std_num_length
+                digital_list.sort(key=lambda k: k.get('coordinate')[0])
+                # origin is the box xmin (it used to read ymin) and the slots
+                # now cover every column (one boundary was missing)
+                number_list = _numbers_in_slots(digital_list, 0,
+                                                m_box['xmin'], m_box['xmax'] - m_box['xmin'],
+                                                choice_m_dict['single_width'], std_num_length,
+                                                m_box['ymin'])
+            else:
+                number_list = None
+
+            if number_list is not None:
+                choice_m_dict.update({'option': ','.join(a_z[:option_count]),
+                                      'default_points': [-1] * std_num_length})
+                choice_m_dict['number'] = _infer_number(number_list)
+
+        else:
+            choice_m_dict['direction'] = '0'
+            choice_m_dict['number'] = [-1]
+            choice_m_dict['default_points'] = [-1]
+
+    # majority vote, '180' wins ties (also when nothing was found)
+    count180 = direction_list.count('180')
+    count90 = direction_list.count('90')
+    infer_direction = '90' if count90 > count180 else '180'
+    for ele in choice_m_dict_list:
+        if ele['direction'] != '0':
+            ele.update({'direction': infer_direction})
+
+    return choice_m_dict_list
+
+
+def _infer_number(number_list):
+    """Fill the unknown (-1) entries of a run of consecutive question numbers.
+
+    Unknown entries after a known one continue counting upwards from it;
+    unknown entries before the first known one count downwards from it. The
+    list is modified in place and also returned. A list that is empty, fully
+    known or fully unknown is returned unchanged.
+
+    :param number_list: list of ints where -1 marks "not recognised"
+    :return: the same list object with the gaps filled
+    """
+    if not number_list or all(number == -1 for number in number_list):
+        return number_list
+
+    # Track which slots hold a real value separately from the data, so an
+    # inferred value that happens to be -1 is not mistaken for "unknown".
+    known = [number != -1 for number in number_list]
+
+    # forward: continue from the previous known / already inferred value
+    for index in range(1, len(number_list)):
+        if not known[index] and known[index - 1]:
+            number_list[index] = number_list[index - 1] + 1
+            known[index] = True
+
+    # backward: only leading unknowns are left at this point
+    for index in range(len(number_list) - 2, -1, -1):
+        if not known[index] and known[index + 1]:
+            number_list[index] = number_list[index + 1] - 1
+            known[index] = True
+
+    return number_list

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 256 - 0
segment/sheet_resolve/analysis/choice/choice_line_box.py


+ 211 - 0
segment/sheet_resolve/analysis/choice/choice_m_row_column.py

@@ -0,0 +1,211 @@
+# @Author  : liu fan
+import numpy as np
+import tensorflow as tf
+
+from segment.sheet_resolve.lib.ssd_model.utils import label_map_util, ops as utils_ops
+from segment.sheet_resolve.tools import tf_settings
+
+from segment.sheet_resolve.tools.tf_sess import SsdSess
+from PIL import Image
+
+# The SSD session is created once, when this module is imported, and shared by
+# every call below.
+tf_sess_dict = {
+    'choice_ssd': SsdSess('choice_ssd'),
+}
+
+# Module-level handles read by run_inference_for_single_image().
+choice_ssd_sess = tf_sess_dict['choice_ssd']
+sess = choice_ssd_sess.sess
+detection_graph = choice_ssd_sess.graph
+
+
+def load_image_into_numpy_array(image):
+    """Convert an image to an RGB ``uint8`` array of shape (height, width, 3).
+
+    :param image: object offering ``convert``, ``size`` and ``getdata``
+        (a PIL image in this module)
+    """
+    rgb = image.convert('RGB')
+    width, height = rgb.size
+    flat_pixels = np.array(rgb.getdata())
+    return flat_pixels.reshape((height, width, 3)).astype(np.uint8)
+
+
+def run_inference_for_single_image(image):
+    """Run the module-level detection graph on one image.
+
+    Only the output tensors that exist in the graph are fetched. When the
+    graph provides masks they are reframed from box to image coordinates
+    before the run.
+
+    :param image: image array; a batch dimension is added before feeding
+    :return: dict with ``num_detections`` (int), ``detection_classes``
+        (uint8), ``detection_boxes``, ``detection_scores`` and, if available,
+        ``detection_masks``, all with the batch dimension removed
+    """
+    available_names = {out.name
+                       for operation in detection_graph.get_operations()
+                       for out in operation.outputs}
+    wanted_keys = ('num_detections', 'detection_boxes', 'detection_scores',
+                   'detection_classes', 'detection_masks')
+    tensor_dict = {key: detection_graph.get_tensor_by_name(key + ':0')
+                   for key in wanted_keys
+                   if key + ':0' in available_names}
+
+    if 'detection_masks' in tensor_dict:
+        # The following processing is only for a single image.
+        boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
+        masks = tf.squeeze(tensor_dict['detection_masks'], [0])
+        # Keep only the real detections, then translate the masks from box
+        # coordinates to image coordinates at the image size.
+        detection_count = tf.cast(tensor_dict['num_detections'][0], tf.int32)
+        boxes = tf.slice(boxes, [0, 0], [detection_count, -1])
+        masks = tf.slice(masks, [0, 0, 0], [detection_count, -1, -1])
+        reframed = utils_ops.reframe_box_masks_to_image_masks(
+            masks, boxes, image.shape[0], image.shape[1])
+        reframed = tf.cast(tf.greater(reframed, 0.5), tf.uint8)
+        # Add the batch dimension back, following the convention.
+        tensor_dict['detection_masks'] = tf.expand_dims(reframed, 0)
+
+    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
+
+    output_dict = sess.run(tensor_dict,
+                           feed_dict={image_tensor: np.expand_dims(image, 0)})
+
+    # Outputs come back as float32 arrays with a batch dimension.
+    output_dict['num_detections'] = int(output_dict['num_detections'][0])
+    output_dict['detection_classes'] = output_dict['detection_classes'][0].astype(np.uint8)
+    output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
+    output_dict['detection_scores'] = output_dict['detection_scores'][0]
+    if 'detection_masks' in output_dict:
+        output_dict['detection_masks'] = output_dict['detection_masks'][0]
+    return output_dict
+
+
+def image_detect(image_np, category, score_threshold):
+    """Detect objects in an image and return the boxes above the threshold.
+
+    :param image_np: image accepted by ``load_image_into_numpy_array``
+    :param category: mapping ``label_index -> {'name': ...}``
+    :param score_threshold: detections must score strictly above this value
+    :return: list of ``(x0, y0, x1, y1, label_index, score, label_name)`` with
+        pixel coordinates in the input image
+    """
+    pixels = load_image_into_numpy_array(image_np)
+    height, width = pixels.shape[0], pixels.shape[1]
+    with tf.device("/device:GPU:{}".format(0)):
+        output = run_inference_for_single_image(pixels)
+
+    scores = output['detection_scores']
+    selected = np.where(scores > score_threshold)
+    # one row per detection: ymin, xmin, ymax, xmax (normalised), score, label
+    rows = np.concatenate(
+        [output['detection_boxes'][selected],
+         np.expand_dims(scores[selected], axis=1),
+         np.expand_dims(output['detection_classes'][selected], axis=1)],
+        axis=1)
+
+    detections = []
+    for row in rows:
+        top_px = int(row[0] * height)
+        left_px = int(row[1] * width)
+        bottom_px = int(row[2] * height)
+        right_px = int(row[3] * width)
+        label_index = int(row[5])
+        detections.append((left_px, top_px, right_px, bottom_px,
+                           label_index, row[4], category[label_index]['name']))
+    return detections
+
+
+def get_choice_m_row_and_col(left, top, image):
+    """Estimate rows, columns and cell size of a choice_m region with the SSD model.
+
+    The crop is converted to grayscale when it is RGB, resized so its longer
+    side is 300 px and pasted on a white 300x300 canvas. Detected cells
+    (label other than ``'T'``, inside the pasted area) are mapped back to the
+    crop; rows and columns are then counted from the gaps between the sorted
+    cell origins.
+
+    :param left: x offset of the crop inside the full sheet
+    :param top: y offset of the crop inside the full sheet
+    :param image: crop as an array accepted by ``Image.fromarray``
+    :return: dict with ``bounding_box``, ``single_height``, ``single_width``,
+        ``rows``, ``cols``, ``class_name`` and ``all_small_coordinate`` (sheet
+        coordinates), or ``{}`` when fewer than two detections are returned or
+        none of them is a usable cell
+    """
+    im_resize = 300
+    image_src = Image.fromarray(image)
+    if image_src.mode == 'RGB':
+        image_src = image_src.convert("L")
+    w, h = image_src.size
+    if h > w:
+        image_src = image_src.resize((int(im_resize / h * w), im_resize))
+    else:
+        image_src = image_src.resize((im_resize, int(im_resize / w * h)))
+    w_, h_ = image_src.size
+    image_300 = Image.new(image_src.mode, (im_resize, im_resize), (255))
+    image_300.paste(image_src, [0, 0, w_, h_])
+
+    category_index = label_map_util.create_category_index_from_labelmap(tf_settings.choice_m_ssd_label,
+                                                                        use_display_name=True)
+    detections = image_detect(image_300, category_index, 0.5)
+    if len(detections) <= 1:
+        return {}
+
+    box_xmin = []
+    box_ymin = []
+    box_xmax = []
+    box_ymax = []
+    x_distance_all = []
+    y_distance_all = []
+    x_width_all = []
+    y_height_all = []
+    all_small_coordinate = []
+    ssd_column = 1
+    ssd_row = 1
+    count_x = 0
+    count_y = 0
+    for box in detections:
+        if box[-1] != 'T' and box[2] <= w_ and box[3] <= h_:
+            box0 = round(box[0] * (w / w_))  # Map to the original image
+            box1 = round(box[1] * (h / h_))
+            box2 = round(box[2] * (w / w_))
+            box3 = round(box[3] * (h / h_))
+            box_xmin.append(box0)
+            box_ymin.append(box1)
+            box_xmax.append(box2)
+            box_ymax.append(box3)
+            all_small_coordinate.append({'xmin': box0 + left,
+                                         'ymin': box1 + top,
+                                         'xmax': box2 + left,
+                                         'ymax': box3 + top})
+            x_width_all.append(box2 - box0)
+            y_height_all.append(box3 - box1)
+
+    if not box_xmin:
+        # Every detection was filtered out; the statistics below would fail
+        # on empty lists, so report "nothing found" like the branch above.
+        return {}
+
+    sorted_xmin = sorted(box_xmin)
+    sorted_ymin = sorted(box_ymin)
+    sorted_xmax = sorted(box_xmax)
+    sorted_ymax = sorted(box_ymax)
+
+    x_width_all_sorted = sorted(x_width_all, reverse=True)
+    y_height_all_sorted = sorted(y_height_all, reverse=True)
+    x_width_median = np.median(x_width_all_sorted)
+    y_height_median = np.median(y_height_all_sorted)
+
+    for i in range(len(sorted_xmin) - 1):
+        x_distance = abs(sorted_xmin[i + 1] - sorted_xmin[i])
+        y_distance = abs(sorted_ymin[i + 1] - sorted_ymin[i])
+        # a jump between consecutive sorted origins starts a new column / row
+        if x_distance > 20:
+            ssd_column = ssd_column + 1
+            x_distance_all.append(x_distance)
+            # a jump wider than two cells hints at a missed column
+            if x_distance > 2 * x_width_median + 4:
+                count_x = count_x + 1
+        if y_distance > 10:
+            ssd_row = ssd_row + 1
+            y_distance_all.append(y_distance)
+            if y_distance > 2 * y_height_median + 3:
+                count_y = count_y + 1
+        # a cell far from the median size is not counted as its own column / row
+        if x_width_all_sorted[i] - x_width_median > 40:
+            ssd_column = ssd_column - 1
+        elif x_width_median - x_width_all_sorted[i] > 40:
+            ssd_column = ssd_column - 1
+        if y_height_all_sorted[i] - y_height_median > 20:
+            ssd_row = ssd_row - 1
+        elif y_height_median - y_height_all_sorted[i] > 20:
+            ssd_row = ssd_row - 1
+
+    if count_x < len(x_distance_all) / 2 + 1:
+        ssd_column = ssd_column + count_x
+    elif count_y < len(y_distance_all) / 2 + 1:
+        ssd_row = ssd_row + count_y
+
+    average_height = int(np.mean(y_height_all))
+    average_width = int(np.mean(x_width_all))
+
+    location_ssd = {'xmin': sorted_xmin[0] + left,
+                    'ymin': sorted_ymin[0] + top,
+                    'xmax': sorted_xmax[-1] + left,
+                    'ymax': sorted_ymax[-1] + top}
+
+    choice_m_ssd = {'bounding_box': location_ssd,
+                    "single_height": average_height,
+                    "single_width": average_width,
+                    "rows": ssd_row,
+                    "cols": ssd_column,
+                    'class_name': 'choice_m',
+                    'all_small_coordinate': all_small_coordinate
+                    }
+    return choice_m_ssd

+ 496 - 0
segment/sheet_resolve/analysis/choice/get_title_number_by_choice_m.py

@@ -0,0 +1,496 @@
+from segment.sheet_resolve.tools import utils
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate, get_ocr_text_and_coordinate0
+import numpy as np
+import re, os
+import xml.etree.cElementTree as ET
+import cv2
+
+
+def combine_char(all_digital_list):
+    """Fuse neighbouring OCR characters into two-character strings.
+
+    The list is walked pairwise in the order given. Two consecutive entries
+    are joined when their ``top`` values differ by less than the first
+    entry's height and the second entry's ``left`` lies within 1.8 widths of
+    the first entry's ``left``. A joined pair consumes both entries, so one
+    output element never holds more than two input characters.
+
+    :param all_digital_list: list of dicts, each with a ``char`` string and a
+        ``location`` dict holding ``left``, ``top``, ``width`` and ``height``.
+    :return: new list in the same format; joined entries are fresh dicts,
+        every other entry is the original object.
+    """
+    merged = []
+    total = len(all_digital_list)
+    pos = 0
+    while pos < total:
+        first = all_digital_list[pos]
+        if pos == total - 1:
+            # The last entry has no right-hand neighbour to pair with.
+            merged.append(first)
+            break
+        second = all_digital_list[pos + 1]
+        first_loc = first['location']
+        second_loc = second['location']
+        # Vertical offset between the two characters is below one character height.
+        on_same_line = abs(first_loc['top'] - second_loc['top']) < first_loc['height']
+        # The neighbour starts within 1.8 character widths of this character's left edge.
+        close_enough = first_loc['left'] + 1.8 * first_loc['width'] > second_loc['left']
+        if not (on_same_line and close_enough):
+            merged.append(first)
+            pos += 1
+            continue
+
+        # Order the pair left-to-right before concatenating the text.
+        left_item, right_item = sorted([first, second], key=lambda k: k.get('location')['left'])
+        left_loc = left_item['location']
+        right_loc = right_item['location']
+        merged.append({
+            'char': left_item['char'] + right_item['char'],
+            'location': {
+                'left': left_loc['left'],
+                'top': min(left_loc['top'], right_loc['top']),
+                'width': right_loc['left'] + right_loc['width'] - left_loc['left'],
+                'height': max(left_loc['height'], right_loc['height']),
+            },
+        })
+        pos += 2
+    return merged
+
+
+def get_x_diff_and_y_diff0(single_choice_m_coordinates):
+    """Return the mean non-negative horizontal gap between consecutive boxes.
+
+    The gap for a pair is ``xmin`` of the later box minus ``xmax`` of the
+    earlier box; negative gaps are discarded before averaging.
+
+    :param single_choice_m_coordinates: sequence of ``[xmin, ymin, xmax, ymax]``
+        boxes, taken in the order given.
+    :return: the mean gap truncated to ``int``.
+    """
+    boxes = np.array(single_choice_m_coordinates)
+    gaps = (boxes[1:, 0] - boxes[:-1, 2]).tolist()
+    kept_gaps = [gap for gap in gaps if not gap < 0]
+    return int(np.mean(kept_gaps))
+
+
+def get_x_diff_and_y_diff(single_choice_m_coordinates):
+    """Return the mean non-negative horizontal and vertical gaps between consecutive boxes.
+
+    Horizontal gap: ``xmin`` of the later box minus ``xmax`` of the earlier one.
+    Vertical gap: ``ymin`` of the later box minus ``ymax`` of the earlier one.
+    Negative gaps are dropped on each axis independently before averaging.
+
+    :param single_choice_m_coordinates: sequence of ``[xmin, ymin, xmax, ymax]``
+        boxes, taken in the order given.
+    :return: ``(x_gap, y_gap)`` with both means truncated to ``int``.
+    """
+    boxes = np.array(single_choice_m_coordinates)
+
+    def _mean_kept_gap(start_col, end_col):
+        # Gap between the start edge of box i+1 and the end edge of box i.
+        gaps = (boxes[1:, start_col] - boxes[:-1, end_col]).tolist()
+        return int(np.mean([gap for gap in gaps if not gap < 0]))
+
+    x_gap = _mean_kept_gap(0, 2)
+    y_gap = _mean_kept_gap(1, 3)
+    return (x_gap, y_gap)
+
+
+def choice_bbox_vague(choice_m_absolute_box, x_y_interval, single_width, single_height, direction):
+    """Build one box around all given boxes, grown on one side.
+
+    For ``direction == 180`` the left edge is moved further left by
+    ``x_y_interval[0] + single_height``; for ``direction == 90`` the top edge
+    is moved up by ``x_y_interval[1] + single_width``. Any other direction
+    yields ``None``.
+
+    :param choice_m_absolute_box: sequence of ``[xmin, ymin, xmax, ymax]`` boxes.
+    :param x_y_interval: ``(x_gap, y_gap)`` pair.
+    :param single_width: amount used for the 90 case.
+    :param single_height: amount used for the 180 case.
+    :param direction: 180 or 90.
+    :return: ``[xmin, ymin, xmax, ymax]`` list, or ``None``.
+    """
+    left_edges, top_edges, right_edges, bottom_edges = (
+        [box[axis] for box in choice_m_absolute_box] for axis in range(4))
+
+    if direction == 180:
+        gap = x_y_interval[0]
+        pad = single_height
+        corners = ([min(left_edges) - gap - pad, min(top_edges)],
+                   [max(right_edges), max(bottom_edges)])
+    elif direction == 90:
+        gap = x_y_interval[1]
+        pad = single_width
+        corners = ([min(left_edges), min(top_edges) - gap - pad],
+                   [max(right_edges), max(bottom_edges)])
+    else:
+        return None
+    return np.hstack(corners).tolist()
+
+
+def get_digital_near_choice_m_box(new_all_digital_list, choice_m_new_bbox, x_y_interval_ave, singe_box_width_height_ave, direction):
+    """Group OCR digits by the choice box they sit next to.
+
+    For ``direction == 180`` a digit is kept when
+    ``utils.decide_coordinate_left`` returns True for it; for
+    ``direction == 90`` ``utils.decide_coordinate_top`` is used instead.
+    With any other direction no digit is kept.
+
+    :param new_all_digital_list: list of dicts, each with a ``location`` entry.
+    :param choice_m_new_bbox: list of dicts, each with a ``bounding_box`` entry.
+    :param x_y_interval_ave: forwarded unchanged to the ``utils`` predicate.
+    :param singe_box_width_height_ave: forwarded unchanged to the ``utils`` predicate.
+    :param direction: 180 or 90.
+    :return: one ``{'bounding_box': ..., 'title_number': [...]}`` dict per
+        input box, in input order.
+    """
+    def _is_title_candidate(digit_location, box):
+        if direction == 180:
+            verdict = utils.decide_coordinate_left(digit_location, box, x_y_interval_ave, singe_box_width_height_ave)
+        elif direction == 90:
+            verdict = utils.decide_coordinate_top(digit_location, box, x_y_interval_ave, singe_box_width_height_ave)
+        else:
+            return False
+        return verdict == True
+
+    grouped = []
+    for choice_m in choice_m_new_bbox:
+        box = choice_m['bounding_box']
+        nearby_digits = [digit for digit in new_all_digital_list
+                         if _is_title_candidate(digit['location'], box)]
+        grouped.append({'bounding_box': box, 'title_number': nearby_digits})
+    return grouped
+
+
+def move_intersect_box(all_small_coordinate_list):
+    """Replace pairs of conflicting boxes by their coordinate-wise average.
+
+    Every ordered pair of distinct boxes is checked. A pair is merged when
+    ``utils.get_min_distance`` returns ``'i'`` for it (presumably meaning the
+    boxes intersect - the helper is not visible here) and the distance
+    between the left box's ``xmax`` and the right box's ``xmin`` exceeds a
+    quarter of the inner box's width. Both boxes of a merged pair are
+    dropped and one averaged box is appended for them.
+
+    :param all_small_coordinate_list: list of ``[xmin, ymin, xmax, ymax]`` lists.
+    :return: the untouched boxes sorted by ``xmin``, followed by the averaged boxes.
+    """
+    boxes = sorted(all_small_coordinate_list, key=lambda k: k[0])
+
+    absorbed = []
+    averaged = []
+    for outer_pos, outer in enumerate(boxes):
+        for inner_pos, inner in enumerate(boxes):
+            if inner_pos == outer_pos:
+                continue
+            if not utils.get_min_distance(inner, outer) == 'i':
+                continue
+            leftmost, rightmost = sorted([inner, outer], key=lambda k: k[0])
+            if abs(leftmost[2] - rightmost[0]) > int(inner[2] - inner[0]) // 4:
+                absorbed.append(inner)
+                absorbed.append(outer)
+                averaged.append([(inner[axis] + outer[axis]) // 2 for axis in range(4)])
+
+    # Each pair is visited in both orders, so drop duplicates while keeping first-seen order.
+    absorbed_unique = [list(t) for t in set(tuple(box) for box in absorbed)]
+    absorbed_unique.sort(key=absorbed.index)
+    averaged_unique = [list(t) for t in set(tuple(box) for box in averaged)]
+    averaged_unique.sort(key=averaged.index)
+
+    survivors = [box for box in boxes if box not in absorbed_unique]
+    return survivors + averaged_unique
+
+
+def get_one_line_box(all_small_coordinate_list, height):
+    """Split boxes into rows and order each row left-to-right.
+
+    Boxes are sorted by ``ymin``; a new row starts wherever ``ymin`` jumps
+    by more than ``height // 3`` from one box to the next.
+
+    :param all_small_coordinate_list: list of ``[xmin, ymin, xmax, ymax]`` boxes.
+    :param height: reference box height used for the row-break threshold.
+    :return: list of rows, top to bottom; each row is a list of boxes sorted by ``xmin``.
+    """
+    by_top = sorted(all_small_coordinate_list, key=lambda k: k[1])
+    tops = np.array(by_top)[:, 1]
+    steps = tops[1:] - tops[:-1]
+    threshold = height // 3
+
+    # A cut at position p means the box at index p opens a new row.
+    cuts = [pos + 1 for pos, step in enumerate(steps) if step > 0 and abs(step) > threshold]
+    bounds = sorted(set([0, len(by_top)] + cuts))
+
+    return [sorted(by_top[start:stop], key=lambda k: k[0])
+            for start, stop in zip(bounds[:-1], bounds[1:])]
+
+
+def get_one_col_box(all_small_coordinate_list, width):
+    """Split boxes into columns.
+
+    Boxes are sorted by ``xmin``; a new column starts wherever ``xmin``
+    jumps by more than ``width // 3`` from one box to the next.
+
+    :param all_small_coordinate_list: list of ``[xmin, ymin, xmax, ymax]`` boxes.
+    :param width: reference box width used for the column-break threshold.
+    :return: list of columns, left to right; each column is a list of boxes.
+    """
+    by_left = sorted(all_small_coordinate_list, key=lambda k: k[0])
+    lefts = np.array(by_left)[:, 0]
+    steps = lefts[1:] - lefts[:-1]
+    threshold = width // 3
+
+    # A cut at position p means the box at index p opens a new column.
+    cuts = [pos + 1 for pos, step in enumerate(steps) if step > 0 and abs(step) > threshold]
+    bounds = sorted(set([0, len(by_left)] + cuts))
+
+    # NOTE(review): each column is re-sorted by xmin (k[0]), which leaves the
+    # order unchanged; a top-to-bottom sort by k[1] may have been intended - confirm.
+    return [sorted(by_left[start:stop], key=lambda k: k[0])
+            for start, stop in zip(bounds[:-1], bounds[1:])]
+
+
+def analysis_s_box(choice_m_bbox_list):
+    """Fill in option boxes that are missing from incomplete rows of each choice group.
+
+    For every group the small boxes are de-conflicted with
+    ``move_intersect_box`` and split into rows with ``get_one_line_box``.
+    The horizontal spacing is estimated from the rows that already hold
+    ``cols`` boxes, and every other row is padded with synthetic boxes at the
+    column positions that have no detected box.
+
+    :param choice_m_bbox_list: list of dicts with ``all_small_coordinate``
+        (list of ``xmin``/``ymin``/``xmax``/``ymax`` dicts), ``cols``,
+        ``single_width`` and ``bounding_box`` entries.
+    :return: list of the same dict objects; each one is updated in place so
+        that ``all_small_coordinate`` holds the detected plus synthetic boxes.
+    """
+    choice_m_box_dict = []
+    for s_choice_m_dict in choice_m_bbox_list:
+        cols = s_choice_m_dict['cols']
+        raw_boxes = [[ele['xmin'], ele['ymin'], ele['xmax'], ele['ymax']]
+                     for ele in s_choice_m_dict['all_small_coordinate']]
+        raw_array = np.array(sorted(raw_boxes, key=lambda k: k[1]))
+        # (width, height) of an average small box, measured before merging.
+        s_box_wid_hei = (
+            int(np.mean(raw_array[:, 2])) - int(np.mean(raw_array[:, 0])),
+            int(np.mean(raw_array[:, 3])) - int(np.mean(raw_array[:, 1])))
+
+        merged_boxes = move_intersect_box(raw_boxes)
+        res_list = get_one_line_box(merged_boxes, s_box_wid_hei[1])
+
+        # Rows that already contain the expected number of boxes.
+        full_rows = [row for row in res_list if len(row) == cols]
+        s_box_all = [box for row in full_rows for box in row]
+
+        # Estimate the horizontal gap between neighbouring boxes.
+        gaps = np.array([])
+        if s_box_all:
+            s_box_arr = np.array(s_box_all)
+            gaps = s_box_arr[1:, 0] - s_box_arr[:-1, 2]
+            if len(full_rows) > 1:
+                # Row changes produce negative gaps; they are not spacings.
+                gaps = gaps[gaps >= 0]
+        if gaps.size:
+            x_y_interval = int(np.mean(gaps))
+        else:
+            # No usable gap (no complete row, a single-column group, or only
+            # negative gaps): fall back to two thirds of the box width
+            # instead of converting NaN to int.
+            x_y_interval = int((s_choice_m_dict['single_width'] * 2) // 3)
+
+        step = s_box_wid_hei[0] + x_y_interval
+        all_small_coordinate = []
+        res_list = sorted(res_list, key=lambda k: k[0])
+        for box_list in res_list:
+            all_small_coordinate.extend(box_list)
+            if len(box_list) == cols:
+                continue
+
+            # Column index of every detected box, measured from the group's left edge.
+            choice_m_xmin = s_choice_m_dict['bounding_box']['xmin']
+            exist_index_all = []
+            for box in box_list:
+                offset = abs(choice_m_xmin - box[0])
+                if offset <= s_box_wid_hei[0]:
+                    exist_index_all.append(0)
+                else:
+                    exist_index_all.append(round(offset / step))
+            lack_index = [col for col in range(cols) if col not in exist_index_all]
+
+            # Pick the box whose column index is known and extrapolate from it.
+            if 0 in exist_index_all:
+                anchor, anchor_col = box_list[0], 0
+            elif (cols - 1) in exist_index_all:
+                anchor, anchor_col = box_list[-1], cols - 1
+            elif 1 in exist_index_all:
+                anchor, anchor_col = box_list[0], 1
+            else:
+                continue
+            for lack_ele in lack_index:
+                xmin = anchor[0] + (lack_ele - anchor_col) * step
+                all_small_coordinate.append([xmin, anchor[1], xmin + s_box_wid_hei[0], anchor[3]])
+
+        all_small_coordinate0 = [{'xmin': s_bbox[0], 'ymin': s_bbox[1],
+                                  'xmax': s_bbox[2], 'ymax': s_bbox[3]}
+                                 for s_bbox in all_small_coordinate]
+        s_choice_m_dict.update({'all_small_coordinate': all_small_coordinate0})
+        choice_m_box_dict.append(s_choice_m_dict)
+    return choice_m_box_dict
+
+
+def get_title_number(choice_bbox, choice_region, choice_m_box_dict, direction):
+    """Assign question (title) numbers to every choice group using OCR digits.
+
+    Steps, as implemented below: OCR ``choice_region`` and keep characters
+    containing a digit; convert each group's boxes with
+    ``utils.get_img_region_box1``; collect the digits near each group with
+    ``get_digital_near_choice_m_box`` and fuse neighbours with
+    ``combine_char``; take the digits directly when their count equals
+    ``rows``, otherwise patch the group's existing ``number`` list and pass
+    it to ``utils.infer_number``; finally convert the boxes back with
+    ``utils.get_img_region_box0``.
+
+    :param choice_bbox: box handed to ``utils.get_img_region_box1`` /
+        ``utils.get_img_region_box0`` for coordinate conversion (presumably
+        the position of ``choice_region`` in the full sheet - TODO confirm).
+    :param choice_region: image passed to ``get_ocr_text_and_coordinate0``.
+    :param choice_m_box_dict: list of dicts with ``bounding_box``,
+        ``all_small_coordinate``, ``rows`` and ``cols`` entries; ``number`` is
+        read only on the fallback path. The dicts are updated in place.
+    :param direction: forwarded to ``get_digital_near_choice_m_box``.
+    :return: list of the updated dicts, with ``number`` set and the temporary
+        ``x_y_interval``, ``s_box_w_h`` and ``title_number`` keys removed.
+    """
+    words_result_choice = get_ocr_text_and_coordinate0(choice_region, ocr_accuracy='accurate', language_type='CHN_ENG')
+    # Keep only OCR characters that contain a digit.
+    all_digital_list0 = []
+    pattern = re.compile(r'\d')
+    for i, chars_dict in enumerate(words_result_choice):
+        chars_list = chars_dict['chars']
+        for ele in chars_list:
+            if pattern.search(ele['char']):
+                all_digital_list0.append(ele)
+
+    # Disabled debug dump: writes the raw digit boxes to an XML file.
+    # tree = ET.parse(r'C:\Users\admin\Desktop\exam_segment_django113\segment\exam_info\000000-template.xml')  # xml tree
+    # for index, bbox in enumerate(all_digital_list0):
+    #     # bbox0 = region_info['bbox']
+    #     location = bbox['location']
+    #     xmin = location['left']
+    #     ymin = location['top']
+    #     xmax = location['left'] + location['width']
+    #     ymax = location['top'] + location['height']
+    #     tree = utils.create_xml(bbox['char'], tree, xmin, ymin, xmax, ymax)
+    # tree.write(r'C:\Users\admin\Desktop\exam_segment_django113\segment\exam_image\sheet\arts_comprehensive\2020-02-05\choice_region_00.xml')
+
+    # Collect digits for which decide_coordinate_full_contains2 reports True
+    # against a converted choice box.
+    # NOTE(review): the filtered list built from this (all_digital_list) is
+    # never passed on; the grouping call further down receives the unfiltered
+    # all_digital_list0 - confirm whether that is intentional.
+    delete_list = []
+    for ele_digtal in all_digital_list0:
+        for ele_choice_m in choice_m_box_dict:
+            xmin_d = ele_digtal['location']['left']
+            ymin_d = ele_digtal['location']['top']
+            xmax_d = ele_digtal['location']['left'] + ele_digtal['location']['width']
+            ymax_d = ele_digtal['location']['top'] + ele_digtal['location']['height']
+
+            ele_digtal_bbox = [xmin_d, ymin_d, xmax_d, ymax_d]
+
+            ele_choice_m_bbox = [ele_choice_m['bounding_box']['xmin'], ele_choice_m['bounding_box']['ymin'],
+                                 ele_choice_m['bounding_box']['xmax'], ele_choice_m['bounding_box']['ymax']]
+
+            choice_m_new_box = utils.get_img_region_box1(ele_choice_m_bbox, choice_bbox)
+            if utils.decide_coordinate_full_contains2(choice_m_new_box, ele_digtal_bbox) == True:
+                delete_list.append(ele_digtal)
+
+    all_digital_list = []
+
+    for ele in all_digital_list0:
+        if ele in delete_list:
+            continue
+        else:
+            all_digital_list.append(ele)
+
+    # Disabled debug dump: writes the combined digit boxes to an XML file.
+    # new_all_digital_list = combine_char(all_digital_list)
+    #
+    # tree = ET.parse(r'C:\Users\admin\Desktop\exam_segment_django113\segment\exam_info\000000-template.xml')  # xml tree
+    # for index, bbox in enumerate(new_all_digital_list):
+    #     # bbox0 = region_info['bbox']
+    #     location = bbox['location']
+    #     xmin = location['left']
+    #     ymin = location['top']
+    #     xmax = location['left'] + location['width']
+    #     ymax = location['top'] + location['height']
+    #     tree = utils.create_xml(bbox['char'], tree, xmin, ymin, xmax, ymax)
+    # tree.write(r'C:\Users\admin\Desktop\exam_segment_django113\segment\exam_image\sheet\arts_comprehensive\2020-02-05\choice_region_0.xml')
+
+    # Convert every group's boxes with get_img_region_box1 and record its
+    # spacing and average small-box size. The input dicts are updated in place.
+    choice_m_box_dict_new = []
+    x_y_interval_all = []
+    s_box_w_h = []
+    for index, s_choice_m_box in enumerate(choice_m_box_dict):
+        choice_m_box = [s_choice_m_box['bounding_box']['xmin'], s_choice_m_box['bounding_box']['ymin'],
+                        s_choice_m_box['bounding_box']['xmax'], s_choice_m_box['bounding_box']['ymax']]
+        choice_m_new_box = utils.get_img_region_box1(choice_m_box, choice_bbox)
+        all_small_coordinate_dict = s_choice_m_box['all_small_coordinate']
+        all_small_coordinate_list = [[ele['xmin'], ele['ymin'], ele['xmax'], ele['ymax']] for ele in
+                                     all_small_coordinate_dict]
+        all_small_coordinate_new = []
+        for s_bbox in all_small_coordinate_list:
+            s_bbox_new = utils.get_img_region_box1(s_bbox, choice_bbox)
+            all_small_coordinate_new.append(s_bbox_new)
+        col = s_choice_m_box['cols']
+        x_y_interval = utils.get_x_diff_and_y_diff1(all_small_coordinate_new, col)
+        x_y_interval_all.append(x_y_interval)
+
+        all_small_coordinate_list = sorted(all_small_coordinate_new, key=lambda k: k[1])
+        s_box_array = np.array(all_small_coordinate_list)
+        # (width, height) of an average small box in this group.
+        s_box_wid_hei = (int(np.mean(s_box_array[:, 2])) - int(np.mean(s_box_array[:, 0])),
+                         int(np.mean(s_box_array[:, 3])) - int(np.mean(s_box_array[:, 1])))
+        s_box_w_h.append(s_box_wid_hei)
+        s_choice_m_box.update({'bounding_box': choice_m_new_box,
+                               'all_small_coordinate': all_small_coordinate_new,
+                               's_box_w_h': s_box_wid_hei,
+                               'x_y_interval': x_y_interval})
+
+        choice_m_box_dict_new.append(s_choice_m_box)
+
+    # Averages over all groups, used as tolerances when matching digits to groups.
+    x_y_interval_arr = np.array(x_y_interval_all)
+    x_y_interval_ave = (int(np.mean(x_y_interval_arr[:, 0])), int(np.mean(x_y_interval_arr[:, 1])))
+
+    s_box_w_h_arr = np.array(s_box_w_h)
+    singe_box_width_height_ave = (int(np.mean(s_box_w_h_arr[:, 0])), int(np.mean(s_box_w_h_arr[:, 1])))
+
+    digital_list_by_choice_m = get_digital_near_choice_m_box(all_digital_list0, choice_m_box_dict_new, x_y_interval_ave, singe_box_width_height_ave, direction)
+
+    # Sort each group's digits top-to-bottom, then fuse neighbouring characters.
+    for number in digital_list_by_choice_m:
+        title_number_list = number['title_number']
+
+        all_digital_list = sorted(title_number_list, key=lambda k: k.get('location')['top'])
+        new_title_number_list = combine_char(all_digital_list)
+        number.update({'title_number': new_title_number_list})
+
+    # Disabled debug dump: writes the per-group title numbers to an XML file.
+    # tree = ET.parse(r'C:\Users\admin\Desktop\exam_segment_django113\segment\exam_info\000000-template.xml')  # xml tree
+    # for index, bbox0 in enumerate(digital_list_by_choice_m):
+    #     title_number_list = bbox0['title_number']
+    #     for bbox in title_number_list:
+    #         location = bbox['location']
+    #         xmin = location['left']
+    #         ymin = location['top']
+    #         xmax = location['left'] + location['width']
+    #         ymax = location['top'] + location['height']
+    #         tree = utils.create_xml(bbox['char'], tree, xmin, ymin, xmax, ymax)
+    # tree.write(r'C:\Users\admin\Desktop\exam_segment_django113\segment\exam_image\sheet\arts_comprehensive\2020-02-05\choice_region_0.xml')
+
+
+    # Two stable sorts: the result is ordered by bounding_box[0], ties by bounding_box[1].
+    # assumes bounding_box (from get_img_region_box1) is an indexable [xmin, ymin, ...] sequence - TODO confirm
+    digital_list_by_choice_m = sorted(digital_list_by_choice_m, key=lambda k: k.get('bounding_box')[1])
+    digital_list_by_choice_m = sorted(digital_list_by_choice_m, key=lambda k: k.get('bounding_box')[0])
+
+    # Attach the digit list to the group with an equal bounding box.
+    all_list = []
+    for index0, ele0 in enumerate(choice_m_box_dict_new):
+        for index1, ele1 in enumerate(digital_list_by_choice_m):
+            choice_m0 = ele0['bounding_box']
+            choice_m1 = ele1['bounding_box']
+            if choice_m0 == choice_m1:
+                ele0.update({'title_number': ele1['title_number']})
+                all_list.append(ele0)
+
+    all_list_new = []
+    a_z = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    for choice_m_index, choice_m_s in enumerate(all_list):
+        # if direction == 180:     # title numbers laid out vertically
+        row_and_col = (choice_m_s['rows'], choice_m_s['cols'])
+        title_number = choice_m_s['title_number']
+        x_y_interval = choice_m_s['x_y_interval']
+        single_bbox_width_height = choice_m_s['s_box_w_h']
+        s_box = choice_m_s['all_small_coordinate']
+        if len(title_number) == row_and_col[0]:
+            # One OCR number per row: use them directly.
+            title_number_list = [int(ele['char']) for ele in title_number]
+            choice_m_s.update({'title_number': title_number_list})
+            all_list_new.append(choice_m_s)
+        else:
+            # Count mismatch: match OCR numbers to rows by vertical position,
+            # then let utils.infer_number produce the final list.
+            s_box = sorted(s_box, key=lambda k: k[1])
+            s_box_row = get_one_line_box(s_box, single_bbox_width_height[1])
+            row_box_ = []
+
+            index_list = []  # row indices that have a matched title number
+            for index0, ele0 in enumerate(s_box_row):
+                for index1, ele1 in enumerate(title_number):
+                    title_number_bbox = [ele1['location']['left'], ele1['location']['top'],
+                                         ele1['location']['left'] + ele1['location']['width'],
+                                         ele1['location']['top'] + ele1['location']['height']]
+                    row_box = {}
+                    # Match when the row's first box has top and bottom edges
+                    # within one box height of the number's top and bottom.
+                    if title_number_bbox[1] - single_bbox_width_height[1] < ele0[0][1] < title_number_bbox[1] + \
+                            single_bbox_width_height[1] \
+                            and title_number_bbox[3] - single_bbox_width_height[1] < ele0[0][3] < \
+                            title_number_bbox[3] + single_bbox_width_height[1]:
+                        row_box['title_number'] = ele1
+                        row_box['row_box'] = ele0
+                        row_box_.append(row_box)
+                        index_list.append(index0)
+            # NOTE(review): this rebinding of index0 (missing row indices) is not read afterwards.
+            index0 = list(set([i for i in range(0, row_and_col[0])]) - set(index_list))  # lack index
+            number0 = choice_m_s['number']
+            # NOTE(review): title_number is indexed by match position, while the
+            # matched pairs are stored in row_box_ (otherwise unused); the two
+            # agree only if every OCR number matched exactly one row in order - confirm.
+            for index, exist_index in enumerate(index_list):
+                number_char = choice_m_s['title_number'][index]['char']
+                number0[exist_index] = int(number_char)
+            new_number_list = utils.infer_number(number0)
+            choice_m_s.update({'title_number': new_number_list})
+            all_list_new.append(choice_m_s)
+    # print(all_list_new)
+    # Convert boxes back with get_img_region_box0 and drop the temporary keys.
+    title_number_by_choice_m_list = []     # sort change coordinate
+    for index, single_choice_m in enumerate(all_list_new):
+        small_bbox_list = []
+        for index_s, ele_s in enumerate(single_choice_m['all_small_coordinate']):
+            location_s = {}
+            s_box_new = utils.get_img_region_box0(ele_s, choice_bbox)
+            location_s['xmin'] = s_box_new[0]
+            location_s['ymin'] = s_box_new[1]
+            location_s['xmax'] = s_box_new[2]
+            location_s['ymax'] = s_box_new[3]
+            small_bbox_list.append(location_s)
+
+        choice_m_bbox = single_choice_m['bounding_box']
+        choice_m_by_img = utils.get_img_region_box0(choice_m_bbox, choice_bbox)
+        choice_m_by_img0 = utils.list_to_dict(choice_m_by_img)
+        single_choice_m.update({'number': single_choice_m['title_number']})
+        single_choice_m.update({'bounding_box': choice_m_by_img0,
+                                'all_small_coordinate': small_bbox_list})
+        single_choice_m.pop('x_y_interval')
+        single_choice_m.pop('s_box_w_h')
+        single_choice_m.pop('title_number')
+        title_number_by_choice_m_list.append(single_choice_m)
+    return title_number_by_choice_m_list

+ 3 - 0
segment/sheet_resolve/analysis/cloze/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py
+# @Time    : 2018/11/21 0021 下午 16:02

+ 101 - 0
segment/sheet_resolve/analysis/cloze/analysis_cloze.py

@@ -0,0 +1,101 @@
+# @Author  : lightXu
+# @File    : analysis_cloze.py
+import time
+
+import numpy as np
+import cv2
+
+from segment.sheet_resolve.lib.model.test import im_detect
+from segment.sheet_resolve.lib.model.nms_wrapper import nms
+from segment.sheet_resolve.lib.utils.timer import Timer
+from segment.sheet_resolve.tools import utils
+
+
+def analysis_single_image_with_regions(analysis_type, classes, sess, net,
+                                       im, conf_thresh, mns_thresh,
+                                       coordinate_bias_dict):
+    """Run the detector on one image and return the regions kept for each class.
+
+    The image is resized with ``utils.img_resize``, passed through
+    ``im_detect``, and for every non-background class the detections are
+    filtered with ``nms`` and the confidence threshold. Kept boxes are scaled
+    back by ``ratio``, shifted by the per-class bias and clamped to the
+    original image.
+
+    :param analysis_type: forwarded to ``utils.img_resize`` and ``im_detect``.
+    :param classes: class names; index 0 is skipped as background.
+    :param sess: forwarded to ``im_detect``.
+    :param net: forwarded to ``im_detect``.
+    :param im: image array; ``im.shape[0]`` bounds y and ``im.shape[1]`` bounds x.
+    :param conf_thresh: minimum score for a detection to be kept.
+    :param mns_thresh: threshold passed to ``nms``.
+    :param coordinate_bias_dict: optional per-class dict with ``xmin_bias``,
+        ``ymin_bias``, ``xmax_bias`` and ``ymax_bias``; classes without an
+        entry get zero bias.
+    :return: ``(content_list, class_names)`` - a list of
+        ``{"class_name", "bounding_box", "score"}`` dicts and the sorted list
+        of class names, one per kept region.
+    """
+
+    size = im.shape
+
+    # Detect all object classes and regress object bounds
+    timer = Timer()
+    timer.tic()
+    im, ratio = utils.img_resize(analysis_type, im)
+    scores, boxes = im_detect(analysis_type, sess, net, im)
+    timer.toc()
+    print('Detection took {:.3f}s for {:d} object proposals'.format(timer.total_time, boxes.shape[0]))
+
+    content_list = []
+    analysis_cls_list = []
+    # Start at 1 because index 0 is the background class.
+    for cls_ind, cls in enumerate(classes[1:], start=1):
+        cls_boxes = boxes[:, 4 * cls_ind:4 * (cls_ind + 1)]
+        cls_scores = scores[:, cls_ind]
+        dets = np.hstack((cls_boxes,
+                          cls_scores[:, np.newaxis])).astype(np.float32)
+        keep = nms(dets, mns_thresh)
+        dets = dets[keep, :]
+        inds = np.where(dets[:, -1] >= conf_thresh)[0]
+        if len(inds) == 0:
+            continue
+
+        if cls in coordinate_bias_dict:
+            cls_bias = coordinate_bias_dict[cls]
+            xmin_bias = cls_bias['xmin_bias']
+            ymin_bias = cls_bias['ymin_bias']
+            xmax_bias = cls_bias['xmax_bias']
+            ymax_bias = cls_bias['ymax_bias']
+        else:
+            xmin_bias = ymin_bias = xmax_bias = ymax_bias = 0
+
+        for i in inds:
+            bbox = dets[i, :4]
+            score = '{:.4f}'.format(dets[i, -1])
+
+            # Scale back to the original image, then apply the class bias.
+            xmin = int(int(bbox[0]) * ratio[0]) + xmin_bias
+            ymin = int(int(bbox[1]) * ratio[1]) + ymin_bias
+            xmax = int(int(bbox[2]) * ratio[0]) + xmax_bias
+            ymax = int(int(bbox[3]) * ratio[1]) + ymax_bias
+
+            if xmin_bias - xmax_bias >= xmax - xmin:
+                print('{:s}, xmin_bias - xmax_bias >= region_width'.format(cls))
+                continue
+            if ymin_bias - ymax_bias >= ymax - ymin:
+                print('{:s}, ymin_bias - ymax_bias >= region_height'.format(cls))
+                continue
+
+            # Clamp: xmin >= 1, ymin >= 1, xmax <= size[1] - 1, ymax <= size[0] - 1
+            xmin = (xmin if (xmin > 0) else 1)
+            ymin = (ymin if (ymin > 0) else 1)
+            xmax = (xmax if (xmax < size[1]) else size[1] - 1)
+            ymax = (ymax if (ymax < size[0]) else size[0] - 1)
+
+            bbox_dict = {"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax}
+            class_dict = {"class_name": cls, "bounding_box": bbox_dict, "score": score}
+            content_list.append(class_dict)
+
+            analysis_cls_list.append(cls)
+
+    return content_list, sorted(analysis_cls_list)
+
+
+def get_single_image_sheet_regions(analysis_type, im, classes,
+                                   sess, net, conf_thresh, mns_thresh,
+                                   coordinate_bias_dict):
+    """Detect regions on one image and wrap them in the result dict.
+
+    All arguments are forwarded to ``analysis_single_image_with_regions``;
+    the elapsed wall-clock time is printed.
+
+    :return: dict with ``img_name`` (always ``'cloze'``), ``analysis_type``
+        and ``regions`` (the detected region list).
+    """
+    started = time.time()
+
+    regions, _ = analysis_single_image_with_regions(analysis_type, classes,
+                                                    sess, net,
+                                                    im, conf_thresh, mns_thresh,
+                                                    coordinate_bias_dict)
+
+    result = {"img_name": 'cloze',
+              'analysis_type': analysis_type,
+              "regions": regions,
+              }
+
+    print(time.time() - started)
+
+    return result

+ 146 - 0
segment/sheet_resolve/analysis/cloze/cloze_box.py

@@ -0,0 +1,146 @@
+# @Author  : lightXu
+# @File    : cloze_box.py
+# @Time    : 2018/11/23 0023 上午 10:47
+import re
+import xml.etree.cElementTree as ET
+import time
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate
+from segment.sheet_resolve.tools import tf_settings, utils
+
+
+def get_cloze_box_coordinate(solve_img):
+    ocr_region = solve_img
+    t11 = time.time()
+    word_result_list = get_ocr_text_and_coordinate(ocr_region)
+    t22 = time.time()
+    print('choice ocr time cost: ', t22-t11)
+    return word_result_list
+
+
+def get_each_coordinate(solve_img):
+
+    word_result_list = get_cloze_box_coordinate(solve_img)
+    all_char_list = []
+    digital_model = re.compile(r'\d')
+    for i, chars_dict in enumerate(word_result_list):
+        chars_list = chars_dict['chars']
+        for ele in chars_list:
+            if digital_model.search(ele['char']):
+                all_char_list.append(ele)
+
+    return all_char_list
+
+
+def decide_coordinate_contains(solve_img, xml_box):
+    ocr_box = get_each_coordinate(solve_img)
+
+    ocr_all_char_list = []
+    i = 1
+    while i <= len(ocr_box):
+        pre_one = ocr_box[i - 1]
+        if i == len(ocr_box):
+            ocr_all_char_list.append(pre_one)
+            break
+        rear_one = ocr_box[i]
+        # 两字高度差小于一字高度
+        condition1 = abs(pre_one['location']['top'] - rear_one['location']['top']) < pre_one['location']['height']
+        # 两字长度大于两字间间隔
+        condition2 = abs(pre_one['location']['left'] + pre_one['location']['width']
+                         - rear_one['location']['left']) < pre_one['location']['width']
+        if condition1:
+            if condition2:
+                new_char = pre_one['char'] + rear_one['char']
+                new_location = {'left': pre_one['location']['left'],
+                                'top': min(pre_one['location']['top'], rear_one['location']['top']),
+                                'width': rear_one['location']['left'] + rear_one['location']['width']
+                                - pre_one['location']['left'],
+                                'height': max(pre_one['location']['height'], rear_one['location']['height'])}
+                ocr_all_char_list.append({'char': new_char, 'location': new_location})
+                i = i + 1 + 1
+            else:
+                ocr_all_char_list.append(pre_one)
+                i = i + 1
+        else:
+            ocr_all_char_list.append(pre_one)  # 遇到字符y轴相差过大就结束
+            i = i + 1
+
+    content_list = []
+    for index, xml_b in enumerate(xml_box):  # faster-rcnn 的框
+        bbox_right = []
+        xml_b = xml_b['bounding_box']
+        xmin2 = xml_b['xmin']
+        ymin2 = xml_b['ymin']
+        xmax2 = xml_b['xmax']
+        ymax2 = xml_b['ymax']
+        mid_x2 = int(xmin2 + (xmax2 - xmin2) // 2)
+        box_coordiante = (xmin2, ymin2, xmax2, ymax2)
+        choice_number = {'number': 999, 'location': box_coordiante}
+        content_list.insert(index, choice_number)
+
+        fixed_height = 60  # 高度固定
+        for ocr_b in ocr_all_char_list:  # ocr识别的框
+            xmin1 = ocr_b['location']['left']
+            ymin1 = ocr_b['location']['top']
+            xmax1 = xmin1 + ocr_b['location']['width']
+            ymax1 = ymin1 + ocr_b['location']['height']
+            mid_x = int(xmin1 + (xmax1 - xmin1) // 2)
+            mid_y = int(ymin1 + (ymax1 - ymin1) // 2)
+
+            if xmin2 <= mid_x <= xmax2 and ymin2 <= mid_y <= ymax2 and mid_x < mid_x2:  # 包含且在左侧
+                content_list[index]['number'] = ocr_b['char']
+
+            if xmin2 <= mid_x <= xmax2 and ymin2 <= mid_y <= ymax2 and mid_x >= mid_x2:  # 包含且在右侧
+                ocr_b['location']['left'] = int(xmin1 - 1 * ocr_b['location']['width'])  # 打分框的边框
+                ocr_b['location']['width'] = 3 * ocr_b['location']['width']
+                ocr_b['location']['top'] = int(ymin1 - 0.5 * ocr_b['location']['height'])
+
+                xmin = ocr_b['location']['left']
+                xmax = xmin + ocr_b['location']['width']
+                ymin = ocr_b['location']['top']
+                ymax = ymin + fixed_height
+                bbox_right.append({'points': ocr_b['char'], 'location': [xmin, ymin, xmax, ymax]})
+
+        ymin = min([ele['location'][1] for ele in bbox_right])
+        ymax = max([ele['location'][3] for ele in bbox_right])
+        for ele in bbox_right:
+            ele['location'][1] = ymin
+            ele['location'][3] = ymax
+
+        content_list[index]['right'] = bbox_right
+
+    return content_list
+
+
+def cloze(left, top, image, choice_bbox_list, xml_path):
+
+    content = decide_coordinate_contains(image, choice_bbox_list)
+
+    tree = ET.parse(xml_path)  # xml tree
+    cloze_list = []
+    for ele in content:
+        number = '{:02d}_cloze'.format(int(ele['number']))
+        cloze_xmin = ele['location'][0] + left
+        cloze_ymin = ele['location'][1] + top
+        cloze_xmax = ele['location'][2] + left
+        cloze_ymax = ele['location'][3] + top
+
+        tree = utils.create_xml(number, tree, cloze_xmin, cloze_ymin, cloze_xmax, cloze_ymax)
+        region = [cloze_xmin, cloze_ymin, cloze_xmax, cloze_ymax]
+
+        points_list = []
+        if len(ele['right']) > 0:  # 存在打分框
+            for right_ele in ele['right']:
+                points = right_ele['points']
+                xmin = right_ele['location'][0] + left
+                ymin = right_ele['location'][1] + top
+                xmax = right_ele['location'][2] + left
+                ymax = right_ele['location'][3] + top
+                point_number = '{}_{}'.format(number, points)
+
+                tree = utils.create_xml(point_number, tree, xmin, ymin, xmax, ymax)
+                region = [xmin, ymin, xmax, ymax]
+                points_list.append({'points': point_number, 'region': region})
+
+        cloze_list.append({'number': number, 'region': region, 'points': points_list})
+    tree.write(xml_path)
+    return cloze_list

+ 117 - 0
segment/sheet_resolve/analysis/cloze/cloze_line_box.py

@@ -0,0 +1,117 @@
+# @Author  : lightXu
+# @File    : cloze_line_box.py
+# @Time    : 2019/2/21 0021 上午 11:13
+
+import os
+import re
+import cv2
+import numpy as np
+import xml.etree.cElementTree as ET
+import time
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate
+from segment.sheet_resolve.tools import tf_settings, utils
+
+
+def get_cloze_box_coordinate(solve_img):
+    ocr_region = solve_img
+    t11 = time.time()
+    word_result_list = get_ocr_text_and_coordinate(ocr_region)
+    t22 = time.time()
+    print('cloze ocr time cost: ', t22-t11)
+    return word_result_list
+
+
+def get_each_coordinate(solve_img):
+    word_result_list = get_cloze_box_coordinate(solve_img)
+    all_char_list = []
+    digital_model = re.compile(r'\d')
+    for i, chars_dict in enumerate(word_result_list):
+        chars_list = chars_dict['chars']
+        for ele in chars_list:
+            if digital_model.search(ele['char']):
+                all_char_list.append(ele)
+
+    return all_char_list
+
+
+def decide_coordinate_contains(solve_img, xml_box):
+    ocr_box = get_each_coordinate(solve_img)
+
+    ocr_all_char_list = []
+    i = 1
+    while i <= len(ocr_box):
+        pre_one = ocr_box[i - 1]
+        if i == len(ocr_box):
+            ocr_all_char_list.append(pre_one)
+            break
+        rear_one = ocr_box[i]
+        # 两字高度差小于一字高度
+        condition1 = abs(pre_one['location']['top'] - rear_one['location']['top']) < pre_one['location']['height']
+        # 两字长度大于两字间间隔
+        condition2 = abs(pre_one['location']['left'] + pre_one['location']['width']
+                         - rear_one['location']['left']) < pre_one['location']['width']
+        if condition1:
+            if condition2:
+                new_char = pre_one['char'] + rear_one['char']
+                new_location = {'left': pre_one['location']['left'],
+                                'top': min(pre_one['location']['top'], rear_one['location']['top']),
+                                'width': rear_one['location']['left'] + rear_one['location']['width']
+                                - pre_one['location']['left'],
+                                'height': max(pre_one['location']['height'], rear_one['location']['height'])}
+                ocr_all_char_list.append({'char': new_char, 'location': new_location})
+                i = i + 1 + 1
+            else:
+                ocr_all_char_list.append(pre_one)
+                i = i + 1
+        else:
+            ocr_all_char_list.append(pre_one)  # 遇到字符y轴相差过大就结束
+            i = i + 1
+
+    content_list = []
+    for index, xml_b in enumerate(xml_box):  # faster-rcnn 的框
+        bbox_right = []
+        xml_b = xml_b['bounding_box']
+        xmin2 = xml_b['xmin']
+        ymin2 = xml_b['ymin']
+        xmax2 = xml_b['xmax']
+        ymax2 = xml_b['ymax']
+        mid_x2 = int(xmin2 + (xmax2 - xmin2) // 2)
+        box_coordiante = (xmin2, ymin2, xmax2, ymax2)
+        choice_number = {'number': 999, 'location': box_coordiante}
+        content_list.insert(index, choice_number)
+
+        fixed_height = 60
+        for ocr_b in ocr_all_char_list:  # ocr识别的框
+            xmin1 = ocr_b['location']['left']
+            ymin1 = ocr_b['location']['top']
+            xmax1 = xmin1 + ocr_b['location']['width']
+            ymax1 = ymin1 + ocr_b['location']['height']
+            mid_x = int(xmin1 + (xmax1 - xmin1) // 2)
+            mid_y = int(ymin1 + (ymax1 - ymin1) // 2)
+
+            if xmin2 <= mid_x <= xmax2 and ymin2 <= mid_y <= ymax2 and mid_x < mid_x2:  # 包含且在左侧
+                content_list[index]['number'] = ocr_b['char']
+
+    return content_list
+
+
+def cloze_line(left, top, image, choice_bbox_list, xml_path):
+
+    content = decide_coordinate_contains(image, choice_bbox_list)
+
+    tree = ET.parse(xml_path)  # xml tree
+    cloze_list = []
+    for ele in content:
+        number = int(ele['number'])
+        cloze_xmin = ele['location'][0] + left
+        cloze_ymin = ele['location'][1] + top
+        cloze_xmax = ele['location'][2] + left
+        cloze_ymax = ele['location'][3] + top
+
+        tree = utils.create_xml(str(number), tree, cloze_xmin, cloze_ymin, cloze_xmax, cloze_ymax)
+        region = {'xmin': cloze_xmin, 'ymin': cloze_ymin, 'xmax': cloze_xmax, 'ymax': cloze_ymax}
+
+        # cloze_list.append({'number': number, 'location': region, 'default_points': 5, 'class_name': 'cloze_row_col'})
+        cloze_list.append({'number': number, 'bounding_box': region, 'default_points': 5, 'class_name': 'cloze_row_col'})
+    tree.write(xml_path)
+    return cloze_list

+ 3 - 0
segment/sheet_resolve/analysis/correct/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py
+# @Time    : 2018/12/10 0010 上午 10:25

+ 244 - 0
segment/sheet_resolve/analysis/correct/coordinates_correct.py

@@ -0,0 +1,244 @@
+# @Author  : lightXu
+# @File    : coordinates_correct.py
+# @Time    : 2018/12/10 0010 上午 10:26
+import os
+import cv2
+import traceback
+import numpy as np
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate_in_google_format, tesseract_boxes_by_py
+from segment.sheet_resolve.tools.utils import read_single_img, write_single_img, crop_region, read_xml_to_json
+import glob2 as glob
+
+
+def transform(template_img_size, correcting_img, correcting_pts, template_pts):
+    pts1 = np.float32(correcting_pts)  # 原始坐标
+    pts2 = np.float32(template_pts)  # 目标坐标
+
+    M = cv2.getAffineTransform(pts1, pts2)
+    dst = cv2.warpAffine(correcting_img, M, template_img_size, borderValue=(0, 0, 255))
+    return dst
+
+
+def get_same_str(str1, str2):
+    str1_set = set(str1)
+    str2_set = set(str2)
+    intersection = str1_set & str2_set
+    if intersection:
+        len1 = len(str1)
+        len2 = len(str2)
+        if len2 == 1:
+            start_index = str1.index(str2)
+            return {'ismatch': True, 'coordinates': ((start_index, start_index + 1), (0, 1))}
+        else:
+            str_set = set(str1 + str2)
+            str_set_dict = {}
+            for i, ele in enumerate(sorted(list(str_set))):
+                str_set_dict[ele] = i + 1
+
+            str1_np = np.asarray([str_set_dict[k] for k in str1])
+            str2_np = np.asarray([str_set_dict[k] for k in str2])
+
+            np1 = np.tile(str1_np, (len2, 1))
+            np2 = np.tile(str2_np, (1, len1)).reshape(len1, len2).T
+
+            np3 = np1 - np2
+
+            size = np3.shape
+
+            np4 = np3.reshape(-1, 1)
+
+            np4_list = np4.tolist()
+            zero_list = list()
+            str_index = []
+            for i, ele in enumerate(np4):
+                if i in zero_list:
+                    continue
+                else:
+                    if ele == [0]:
+                        length = 0
+                        zero_list.append(i)
+                        for interval in range(1, size[1]):
+                            next_index = i + size[1] * interval + interval
+                            if next_index < len(np4_list):
+                                if np4_list[next_index] == [0]:
+                                    length += 1
+                                    zero_list.append(next_index)  # 跳过的循环
+                                if np4_list[next_index] != [0]:
+                                    break
+                        str_index.append((i, i + size[1] * length + length, length))
+                    else:
+                        pass
+
+            # print(str_index)
+            # print(max_index[0])
+            max_index = sorted(str_index, key=lambda k: k[2], reverse=True)
+            a = (max_index[0][0] // size[1], max_index[0][0] % size[1])
+            b = (max_index[0][1] // size[1], max_index[0][1] % size[1])
+
+            max_str1 = str1[a[1]:b[1] + 1]
+            max_str2 = str2[a[0]:b[0] + 1]
+            # print(max_str1, max_str2)
+
+            return {'ismatch': True, 'coordinates': ((a[1], b[1] + 1), (a[0], b[0] + 1))}
+
+    else:
+        return {'ismatch': False, 'coordinates': ()}
+
+
+def match_string(correcting_ocr, correcting_bias, template_ocr, template_bias):
+    correcting_words_list = correcting_ocr['chars']
+    template_words_list = template_ocr['chars']
+
+    max_same_str = ''
+    index_pair = {}
+
+    longer = ''.join(correcting_words_list)
+    shorter = ''.join(template_words_list)
+
+    res = get_same_str(longer, shorter)
+    if res['ismatch']:
+        c, t = res['coordinates']
+        if len(max_same_str) < c[1]-c[0]:
+            max_same_str = longer[c[0]:c[1]]
+            index_pair['correcting'] = (c[0], c[1])
+            index_pair['template'] = (t[0], t[1])
+
+    if len(index_pair) > 0:
+        correcting_coordinate = correcting_ocr['coordinates'][index_pair['correcting'][0]]  # xmin, ymin, xmax, ymax
+        c_x, c_y = correcting_bias[0] + correcting_coordinate[0], correcting_bias[1] + correcting_coordinate[1]  # 取左上角的坐标
+
+        template_coordinate = template_ocr['coordinates'][index_pair['template'][0]]
+        t_x, t_y = template_bias[0] + template_coordinate[0], template_bias[1] + template_coordinate[1]
+
+        print(max_same_str)
+        if abs(c_x-t_x) < 50 and abs(c_y-t_y) < 50:
+            return {'correcting': (c_x, c_y), 'template': (t_x, t_y)}
+        else:
+            return {}
+    else:
+        return {}
+
+
+def get_template(template_img, ocr_classes_dict, method='google'):
+    template_dict = {}
+
+    for ocr in ocr_classes_dict:
+        template = {}
+        class_name = ocr['class_name']
+        if 'solve' in class_name:
+            # ocr['region']['ymax'] = int(0.10 * (ocr['region']['ymax']-ocr['region']['ymin']) +ocr['region']['ymin'])
+            ocr['region']['ymax'] = int(250 + ocr['region']['ymin'])
+        ocr_box = ocr['region']
+        left, top = ocr_box['xmin'], ocr_box['ymin']
+
+        ocr_img = crop_region(template_img, ocr_box)
+        # cv2.imshow(class_name, ocr_img)
+        # if cv2.waitKey(0) == 27:
+        #     cv2.destroyAllWindows()
+
+        if method == 'baidu':
+            ocr_word = get_ocr_text_and_coordinate_in_google_format(ocr_img)  # baidu
+        else:
+            ocr_word = tesseract_boxes_by_py(ocr_img)  # tesseract
+
+        template['words_result'] = ocr_word
+        template['coordinate_bias'] = (left, top)
+
+        template_dict[class_name] = template
+    return template_dict
+
+
+def get_correct_points(points_list, size):
+    if len(points_list) > size >= 3:
+        points_list = sorted(points_list, key=lambda k: k[1])
+        i = 1
+        choice_index_list = [0]
+        for index, ele in enumerate(points_list):
+            if abs(ele[1] - points_list[0][1]) > 500:  # 找y轴差值大于500的坐标
+                choice_index_list.append(index)
+                i = i + 1
+                if i == size:
+                    break
+        return choice_index_list
+    elif len(points_list) == 3:
+        return [0, 1, 2]
+    else:
+        raise Exception
+
+
+def save_transformed_img(template_img_size, images_path_list, ocr_classes_dict, correcting_img_dir_path, corrected_img_save_dir, template_dict, method='google'):
+    for img_path in images_path_list:
+        correcting_img = read_single_img(img_path)
+        print('***********************************')
+        print(img_path)
+        correct_coordinates_list = []
+        template_coordinates_list = []
+        try:
+            for ocr in ocr_classes_dict:
+                class_name = ocr['class_name']
+                if 'solve' in class_name:
+                    ocr['region']['ymax'] = ocr['region']['ymax']
+                ocr_box = ocr['region']
+                left, top = ocr_box['xmin'], ocr_box['ymin']
+
+                correcting_oct_region = crop_region(correcting_img, ocr_box)
+                # cv2.imwrite(class_name+'.jpg', correcting_oct_region)
+                # cv2.imshow(class_name, correcting_oct_region)
+                # if cv2.waitKey(0) == 27:
+                #     cv2.destroyAllWindows()
+                if method == 'baidu':
+                    correcting_word = get_ocr_text_and_coordinate_in_google_format(correcting_oct_region)  # baidu
+                else:
+                    correcting_word = tesseract_boxes_by_py(correcting_oct_region)  # tesseract
+
+                template = template_dict[class_name]
+                template_word = template['words_result']
+                template_bias = template['coordinate_bias']
+
+                coordiantes_dict = match_string(correcting_word, (left, top), template_word, template_bias)
+                if len(coordiantes_dict) > 0:
+                    correct_coordinates_list.append(coordiantes_dict['correcting'])
+                    template_coordinates_list.append(coordiantes_dict['template'])
+                else:
+                    continue
+
+            choice_index_list = get_correct_points(correct_coordinates_list, 3)
+            c_coordinates = [correct_coordinates_list[ele] for ele in choice_index_list]
+            t_coordiantes = [template_coordinates_list[ele] for ele in choice_index_list]
+            dst = transform(template_img_size, correcting_img,
+                            c_coordinates, t_coordiantes)
+            print(c_coordinates, t_coordiantes)
+            save_path = img_path.replace(correcting_img_dir_path, corrected_img_save_dir)
+            write_single_img(dst, save_path)
+            print(save_path)
+        except Exception:
+            print('image corrected error')
+            traceback.print_exc()
+
+
+def correct(template_path, correcting_img_dir_path, corrected_img_save_dir, sheet_dict, method, sheet_sides='front'):
+    find_str = os.path.join(correcting_img_dir_path, '*.jpg')
+    correcting_img_path_list = glob.glob(find_str)
+    # correcting_img_path_list = [r'C:\Users\Administrator\Desktop\sheet\correct\back_sizes\20180719004308818_0030.jpg']
+    template_img = read_single_img(template_path)
+    y, x = template_img.shape[0], template_img.shape[1]
+    ocr_classes_dict = []
+    # ocr_class = ['info_title', 'page']
+    ocr_class = {'front': ['info_title', 'page'], 'back': ['solve', 'solve0', 'page']}
+    page_index = 1
+    for ele in sheet_dict['regions']:
+        if ele['class_name'] in ocr_class[sheet_sides]:
+            ocr_classes_dict.append({'class_name': '{}_{}'.format(ele['class_name'], str(page_index)), 'region': ele['bounding_box']})
+            page_index += 1
+
+    template_info_dict = get_template(template_img, ocr_classes_dict, method)
+    save_transformed_img((x, y), correcting_img_path_list, ocr_classes_dict, correcting_img_dir_path, corrected_img_save_dir, template_info_dict, method)
+
+
+if __name__ == '__main__':
+    # Manual run with hard-coded local Windows paths: corrects the back side
+    # of a sample sheet set using the 'baidu' OCR method.
+    template_path0 = r'C:\Users\Administrator\Desktop\sheet\correct\back_sizes\template\20180719004308818_0020.jpg'
+    img_dir_path = r'C:\Users\Administrator\Desktop\sheet\correct\back_sizes'
+    img_save_dir = r'C:\Users\Administrator\Desktop\sheet\correct\back_sizes\corrected'
+    # The region annotation is expected next to the template image (.xml).
+    xml_path = template_path0.replace('.jpg', '.xml')
+    sheet_dict0 = read_xml_to_json(xml_path)
+    correct(template_path0, img_dir_path, img_save_dir, sheet_dict0, sheet_sides='back', method='baidu')

+ 479 - 0
segment/sheet_resolve/analysis/correct/coordinates_correct_pyinstaller.py

@@ -0,0 +1,479 @@
+# @Author  : lightXu
+# @File    : coordinates_correct_pyinstaller.py
+# @Time    : 2018/12/10 0010 上午 10:26
+import os
+import argparse
+import cv2
+import traceback
+import numpy as np
+import glob2 as glob
+import xml.etree.cElementTree as ET
+import requests
+import base64
+from urllib import parse, request
+
+
+# NOTE(review): a token and client credentials are committed in source below;
+# presumably they should come from configuration or the environment — confirm
+# and rotate them.
+# Static token used by get_ocr_text_and_coordinate_in_google_format();
+# get_ocr_raw_result() requests a fresh one through login() instead.
+access_token = '24.214174608e47e6047f31c3fd8c3cedef.2592000.1548390126.282335-14614857'
+# access_token = ocr_login()
+# Base URL of the OCR service; the accuracy name is appended to it.
+OCR_BOX_URL = 'https://aip.baidubce.com/rest/2.0/ocr/v1/'
+# Same value as OCR_BOX_URL.
+OCR_URL = 'https://aip.baidubce.com/rest/2.0/ocr/v1/'
+# OCR_ACCURACY = 'general'
+# Default endpoint name used as `ocr_accuracy` by the OCR helpers.
+OCR_ACCURACY = 'accurate'
+# OAuth client credentials read by login().
+OCR_CLIENT_ID = 'AVH7VGKG8QxoSotp6wG9LyZq'
+OCR_CLIENT_SECRET = 'gG7VYvBWLU8Rusnin8cS8Ta4dOckGFl6'
+# NOTE(review): unit and use not visible here — presumably a refresh interval; confirm.
+OCR_TOKEN_UPDATE_DATE = 10
+
+
+def login():
+    grant_type = 'client_credentials'
+    client_id = OCR_CLIENT_ID
+    client_secret = OCR_CLIENT_SECRET
+
+    textmod = {'grant_type': grant_type, 'client_id': client_id, 'client_secret': client_secret}
+    textmod = parse.urlencode(textmod)
+
+    # 输出内容:user=admin&password=admin
+    header_dict = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}
+    url = 'https://aip.baidubce.com/oauth/2.0/token'
+    req = request.Request(url='{}{}{}'.format(url, '?', textmod), headers=header_dict)
+    res = request.urlopen(req).read()
+    token = eval(res.decode(encoding='utf-8'))['access_token']
+    return token
+
+
+def opencv2base64(img):
+    image = cv2.imencode('.jpg', img)[1]
+    base64_data = str(base64.b64encode(image))[2:-1]
+    return base64_data
+
+
+def get_ocr_raw_result(img, ocr_accuracy=OCR_ACCURACY, language_type='CHN_ENG'):
+    textmod = {'access_token': login()}
+    textmod = parse.urlencode(textmod)
+    url = '{}{}{}{}'.format(OCR_BOX_URL, ocr_accuracy, '?', textmod)
+    url_general = '{}{}{}{}'.format(OCR_BOX_URL, 'general', '?', textmod)
+
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    image_type = 'base64'
+    group_id = 'group001'
+    user_id = 'usr001'
+
+    image = opencv2base64(img)
+
+    data = {
+        'image_type': image_type,
+        'group_id': group_id,
+        'user_id': user_id,
+        'image': image,
+        'detect_direction': 'true',
+        'recognize_granularity': 'small',
+        'language_type': language_type,
+        # 'vertexes_location': 'true',
+        # 'probability': 'true'
+    }
+
+    resp = requests.post(url, data=data, headers=headers).json()
+    if resp.get('error_msg'):
+        if 'internal error' in resp.get('error_msg'):
+            resp = requests.post(url_general, data=data, headers=headers).json()
+            if resp.get('error_msg'):
+                raise Exception("ocr {}!".format(resp.get('error_msg')))
+        else:
+            raise Exception("ocr {}!".format(resp.get('error_msg')))
+
+    return resp
+
+
+def get_ocr_text_and_coordinate_in_google_format(img, ocr_accuracy=OCR_ACCURACY, language_type='CHN_ENG'):
+    textmod = {'access_token': access_token}
+    textmod = parse.urlencode(textmod)
+    url = '{}{}{}{}'.format(OCR_BOX_URL, ocr_accuracy, '?', textmod)
+    url_general = '{}{}{}{}'.format(OCR_BOX_URL, 'general', '?', textmod)
+
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    image_type = 'base64'
+    group_id = 'group001'
+    user_id = 'usr001'
+
+    image = opencv2base64(img)
+
+    data = {
+        'image_type': image_type,
+        'group_id': group_id,
+        'user_id': user_id,
+        'image': image,
+        'detect_direction': 'true',
+        'recognize_granularity': 'small',
+        'language_type': language_type,
+        # 'vertexes_location': 'true',
+        # 'probability': 'true'
+    }
+
+    resp = requests.post(url, data=data, headers=headers).json()
+    if resp.get('error_msg'):
+        if 'internal error' in resp.get('error_msg'):
+            resp = requests.post(url_general, data=data, headers=headers).json()
+            if resp.get('error_msg'):
+                raise Exception("ocr {}!".format(resp.get('error_msg')))
+        else:
+            raise Exception("ocr {}!".format(resp.get('error_msg')))
+
+    words_result = resp.get('words_result')
+    dict_list = [item2.get('location') for item in words_result for item2 in item['chars']]
+    char_list = [item2.get('char') for item in words_result for item2 in item['chars']]
+    matrix = []
+    for ele in dict_list:
+        xmin = ele['left']
+        ymin = ele['top']
+        xmax = ele['width'] + ele['left']
+        ymax = ele['top'] + ele['height']
+        item0 = (xmin, ymin, xmax, ymax)
+        matrix.append(item0)
+
+    res_dict = {'chars': char_list, 'coordinates': matrix}
+    return res_dict
+
+
+def read_xml_to_json(xml_path):
+    tree = ET.parse(xml_path)
+    root = tree.getroot()
+    regions_list = []
+    for obj in root.findall('object'):
+        class_name = obj.find('name').text
+        bbox = obj.find('bndbox')
+        xmin = int(bbox.find('xmin').text)
+        ymin = int(bbox.find('ymin').text)
+        xmax = int(bbox.find('xmax').text)
+        ymax = int(bbox.find('ymax').text)
+        bbox_dict = {'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax}
+        region = {'class_name': class_name, 'bounding_box': bbox_dict}
+        regions_list.append(region)
+
+    serial = '201812260000001'
+    sheet_dict = {'series_number': serial, 'regions': regions_list}
+    return sheet_dict
+
+
+def crop_region(im, bbox):
+    xmin = int(bbox['xmin'])
+    ymin = int(bbox['ymin'])
+    xmax = int(bbox['xmax'])
+    ymax = int(bbox['ymax'])
+
+    region = im[ymin:ymax, xmin:xmax]
+    return region
+
+
+def write_single_img(dst, save_path):
+    try:
+        cv2.imencode('.jpg', dst)[1].tofile(save_path)
+    except FileNotFoundError as e:
+        raise e
+
+
+def read_single_img(img_path):
+    try:
+        im = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), -1)
+    except FileNotFoundError as e:
+        raise e
+    return im
+
+
+def transform(template_img_size, correcting_img, correcting_pts, template_pts):
+    pts1 = np.float32(correcting_pts)  # 原始坐标
+    pts2 = np.float32(template_pts)  # 目标坐标
+
+    mtx = cv2.getAffineTransform(pts1, pts2)
+    dst = cv2.warpAffine(correcting_img, mtx, template_img_size, borderValue=(0, 0, 255))
+    return dst
+
+
+def get_same_str(str1, str2):
+    str1_set = set(str1)
+    str2_set = set(str2)
+    intersection = str1_set & str2_set
+    if intersection:
+        len1 = len(str1)
+        len2 = len(str2)
+        if len2 == 1:
+            start_index = str1.index(str2)
+            return {'ismatch': True, 'coordinates': ((start_index, start_index + 1), (0, 1))}
+        else:
+            str_set = set(str1 + str2)
+            str_set_dict = {}
+            for i, ele in enumerate(sorted(list(str_set))):
+                str_set_dict[ele] = i + 1
+
+            str1_np = np.asarray([str_set_dict[k] for k in str1])
+            str2_np = np.asarray([str_set_dict[k] for k in str2])
+
+            np1 = np.tile(str1_np, (len2, 1))
+            np2 = np.tile(str2_np, (1, len1)).reshape(len1, len2).T
+
+            np3 = np1 - np2
+
+            size = np3.shape
+
+            np4 = np3.reshape(-1, 1)
+
+            np4_list = np4.tolist()
+            zero_list = list()
+            str_index = []
+            for i, ele in enumerate(np4):
+                if i in zero_list:
+                    continue
+                else:
+                    if ele == [0]:
+                        length = 0
+                        zero_list.append(i)
+                        for interval in range(1, size[1]):
+                            next_index = i + size[1] * interval + interval
+                            if next_index < len(np4_list):
+                                if np4_list[next_index] == [0]:
+                                    length += 1
+                                    zero_list.append(next_index)  # 跳过的循环
+                                if np4_list[next_index] != [0]:
+                                    break
+                        str_index.append((i, i + size[1] * length + length, length))
+                    else:
+                        pass
+
+            # print(str_index)
+            # print(max_index[0])
+            max_index = sorted(str_index, key=lambda k: k[2], reverse=True)
+            a = (max_index[0][0] // size[1], max_index[0][0] % size[1])
+            b = (max_index[0][1] // size[1], max_index[0][1] % size[1])
+
+            # max_str1 = str1[a[1]:b[1] + 1]
+            # max_str2 = str2[a[0]:b[0] + 1]
+            # print(max_str1, max_str2)
+
+            return {'ismatch': True, 'coordinates': ((a[1], b[1] + 1), (a[0], b[0] + 1))}
+
+    else:
+        return {'ismatch': False, 'coordinates': ()}
+
+
+def match_string(correcting_ocr, correcting_bias, template_ocr, template_bias):
+    correcting_words_list = correcting_ocr['chars']
+    template_words_list = template_ocr['chars']
+
+    max_same_str = ''
+    index_pair = {}
+
+    longer = ''.join(correcting_words_list)
+    shorter = ''.join(template_words_list)
+
+    res = get_same_str(longer, shorter)
+    if res['ismatch']:
+        c, t = res['coordinates']
+        if len(max_same_str) < c[1]-c[0]:
+            max_same_str = longer[c[0]:c[1]]
+            index_pair['correcting'] = (c[0], c[1])
+            index_pair['template'] = (t[0], t[1])
+
+    if len(index_pair) > 0:
+        correcting_coordinate = correcting_ocr['coordinates'][index_pair['correcting'][0]]  # xmin, ymin, xmax, ymax
+        c_x = correcting_bias[0] + correcting_coordinate[0]
+        c_y = correcting_bias[1] + correcting_coordinate[1]
+        template_coordinate = template_ocr['coordinates'][index_pair['template'][0]]
+        t_x, t_y = template_bias[0] + template_coordinate[0], template_bias[1] + template_coordinate[1]
+
+        print(max_same_str)
+        if abs(c_x-t_x) < 50 and abs(c_y-t_y) < 50:
+            return {'correcting': (c_x, c_y), 'template': (t_x, t_y)}
+        else:
+            return {}
+    else:
+        return {}
+
+
+def get_template(template_img, ocr_classes_dict):
+    """Run OCR on every anchor region of the template image.
+
+    For entries whose class name contains ``'solve'`` the region is shortened
+    in place so that ``ymax`` becomes ``ymin + 250``; the change stays visible
+    to later users of ``ocr_classes_dict``.
+
+    :param template_img: template image to crop from.
+    :param ocr_classes_dict: iterable of dicts with ``'class_name'`` and
+        ``'region'`` (a box with xmin/ymin/xmax/ymax keys).
+    :return: dict mapping each class name to
+        ``{'words_result': <ocr result>, 'coordinate_bias': (xmin, ymin)}``.
+    """
+    templates = {}
+
+    for entry in ocr_classes_dict:
+        name = entry['class_name']
+        if 'solve' in name:
+            # Keep only the top 250 rows of a solve region.
+            entry['region']['ymax'] = int(250 + entry['region']['ymin'])
+        region = entry['region']
+        bias = (region['xmin'], region['ymin'])
+
+        cropped = crop_region(template_img, region)
+        words = get_ocr_text_and_coordinate_in_google_format(cropped)  # baidu
+
+        templates[name] = {'words_result': words, 'coordinate_bias': bias}
+    return templates
+
+
+def get_correct_points(points_list, size):
+    """Pick the indexes of up to ``size`` points that are well separated in y.
+
+    The point with the smallest y is always chosen; further points are taken
+    in ascending-y order when their y differs from it by more than 500.
+
+    The returned values index the ORIGINAL ``points_list``. The previous
+    implementation returned positions inside a sorted copy, which callers then
+    applied to the unsorted lists and so selected the wrong points.
+
+    :param points_list: sequence of (x, y) points.
+    :param size: number of points wanted (must be >= 3 for the search branch).
+    :return: list of indexes into ``points_list``; it may hold fewer than
+        ``size`` entries when not enough points are far enough apart.
+    :raises ValueError: when there are too few points to choose from.
+    """
+    if len(points_list) > size >= 3:
+        # Sort the indexes rather than the points so that the result can be
+        # applied to the caller's lists directly. sorted() is stable.
+        order = sorted(range(len(points_list)), key=lambda idx: points_list[idx][1])
+        base_y = points_list[order[0]][1]
+        choice_index_list = [order[0]]
+        for idx in order:
+            if abs(points_list[idx][1] - base_y) > 500:  # y gap larger than 500
+                choice_index_list.append(idx)
+                if len(choice_index_list) == size:
+                    break
+        return choice_index_list
+    elif len(points_list) == 3:
+        return [0, 1, 2]
+    else:
+        raise ValueError('not enough matched points: got {}, need at least 3'.format(len(points_list)))
+
+
+def save_transformed_img(template_img_size, images_path_list, ocr_classes_dict,
+                         correcting_img_dir_path, corrected_img_save_dir, template_dict):
+    """Align every scanned image to the template and save the result.
+
+    For each image, every anchor region is cropped and OCR'd, matched against
+    the template's OCR result with ``match_string``, and the matched point
+    pairs are passed to ``transform``. The output path is the input path with
+    ``correcting_img_dir_path`` replaced by ``corrected_img_save_dir``.
+
+    Any failure for one image (including reading it) is printed with its
+    traceback and the loop moves on to the next image.
+
+    :param template_img_size: size passed through to ``transform``.
+    :param images_path_list: paths of the images to correct.
+    :param ocr_classes_dict: anchor regions (``'class_name'`` / ``'region'``).
+    :param correcting_img_dir_path: directory prefix of the input paths.
+    :param corrected_img_save_dir: directory prefix for the saved images.
+    :param template_dict: per-class template data as built by ``get_template``.
+    """
+    for img_path in images_path_list:
+        print('***********************************')
+        print(img_path)
+        correct_coordinates_list = []
+        template_coordinates_list = []
+        try:
+            # Read inside the try block so one unreadable file cannot abort
+            # the whole batch.
+            correcting_img = read_single_img(img_path)
+            for ocr in ocr_classes_dict:
+                class_name = ocr['class_name']
+                ocr_box = ocr['region']
+                left, top = ocr_box['xmin'], ocr_box['ymin']
+
+                correcting_ocr_region = crop_region(correcting_img, ocr_box)
+                correcting_word = get_ocr_text_and_coordinate_in_google_format(correcting_ocr_region)  # baidu
+                template = template_dict[class_name]
+                template_word = template['words_result']
+                template_bias = template['coordinate_bias']
+
+                coordinates_dict = match_string(correcting_word, (left, top), template_word, template_bias)
+                if len(coordinates_dict) > 0:
+                    correct_coordinates_list.append(coordinates_dict['correcting'])
+                    template_coordinates_list.append(coordinates_dict['template'])
+
+            choice_index_list = get_correct_points(correct_coordinates_list, 3)
+            c_coordinates = [correct_coordinates_list[ele] for ele in choice_index_list]
+            t_coordinates = [template_coordinates_list[ele] for ele in choice_index_list]
+            dst = transform(template_img_size, correcting_img,
+                            c_coordinates, t_coordinates)
+            print(c_coordinates, t_coordinates)
+            save_path = img_path.replace(correcting_img_dir_path, corrected_img_save_dir)
+            write_single_img(dst, save_path)
+            print(save_path)
+        except Exception as e:
+            print('image corrected error: {}'.format(e))
+            traceback.print_exc()
+
+
+def correct(template_path, correcting_img_dir_path, corrected_img_save_dir, sheet_dict, sheet_sides='front'):
+    """Correct every ``*.jpg`` in a directory against one template image.
+
+    Regions of ``sheet_dict['regions']`` whose class belongs to the selected
+    sheet side become OCR anchors, named ``<class_name>_<n>`` with ``n``
+    counting the selected regions from 1.
+
+    :param template_path: path of the template image.
+    :param correcting_img_dir_path: directory holding the images to correct.
+    :param corrected_img_save_dir: directory receiving the corrected images.
+    :param sheet_dict: parsed sheet description with a ``'regions'`` list.
+    :param sheet_sides: ``'front'`` or ``'back'``; selects the anchor classes.
+    """
+    pattern = os.path.join(correcting_img_dir_path, '*.jpg')
+    correcting_img_path_list = glob.glob(pattern)
+    template_img = read_single_img(template_path)
+    img_height, img_width = template_img.shape[0], template_img.shape[1]
+
+    ocr_class = {'front': ['info_title', 'page'], 'back': ['solve', 'solve0', 'page']}
+    anchors = [region for region in sheet_dict['regions']
+               if region['class_name'] in ocr_class[sheet_sides]]
+    ocr_classes_dict = [{'class_name': '{}_{}'.format(region['class_name'], str(number)),
+                         'region': region['bounding_box']}
+                        for number, region in enumerate(anchors, 1)]
+
+    template_info_dict = get_template(template_img, ocr_classes_dict)
+    save_transformed_img((img_width, img_height), correcting_img_path_list, ocr_classes_dict,
+                         correcting_img_dir_path, corrected_img_save_dir, template_info_dict)
+
+
+def rotate(image, angle, center=None, scale=1.0):
+    """Rotate ``image`` by ``angle`` around ``center``, keeping the canvas size.
+
+    :param image: array whose first two dimensions are height and width.
+    :param angle: rotation angle passed to ``cv2.getRotationMatrix2D``.
+    :param center: rotation centre; defaults to the integer image centre.
+    :param scale: scale factor passed to ``cv2.getRotationMatrix2D``.
+    :return: the warped image, same width and height as the input.
+    """
+    height, width = image.shape[:2]
+    pivot = (width // 2, height // 2) if center is None else center
+    matrix = cv2.getRotationMatrix2D(pivot, angle, scale)
+    return cv2.warpAffine(image, matrix, (width, height))
+
+
+def image_direction(image_raw, standard_direction):
+    """Bring a scanned page into the standard orientation.
+
+    The image is first rotated by 90 degrees when its portrait/landscape shape
+    does not match ``standard_direction``. The top and bottom 10% of the
+    result are then stacked and sent to OCR, whose ``'direction'`` field
+    decides whether the page is upside down.
+
+    Fixes over the previous version: direction 2 no longer falls through to
+    the ``else`` branch and raises; the 180 degree correction flips both axes
+    instead of mirroring horizontally; the crop uses the height of the
+    oriented image rather than the raw one; the result is returned.
+
+    :param image_raw: image array (height, width, ...).
+    :param standard_direction: ``'V'`` for portrait or ``'H'`` for landscape.
+    :return: the image in the standard orientation.
+    :raises ValueError: for an unknown ``standard_direction`` or an OCR
+        direction other than 0 or 2.
+    """
+    if standard_direction not in ('V', 'H'):
+        raise ValueError("standard_direction={} is not supported!".format(standard_direction))
+
+    # The top and bottom strips of the page are used for OCR orientation detection.
+    height, width = image_raw.shape[0], image_raw.shape[1]
+    is_portrait = height > width
+    if (standard_direction == 'V') == is_portrait:
+        image = image_raw
+    else:
+        image = np.rot90(image_raw)
+
+    crop_ratio = 0.1
+    oriented_height = image.shape[0]  # differs from `height` after rot90
+    crop_height = int(crop_ratio * oriented_height)
+    top_part = image[:crop_height, :]
+    bottom_part = image[oriented_height - crop_height:oriented_height, :]
+    ocr_used_image = np.vstack([top_part, bottom_part])
+    # OCR direction codes:
+    # - -1: undefined,
+    # - 0: upright,
+    # - 1: rotated 90 degrees counter-clockwise,
+    # - 2: rotated 180 degrees counter-clockwise,
+    # - 3: rotated 270 degrees counter-clockwise
+    direction = get_ocr_raw_result(ocr_used_image)['direction']
+    if direction == 2:
+        # flip code: 1 horizontal, 0 vertical, -1 both axes (= 180 degree rotation)
+        fliped_image = cv2.flip(image, -1)
+    elif direction == 0:
+        fliped_image = image
+    else:
+        raise ValueError("direction={} is not supported!".format(direction))
+
+    # TODO: determine which sheet of the exam set this is (recognise the page number?)
+    return fliped_image
+
+
+def run():
+    """Command-line entry point: correct a folder of images against a template.
+
+    The sheet description is read from the XML file that sits next to the
+    template image (same path, ``.xml`` extension).
+
+    ``--template``, ``--raw`` and ``--save`` are mandatory; previously a
+    missing option surfaced later as an ``AttributeError`` on ``None``.
+    ``--page`` defaults to ``'front'``, matching the default of ``correct``.
+    """
+    parser = argparse.ArgumentParser(description="your script description")  # --help
+    parser.add_argument('--template', '-t', required=True, help='reviewed template file path')
+    parser.add_argument('--page', '-p', choices=['front', 'back'], default='front',
+                        help='front sizes or back sizes')
+    parser.add_argument('--raw', '-r', required=True, help='raw images folder path')
+    parser.add_argument('--save', '-s', required=True, help='corrected images save folder path')
+
+    args = parser.parse_args()
+
+    template = args.template
+    raw_dir = args.raw
+    page = args.page
+    save_dir = args.save
+
+    # Swap only the file extension; str.replace would also rewrite a '.jpg'
+    # occurring elsewhere in the path and ignore other extensions or casing.
+    xml_path = os.path.splitext(template)[0] + '.xml'
+    sheet_dict0 = read_xml_to_json(xml_path)
+    correct(template, raw_dir, save_dir, sheet_dict0, sheet_sides=page)
+
+
+# Script entry point: parse the command line and run the correction.
+# The commented lines below are a hard-coded invocation kept for local debugging.
+if __name__ == '__main__':
+    run()
+    # template_path0 = r'C:\Users\Administrator\Desktop\sheet\correct\back_sizes\template\20180719004308818_0020.jpg'
+    # img_dir_path = r'C:\Users\Administrator\Desktop\sheet\correct\back_sizes'
+    # img_save_dir = r'C:\Users\Administrator\Desktop\sheet\correct\back_sizes\corrected'
+    # xml_path = template_path0.replace('.jpg', '.xml')
+    # print('hello', xml_path)
+    # sheet_dict0 = read_xml_to_json(xml_path)
+    # correct(template_path0, img_dir_path, img_save_dir, sheet_dict0, sheet_sides='back')

+ 5 - 0
segment/sheet_resolve/analysis/correct/run.bat

@@ -0,0 +1,5 @@
+python coordinates_correct_pyinstaller.py ^
+--template=C:\Users\Administrator\Desktop\sheet\correct\back_sizes\template\20180719004308818_0020.jpg ^
+--page=back ^
+--raw=C:\Users\Administrator\Desktop\sheet\correct\back_sizes ^
+--save=C:\Users\Administrator\Desktop\sheet\correct\back_sizes\corrected

+ 3 - 0
segment/sheet_resolve/analysis/exam_number/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py.py
+# @Time    : 2018/11/21 0021 下午 16:01

+ 239 - 0
segment/sheet_resolve/analysis/exam_number/exam_number_box.py

@@ -0,0 +1,239 @@
+# @Author  : lightXu
+# @File    : exam_number_box.py
+# @Time    : 2018/11/22 0022 下午 15:59
+import cv2
+import numpy as np
+import xml.etree.cElementTree as ET
+from segment.sheet_resolve.tools import utils
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate, get_ocr_text_and_coordinate_direction
+import re
+
+
+def preprocess(img, xe, ye):
+    """Binarise a BGR image and dilate the result.
+
+    Steps: optional rescale (disabled, factor 0), grayscale conversion,
+    inverted Otsu threshold, one dilation with a ``ye`` x ``xe`` kernel of ones.
+
+    :param img: BGR image.
+    :param xe: kernel extent along x.
+    :param ye: kernel extent along y.
+    :return: the dilated binary image.
+    """
+    rescale_factor = 0  # 0 switches the rescale step off
+    if rescale_factor != 0:
+        img = cv2.resize(img, None, fx=rescale_factor, fy=rescale_factor, interpolation=cv2.INTER_CUBIC)
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Inverted Otsu threshold.
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+    # Kernel shape is (rows, cols) = (y extent, x extent).
+    structuring_element = np.ones((ye, xe), np.uint8)
+    return cv2.dilate(binary, structuring_element, iterations=1)
+
+
+def contours(image):
+    """Return the bounding boxes of the external contours of ``image``.
+
+    :param image: single-channel image accepted by ``cv2.findContours``.
+    :return: list of ``(xmin, ymin, xmax, ymax)`` tuples, in the reverse of
+        the order in which OpenCV reports the contours.
+    """
+    # findContours returns (image, contours, hierarchy) in OpenCV 3 but
+    # (contours, hierarchy) in OpenCV 2 and 4; the contour list is the
+    # second-to-last item in every version.
+    cnts = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
+
+    bboxes = []
+    for cnt in reversed(cnts):
+        x, y, w, h = cv2.boundingRect(cnt)
+        bboxes.append((x, y, x + w, y + h))
+
+    return bboxes
+
+
+def box_coordinates(img):
+    """Split an exam-number image into rows of boxes and guess its direction.
+
+    Row and column boundaries are found from projection sums along each axis.
+    Inside every row taller than 3 pixels, column spans wider than 3 pixels
+    become boxes, which are then merged by ``utils.box_by_x_intervel`` using
+    the mean horizontal gap as threshold.
+
+    :param img: array-like image; presumably the 0/255 binary output of
+        ``preprocess`` - TODO confirm against callers.
+    :return: ``(all_coordinates, direction)`` where ``all_coordinates`` is a
+        list of ``{'row': '<n>', 'coordinates': <merged boxes>}`` and
+        ``direction`` is 180 or 90 (180 when OCR returns no words).
+    """
+    img_arr = np.asarray(img)
+
+    def axix_break_point(img, tolerance_number, axis):
+        """Return the indexes where the projection along ``axis`` switches
+        between empty and non-empty, as alternating start/end positions."""
+        sum_x_axis = img.sum(axis=axis)
+        sum_x_axis[sum_x_axis > 255 * tolerance_number] = 1  # white: contains text
+        sum_x_axis[sum_x_axis != 1] = 0  # black: no text
+        sum_x_axis_list = list(sum_x_axis)
+        sum_x_axis_list.append(0)  # trailing 0 closes a run that reaches the last row/column
+
+        split_x_index = []
+        # num alternates 1, 0, 1, ...: look for the next start (1), then the next end (0)
+        num = 1
+        for index, ele in enumerate(sum_x_axis_list):
+            num = num % 2
+            if ele == num:
+                # print(i)
+                num = num + 1
+                split_x_index.append(index)
+        # print('length: ', len(split_x_index), split_x_index)
+        return split_x_index
+
+    y_break_points_list = axix_break_point(img_arr, 1, axis=1)  # group along the y axis
+    # Everything above the end of the first text row
+    img_arr_upper = img_arr[:y_break_points_list[1], :]
+
+    # cv2.imshow('img_arr_upper', img_arr_upper)
+    # if cv2.waitKey(0) == 27:
+    #     cv2.destroyAllWindows()
+
+    x_break_points_list = axix_break_point(img_arr_upper, 1, axis=0)
+    # At most two column spans in the first row is treated as a handwritten area
+    if len(x_break_points_list) <= 4:
+        hand_writing = True
+    else:
+        hand_writing = False
+
+    img_arr_for_x = img_arr
+    ocr_region = img_arr_upper
+    if hand_writing:  # a handwritten exam-number area exists
+        # Skip the first row: OCR the second row and take columns from below it
+        ocr_region = img_arr[y_break_points_list[2]:y_break_points_list[3], :]
+        y_break_points_list = y_break_points_list[2:]
+        img_arr_for_x = img_arr[y_break_points_list[1]:, :]
+    x_break_points_list = axix_break_point(img_arr_for_x, 1, axis=0)
+
+    all_coordinates = []
+    row_number = 0
+    for i in range(0, len(y_break_points_list), 2):  # group along the y axis
+        ymin = y_break_points_list[i]
+        ymax = y_break_points_list[i + 1]
+        # Placeholder first row so that vstack has something to stack onto
+        matrix = np.array([0, 0, 0, 0])
+        if ymax-ymin > 3:  # filter out noise
+            for j in range(0, len(x_break_points_list), 2):
+                xmin = x_break_points_list[j]
+                xmax = x_break_points_list[j + 1]
+                if xmax - xmin > 3:
+                    matrix = np.vstack([matrix, np.array([xmin, ymin, xmax, ymax])])
+
+            matrix = matrix[1:, :]  # drop the placeholder row
+            dif = matrix[1:, 0] - matrix[:-1, 2]  # gap between a box's left edge and the previous box's right edge
+            dif[dif < 0] = 0
+            dif_length = np.mean(dif)  # boxes closer than the mean gap get merged
+            block_list = utils.box_by_x_intervel(matrix, dif_length)
+
+            row = {'row': '{}'.format(row_number), 'coordinates': block_list}
+            all_coordinates.append(row)
+            row_number += 1
+
+    # Recognise the text to decide the direction
+    try:
+        word_result_list, _ = get_ocr_text_and_coordinate_direction(ocr_region)
+    except Exception:
+        # Fall back to the larger region when OCR on the single row fails
+        word_result_list, _ = get_ocr_text_and_coordinate_direction(img_arr_for_x)
+
+    direction = 180
+    if len(word_result_list) > 0:
+        all_char_list = []
+        digital_model = re.compile(r'\d')
+        for i, chars_dict in enumerate(word_result_list):
+            chars_list = chars_dict['chars']
+            for ele in chars_list:
+                if digital_model.search(ele['char']):
+                    all_char_list.append(int(ele['char']))
+
+        # 45 is the sum of the digits 0..9; a digit sum below half of it gives 180, otherwise 90
+        if sum(all_char_list) < 45//2:
+            direction = 180
+        else:
+            direction = 90
+    return all_coordinates, direction
+
+
+def exam_number(left, top, image, xml_path):
+    """Detect the individual exam-number boxes and record them in the XML file.
+
+    Boxes with an area above 400 are kept, numbered ``<index>_<row>`` with a
+    two-digit index counting the kept boxes of the row, shifted by
+    ``(left, top)`` and added to the XML tree, which is written back to
+    ``xml_path``.
+
+    :param left: x offset added to every box.
+    :param top: y offset added to every box.
+    :param image: BGR image of the exam-number area.
+    :param xml_path: XML file that is parsed, extended and overwritten.
+    :return: list of ``{'number': <name>, 'region': [xmin, ymin, xmax, ymax]}``.
+    """
+    binary = preprocess(image, 3, 3)
+    rows, _ = box_coordinates(binary)
+
+    exam_bbox_list = []
+    tree = ET.parse(xml_path)  # xml tree
+    for row in rows:
+        row_number = row['row']
+        kept = 0
+        for box in row['coordinates']:
+            box_width = box[2] - box[0]
+            box_height = box[3] - box[1]
+            if box_width * box_height <= 400:
+                continue
+            number = '{:02d}_{}'.format(kept, row_number)
+            region = [box[0]+left, box[1]+top, box[2]+left, box[3]+top]
+            tree = utils.create_xml(number, tree, region[0], region[1], region[2], region[3])
+            exam_bbox_list.append({'number': number, 'region': region})
+            kept += 1
+    tree.write(xml_path)
+    return exam_bbox_list
+
+
+def exam_number_column(left, top, image, xml_path):
+    """Describe every exam-number column and record it in the XML file.
+
+    Column ``i`` is the union of the ``i``-th box of every detected row. Its
+    bounding box, shifted by ``(left, top)``, is added to the XML tree under
+    the name ``str(i)``.
+
+    Changes over the previous version: the tree is now written back to
+    ``xml_path`` (it used to be built and then discarded, unlike in
+    ``exam_number`` and ``exam_number_whole``), and the coordinates are plain
+    ``int`` values as in ``exam_number_whole`` rather than numpy integers.
+
+    :param left: x offset added to every column box.
+    :param top: y offset added to every column box.
+    :param image: BGR image of the exam-number area.
+    :param xml_path: XML file that is parsed, extended and overwritten.
+    :return: list with one dict per column (``number``, ``location``,
+        ``single_height``, ``single_width``, ``choice_option``, ``row``,
+        ``column``).
+    """
+    img = preprocess(image, 3, 3)
+
+    box_list, _ = box_coordinates(img)
+
+    # The first row defines how many columns are expected.
+    column_number = len(box_list[0]['coordinates'])
+
+    tree = ET.parse(xml_path)  # xml tree
+    column_list = []
+    for i in range(0, column_number):
+        # One (xmin, ymin, xmax, ymax) line per row for this column.
+        combine = np.vstack([np.array(coord['coordinates'][i]) for coord in box_list])
+
+        min_temp = np.min(combine, axis=0)
+        max_temp = np.max(combine, axis=0)
+        column_coordinate = {'xmin': int(min_temp[0] + left), 'ymin': int(min_temp[1] + top),
+                             'xmax': int(max_temp[2] + left), 'ymax': int(max_temp[3] + top)}
+        single_height = np.mean(combine[:, 3]-combine[:, 1])
+        single_width = np.mean(combine[:, 2]-combine[:, 0])
+
+        column_dict = {'number': i, 'location': column_coordinate,
+                       'single_height': int(single_height),
+                       'single_width': int(single_width),
+                       "choice_option": "0,1,2,3,4,5,6,7,8,9",
+                       'row': 10, 'column': 1}
+        column_list.append(column_dict)
+        tree = utils.create_xml(str(i), tree,
+                                column_coordinate['xmin'], column_coordinate['ymin'],
+                                column_coordinate['xmax'], column_coordinate['ymax'])
+
+    tree.write(xml_path)
+    return column_list
+
+
+def exam_number_whole(left, top, image, xml_path):
+    """Describe the whole exam-number grid and record it in the XML file.
+
+    All detected boxes are flattened into one (rows * columns, 4) array; the
+    overall bounding box, shifted by ``(left, top)``, is added to the XML tree
+    as ``'exam_number'`` and the tree is written back to ``xml_path``.
+
+    :param left: x offset added to the bounding box.
+    :param top: y offset added to the bounding box.
+    :param image: BGR image of the exam-number area.
+    :param xml_path: XML file that is parsed, extended and overwritten.
+    :return: dict with ``location``, ``single_height``, ``single_width``,
+        ``choice_option``, ``row``, ``column`` and ``direction``.
+    """
+    binary = preprocess(image, 3, 3)
+    rows, direction = box_coordinates(binary)
+
+    per_row_boxes = [entry['coordinates'] for entry in rows]
+    column_number = len(rows[0]['coordinates'])
+    row_number = len(rows)
+
+    boxes = np.asarray(per_row_boxes).reshape(column_number*row_number, 4)
+    lower = np.min(boxes, axis=0)
+    upper = np.max(boxes, axis=0)
+    location = {'xmin': int(lower[0] + left), 'ymin': int(lower[1] + top),
+                'xmax': int(upper[2] + left), 'ymax': int(upper[3] + top)}
+
+    mean_height = np.mean(boxes[:, 3] - boxes[:, 1])
+    mean_width = np.mean(boxes[:, 2] - boxes[:, 0])
+
+    tree = ET.parse(xml_path)  # xml tree
+    tree = utils.create_xml('exam_number', tree,
+                            location['xmin'], location['ymin'],
+                            location['xmax'], location['ymax'])
+    tree.write(xml_path)
+
+    return {'location': location,
+            'single_height': int(mean_height),
+            'single_width': int(mean_width),
+            "choice_option": "0,1,2,3,4,5,6,7,8,9",
+            'row': row_number, 'column': column_number,
+            'direction': direction}

+ 234 - 0
segment/sheet_resolve/analysis/exam_number/exam_number_row_column.py

@@ -0,0 +1,234 @@
+import numpy as np
+import tensorflow as tf
+
+from segment.sheet_resolve.lib.ssd_model.utils import label_map_util, ops as utils_ops
+from segment.sheet_resolve.tools import tf_settings
+from segment.sheet_resolve.tools.tf_sess import SsdSess
+
+from PIL import Image
+import math
+
+# The SSD session is constructed once, when this module is imported.
+tf_sess_dict = {
+    'exam_number_ssd': SsdSess('exam_number_ssd'),
+}
+
+# Module-level session and graph used by run_inference_for_single_image().
+exam_number_sess = tf_sess_dict['exam_number_ssd']
+sess = exam_number_sess.sess
+detection_graph = exam_number_sess.graph
+
+
+def load_image_into_numpy_array(image):
+    """Convert an image object to a ``(height, width, 3)`` uint8 array.
+
+    :param image: object offering ``convert('RGB')``, ``size`` as
+        (width, height) and ``getdata()`` (a PIL image in this module).
+    :return: numpy array of RGB pixels.
+    """
+    rgb = image.convert('RGB')
+    width, height = rgb.size
+    pixels = np.array(rgb.getdata())
+    return pixels.reshape((height, width, 3)).astype(np.uint8)
+
+
+def run_inference_for_single_image(image):
+    """Run the module-level detection graph on one image.
+
+    Only the output tensors that exist in the graph are fetched. When the
+    graph exposes ``detection_masks`` they are reframed from box coordinates
+    to image coordinates and thresholded at 0.5 before the session runs.
+
+    :param image: image array; a batch dimension is added before feeding.
+    :return: dict with ``num_detections`` (int), ``detection_classes``
+        (uint8), ``detection_boxes``, ``detection_scores`` and, if present,
+        ``detection_masks`` - each with the batch dimension removed.
+    """
+    graph_output_names = {tensor.name
+                          for operation in detection_graph.get_operations()
+                          for tensor in operation.outputs}
+    wanted_keys = ('num_detections', 'detection_boxes', 'detection_scores',
+                   'detection_classes', 'detection_masks')
+    tensor_dict = {}
+    for key in wanted_keys:
+        tensor_name = key + ':0'
+        if tensor_name in graph_output_names:
+            tensor_dict[key] = detection_graph.get_tensor_by_name(tensor_name)
+
+    if 'detection_masks' in tensor_dict:
+        # Single-image processing: drop the batch dimension first.
+        boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
+        masks = tf.squeeze(tensor_dict['detection_masks'], [0])
+        # Keep only the real detections, then map the masks onto the image.
+        count = tf.cast(tensor_dict['num_detections'][0], tf.int32)
+        boxes = tf.slice(boxes, [0, 0], [count, -1])
+        masks = tf.slice(masks, [0, 0, 0], [count, -1, -1])
+        reframed = utils_ops.reframe_box_masks_to_image_masks(
+            masks, boxes, image.shape[0], image.shape[1])
+        reframed = tf.cast(tf.greater(reframed, 0.5), tf.uint8)
+        # Restore the batch dimension to follow the convention.
+        tensor_dict['detection_masks'] = tf.expand_dims(reframed, 0)
+
+    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
+    batch = np.expand_dims(image, 0)
+    output_dict = sess.run(tensor_dict, feed_dict={image_tensor: batch})
+
+    # Outputs are float32 arrays; strip the batch dimension and convert types.
+    output_dict['num_detections'] = int(output_dict['num_detections'][0])
+    output_dict['detection_classes'] = output_dict['detection_classes'][0].astype(np.uint8)
+    output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
+    output_dict['detection_scores'] = output_dict['detection_scores'][0]
+    if 'detection_masks' in output_dict:
+        output_dict['detection_masks'] = output_dict['detection_masks'][0]
+    return output_dict
+
+
+def image_detect(image_np, category, score_threshold):
+    """Detect objects in an image and return them in pixel coordinates.
+
+    :param image_np: image object accepted by ``load_image_into_numpy_array``.
+    :param category: mapping from label index to a dict with a ``'name'`` key.
+    :param score_threshold: detections must score strictly above this value.
+    :return: list of ``(x0, y0, x1, y1, label_index, score, label_name)``.
+    """
+    pixels = load_image_into_numpy_array(image_np)
+    img_height, img_width = pixels.shape[0], pixels.shape[1]
+
+    output_dict = run_inference_for_single_image(pixels)
+    scores = output_dict['detection_scores']
+    keep = np.where(scores > score_threshold)
+    kept_boxes = output_dict['detection_boxes'][keep]
+    kept_scores = np.expand_dims(scores[keep], axis=1)
+    kept_labels = np.expand_dims(output_dict['detection_classes'][keep], axis=1)
+    # Columns: ymin, xmin, ymax, xmax (normalised), score, label.
+    table = np.concatenate([kept_boxes, kept_scores, kept_labels], axis=1)
+
+    detections = []
+    for record in table:
+        top_px = int(record[0] * img_height)
+        left_px = int(record[1] * img_width)
+        bottom_px = int(record[2] * img_height)
+        right_px = int(record[3] * img_width)
+        label_index = int(record[5])
+        detections.append((left_px, top_px, right_px, bottom_px,
+                           label_index, record[4], category[label_index]['name']))
+    return detections
+
+
+def get_exam_number_row_and_col(left, top, image):
+    """Estimate the exam-number grid (rows x columns) from SSD detections.
+
+    The region is fitted into a 512 x 512 canvas, run through the detector,
+    and every detection that is not labelled ``'border'`` is mapped back to
+    the original scale. Rows and columns are counted from the gaps between
+    the sorted box origins, then adjusted for outlier-sized boxes and for
+    gaps wide enough to hide a missed row or column.
+
+    Fixes over the previous version: the median row gap is computed from the
+    row gaps (it was computed from the box heights by mistake); detections
+    consisting only of borders return ``{}`` instead of crashing on empty
+    lists; the row/column counters are clamped to at least 1, which also
+    prevents a division by zero.
+
+    :param left: x offset added to the returned coordinates.
+    :param top: y offset added to the returned coordinates.
+    :param image: image array of the exam-number area.
+    :return: dict with ``bounding_box``, ``single_height``, ``single_width``,
+        ``rows``, ``cols``, ``option``, ``direction``, ``class_name`` and
+        ``all_small_coordinate``; ``{}`` when nothing usable is detected.
+    """
+    im_resize = 512
+    # Resize the exam-number region so that its longer side is 512 and paste
+    # it onto a 512 x 512 canvas.
+    image_src = Image.fromarray(image)
+    if image_src.mode == 'RGB':
+        image_src = image_src.convert("L")
+    w, h = image_src.size
+    if h > w:
+        image_src = image_src.resize((int(im_resize / h * w), im_resize))
+    else:
+        image_src = image_src.resize((im_resize, int(im_resize / w * h)))
+    w_, h_ = image_src.size
+    image_512 = Image.new(image_src.mode, (im_resize, im_resize), (255))
+    image_512.paste(image_src, [0, 0, w_, h_])
+
+    n_z = "0123456789"
+    category_index = label_map_util.create_category_index_from_labelmap(tf_settings.exam_number_ssd_label,
+                                                                        use_display_name=True)
+    detections = image_detect(image_512, category_index, 0.5)
+    if not len(detections):
+        return {}
+
+    box_xmin = []
+    box_ymin = []
+    box_xmax = []
+    box_ymax = []
+    x_distance_all = []
+    y_distance_all = []
+    x_width_all = []
+    y_height_all = []
+    all_small_coordinate = []
+    ssd_column = 1
+    ssd_row = 1
+
+    for box in detections:
+        if box[-1] == 'border':
+            # The outer border is not one of the digit cells.
+            continue
+        box0 = round(box[0] * (w / w_))  # Map to the original image
+        box1 = round(box[1] * (h / h_))
+        box2 = round(box[2] * (w / w_))
+        box3 = round(box[3] * (h / h_))
+
+        box_xmin.append(box0)
+        box_ymin.append(box1)
+        box_xmax.append(box2)
+        box_ymax.append(box3)
+
+        all_small_coordinate.append({'xmin': box0 + left,
+                                     'ymin': box1 + top,
+                                     'xmax': box2 + left,
+                                     'ymax': box3 + top})
+        x_width_all.append(box2 - box0)
+        y_height_all.append(box3 - box1)
+
+    if not box_xmin:
+        # Only borders were detected: there is no grid to describe.
+        return {}
+
+    sorted_xmin = sorted(box_xmin)
+    sorted_ymin = sorted(box_ymin)
+    sorted_xmax = sorted(box_xmax)
+    sorted_ymax = sorted(box_ymax)
+    x_width_all_sorted = sorted(x_width_all, reverse=True)
+    y_height_all_sorted = sorted(y_height_all, reverse=True)
+    len_y = len(y_height_all)
+    x_width_median = np.median(x_width_all_sorted)
+    y_height_median = np.median(y_height_all_sorted)
+
+    for i in range(len(sorted_xmin) - 1):
+        x_distance = sorted_xmin[i + 1] - sorted_xmin[i]
+        y_distance = sorted_ymin[i + 1] - sorted_ymin[i]
+        # A jump of roughly one box size between sorted origins starts a new
+        # column / row.
+        if x_distance > (x_width_median - 5):
+            ssd_column = ssd_column + 1
+            x_distance_all.append(x_distance)
+        if y_distance > (y_height_median - 5):
+            ssd_row = ssd_row + 1
+            y_distance_all.append(y_distance)
+
+        # del the  borders where small items are too large
+        if x_width_all_sorted[i] - x_width_median > x_width_median:
+            ssd_column = ssd_column - 1
+        elif x_width_median - x_width_all_sorted[i] > x_width_median:
+            ssd_column = ssd_column - 1
+        if y_height_all_sorted[i] - y_height_median > y_height_median:
+            ssd_row = ssd_row - 1
+        elif y_height_median - y_height_all_sorted[i] > y_height_median:
+            ssd_row = ssd_row - 1
+
+    # Add rows and columns that might be missed: a gap of about twice the
+    # median gap hides one undetected column / row.
+    if x_distance_all:
+        x_distance_median = np.median(x_distance_all)
+        for x_gap in x_distance_all:
+            if x_gap > 2 * x_distance_median - 4:
+                ssd_column = ssd_column + 1
+    if y_distance_all:
+        y_distance_median = np.median(y_distance_all)
+        for y_gap in y_distance_all:
+            if y_gap > 2 * y_distance_median - 4:
+                ssd_row = ssd_row + 1
+
+    # The outlier corrections above can push the counters to zero or below.
+    ssd_column = max(ssd_column, 1)
+    ssd_row = max(ssd_row, 1)
+
+    if ssd_row < 10:
+        test = math.ceil(len_y / ssd_column)
+        if test > ssd_row:
+            ssd_row = test
+    if ssd_row > 10:
+        ssd_row = 10
+
+    average_height = int(np.mean(y_height_all))
+    average_width = int(np.mean(x_width_all))
+
+    location_ssd = {'xmin': sorted_xmin[0] + left,
+                    'ymin': sorted_ymin[0] + top,
+                    'xmax': sorted_xmax[-1] + left,
+                    'ymax': sorted_ymax[-1] + top}
+
+    exam_number_ssd = {'bounding_box': location_ssd,
+                       "single_height": average_height,
+                       "single_width": average_width,
+                       "rows": ssd_row,
+                       "cols": ssd_column,
+                       # First `rows` digits, comma separated: "0,1,2,..."
+                       "option": n_z[:ssd_row].replace('', ',')[1:-1],
+                       "direction": 180,
+                       'class_name': 'exam_number_col_row',
+                       'all_small_coordinate': all_small_coordinate
+                       }
+
+    return exam_number_ssd

+ 3 - 0
segment/sheet_resolve/analysis/info_section/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py.py
+# @Time    : 2018/11/21 0021 下午 16:01

+ 43 - 0
segment/sheet_resolve/analysis/info_section/info_section.py

@@ -0,0 +1,43 @@
+# @Author  : lightXu
+# @File    : info_section.py
+# @Time    : 2019/4/2 0002 下午 15:38
+import cv2
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate_in_google_format
+from segment.sheet_resolve.tools.utils import crop_region, read_xml_to_json, read_single_img
+
+
+# Region class names whose text get_text() extracts with OCR.
+info_section_class = ['alarm_info',
+                      'info_title',
+                      'attention',
+                      'page',
+                      'full_filling',
+                      'print_info',
+                      'ban_area',
+                      'type_score',
+                      'time',
+                      'total_score',
+                      'executor',
+                      'verify']
+
+
+def get_text(sheet, raw_image):
+    """Attach the OCR text to every information region of a sheet.
+
+    Regions whose ``'class_name'`` is listed in ``info_section_class`` get a
+    ``'text'`` entry holding the joined OCR characters, or ``''`` when OCR
+    fails for that region. The sheet is modified in place.
+
+    :param sheet: sheet description with a ``'regions'`` list.
+    :param raw_image: image the regions are cropped from.
+    :return: the same ``sheet`` object.
+    """
+    for region in sheet['regions']:
+        if region['class_name'] not in info_section_class:
+            continue
+        cropped = crop_region(raw_image, region['bounding_box'])
+        try:
+            ocr_result = get_ocr_text_and_coordinate_in_google_format(cropped)
+            region['text'] = ''.join(ocr_result['chars'])
+        except Exception:
+            # Best effort: an OCR failure leaves the region with empty text.
+            region['text'] = ''
+    return sheet
+
+
+# if __name__ == '__main__':
+#     xml_path = r'C:\Users\Administrator\Desktop\test\third_raw\010515.xml'
+#     jpg_path = r'C:\Users\Administrator\Desktop\test\third_raw\010515.jpg'
+#     sheet_dict = read_xml_to_json(xml_path)
+#     image = read_single_img(jpg_path)
+#     get_text(sheet_dict, image)

+ 466 - 0
segment/sheet_resolve/analysis/resolve.py

@@ -0,0 +1,466 @@
+# @Author  : lightXu
+# @File    : resolve.py
+# @Time    : 2018/12/3 0003 上午 10:16
+
+import time
+import traceback
+import xml.etree.cElementTree as ET
+from django.conf import settings
+import segment.logging_config as logging
+import segment.sheet_resolve.analysis.choice.analysis_choice as resolve_choice
+import segment.sheet_resolve.analysis.choice.choice_box as choice_box
+import segment.sheet_resolve.analysis.choice.choice_line_box as choice_line_box
+import segment.sheet_resolve.analysis.cloze.analysis_cloze as resolve_cloze
+import segment.sheet_resolve.analysis.cloze.cloze_line_box as resolve_cloze_line_box
+import segment.sheet_resolve.analysis.exam_number.exam_number_box as resolve_exam_number_box
+import segment.sheet_resolve.analysis.exam_number.exam_number_row_column as exam_number_row_column
+import segment.sheet_resolve.analysis.sheet.analysis_sheet as resolve_sheet
+import segment.sheet_resolve.analysis.solve.mark_box as resolve_mark_box
+import segment.sheet_resolve.analysis.solve.mark_line_box as resolve_mark_line_box
+from segment.sheet_resolve.tools import utils
+from segment.sheet_resolve.tools.tf_sess import TfSess
+from segment.sheet_resolve.tools.tf_settings import xml_template_path, model_dict
+from segment.sheet_resolve.tools.utils import read_single_img, read_xml_to_json, create_xml
+from segment.sheet_resolve.analysis.sheet.sheet_adjust import adjust_item_edge_by_gray_image
+from segment.sheet_resolve.analysis.sheet.sheet_infer import infer_bar_code, box_infer_and_complete
+from segment.sheet_resolve.analysis.sheet.sheet_infer import infer_exam_number, adjust_exam_number, exam_number_infer_by_s
+from segment.sheet_resolve.analysis.sheet.choice_infer import infer_choice_m
+
+logger = logging.getLogger(settings.LOGGING_TYPE)
+
+
+sheet_infer_dict = dict(bar_code=True,
+                        choice_m=True,
+                        exam_number=True,
+                        common_sheet=True)
+infer_choice_m_flag = False
+
+
+def sheet(series_number, image_path, image, conf_thresh, mns_thresh, subject, sheet_sess, ocr=''):
+    global infer_choice_m_flag
+    model_type = subject
+    classes = list(model_dict[model_type]['classes'])
+    coordinate_bias_dict = model_dict[model_type]['class_coordinate_bias']
+
+    if '_blank' in model_type:
+        model_type = model_type.replace("_blank", "")
+
+    sheets_dict = resolve_sheet.get_single_image_sheet_regions(model_type, image_path, image, classes,
+                                                               sheet_sess.sess, sheet_sess.net,
+                                                               conf_thresh, mns_thresh, coordinate_bias_dict)
+
+    h, w = image.shape[0], image.shape[1]
+    regions = sheets_dict['regions']
+    fetched_class = [ele['class_name'] for ele in regions]
+
+    try:
+        regions = adjust_item_edge_by_gray_image(image, regions)
+    except Exception as e:
+        traceback.print_exc()
+        logger.info('试卷:{} 自适应边框失败: {}'.format(image_path, e))
+
+    if sheet_infer_dict['bar_code']:
+        try:
+            if ('bar_code' not in fetched_class) and ocr:
+                attention_region = [ele for ele in regions if ele['class_name'] == 'attention']
+                bar_code_list = infer_bar_code(image, ocr, attention_region)
+                regions.extend(bar_code_list)
+        except Exception as e:
+            traceback.print_exc()
+            logger.info('试卷:{} 条形码推断失败: {}'.format(image_path, e))
+
+    if sheet_infer_dict['exam_number']:
+        try:
+            cond1 = 'exam_number' in fetched_class
+            tmp = ['info_title', 'qr_code', 'bar_code', 'choice', 'choice_m', 'exam_number_w']
+            cond2 = True in [True for ele in tmp if ele in fetched_class]  # 第一面特征
+            cond3 = 'exam_number_w' in fetched_class
+            cond4 = 'exam_number_s' in fetched_class
+
+            if cond1 and cond3 and not cond4:
+                regions = adjust_exam_number(regions)
+            if not cond1 and cond4:
+                exam_number_list = exam_number_infer_by_s(image, regions)
+                regions.extend(exam_number_list)
+
+            if not cond1 and not cond4 and cond2 and ocr:
+                exam_number_list = infer_exam_number(image, ocr, regions)
+                regions.extend(exam_number_list)
+
+        except Exception as e:
+            traceback.print_exc()
+            logger.info('试卷:{} 考号推断失败: {}'.format(image_path, e))
+
+    if sheet_infer_dict['choice_m']:
+
+        try:
+            choice_m_list = infer_choice_m(image, regions, ocr)
+            remain_choice_m = []
+            if len(choice_m_list) > 0:
+                choice_m_old_list = [ele for ele in regions if 'choice_m' == ele['class_name']]
+                for infer_box in choice_m_list.copy():
+                    infer_loc = infer_box['bounding_box']
+
+                    for tf_box in choice_m_old_list:
+                        tf_loc = tf_box['bounding_box']
+                        iou = utils.cal_iou(infer_loc, tf_loc)
+                        if iou[0] > 0.85 or iou[1] > 0.85:
+                            if infer_box not in remain_choice_m:
+                                remain_choice_m.append(infer_box)
+                                choice_m_list.remove(infer_box)
+                            regions.remove(tf_box)
+                            break
+                        elif iou[0] > 0:
+                            choice_m_list.remove(infer_box)
+                            break
+
+                remain_choice_m.extend(choice_m_list)
+
+                # regions = [ele for ele in regions if 'choice_m' != ele['class_name']]
+                regions.extend(remain_choice_m)
+                infer_choice_m_flag = True
+
+        except Exception as e:
+            traceback.print_exc()
+            logger.info('试卷:{} 选择题推断失败: {}'.format(image_path, e))
+
+    if sheet_infer_dict['common_sheet']:
+
+        try:
+            regions = box_infer_and_complete(image, regions, ocr)
+        except Exception as e:
+            traceback.print_exc()
+            logger.info('试卷:{} 识别框补全推断失败: {}'.format(image_path, e))
+
+    try:
+        adjust_regions = adjust_item_edge_by_gray_image(image, regions)
+    except Exception as e:
+        adjust_regions = regions
+
+        traceback.print_exc()
+        logger.info('试卷:{} 自适应边框失败: {}'.format(image_path, e))
+
+    sheets_dict.update({'regions': adjust_regions})
+
+    #  generate xml
+    tree = ET.parse(xml_template_path)
+    xml_save_path = sheets_dict['img_name'].replace('.jpg', '.xml')
+    root = tree.getroot()
+    series = ET.SubElement(root, 'paper_id')
+    series.text = series_number
+
+    img_shape = image.shape
+    project = ET.SubElement(root, 'size', {})
+    width = ET.SubElement(project, 'width')
+    width.text = str(img_shape[1])
+    height = ET.SubElement(project, 'height')
+    height.text = str(img_shape[0])
+    depth = ET.SubElement(project, 'depth')
+    if len(img_shape) >= 3:
+        depth.text = '3'
+    else:
+        depth.text = '1'
+
+    for ele in regions:
+        name = ele['class_name']
+        xmin = ele['bounding_box']['xmin']
+        ymin = ele['bounding_box']['ymin']
+        xmax = ele['bounding_box']['xmax']
+        ymax = ele['bounding_box']['ymax']
+        tree = create_xml(name, tree, xmin, ymin, xmax, ymax)
+
+    tree.write(xml_save_path)
+    return sheets_dict, xml_save_path
+
+
+def choice(image, regions, xml_path, conf_thresh, mns_thresh, choice_sess):
+    model_type = 'choice'
+    classes = model_dict[model_type]['classes']
+    coordinate_bias_dict = model_dict[model_type]['class_coordinate_bias']
+
+    choice_list = []
+    for ele in regions:
+        if ele["class_name"] == 'choice':
+
+            choice_bbox = ele['bounding_box']
+            left = choice_bbox['xmin']
+            top = choice_bbox['ymin']
+            choice_img = utils.crop_region(image, choice_bbox)
+
+            choice_dict_tf = resolve_choice. \
+                get_single_image_sheet_regions('choice', choice_img, classes,
+                                               choice_sess.sess, choice_sess.net, conf_thresh, mns_thresh,
+                                               coordinate_bias_dict)
+
+            choice_list = choice_list + choice_line_box.choice_line(left, top, choice_img, choice_dict_tf, xml_path)
+
+    return choice_list
+
+
+def choice_row_col(image, regions, xml_path, conf_thresh, mns_thresh, choice_sess):
+    model_type = 'choice_m'
+    classes = model_dict[model_type]['classes']
+    coordinate_bias_dict = model_dict[model_type]['class_coordinate_bias']
+
+    choice_list = []
+    for ele in regions:
+        if ele["class_name"] == 'choice':
+
+            choice_box = ele['bounding_box']
+            left = choice_box['xmin']
+            top = choice_box['ymin']
+            choice_img = utils.crop_region(image, choice_box)
+
+            choice_m_dict_tf = resolve_choice. \
+                get_single_image_sheet_regions('choice_m', choice_img, classes,
+                                               choice_sess.sess, choice_sess.net, conf_thresh, mns_thresh,
+                                               coordinate_bias_dict)
+
+            choice_list = choice_list + choice_line_box.choice_line_with_number(left, top, choice_img, choice_m_dict_tf, xml_path)
+
+    return choice_list
+
+
+def choice_m_row_col(image, regions, xml_path):
+
+    choice_m_dict_tf = [ele for ele in regions if ele['class_name'] == 'choice_m']
+    # choice_m_row_col_with_number
+    choice_list = []
+    try:
+        # choice_list = choice_box.get_number_by_enlarge_choice_m(image, choice_m_dict_tf, xml_path)
+        # if infer_choice_m_flag:
+        #     choice_list = choice_line_box.choice_m_adjust(image, choice_m_dict_tf)
+        #
+        # else:
+        #     choice_list = choice_line_box.choice_m_row_col(image, choice_m_dict_tf, xml_path)  # 找选择题行列、分数
+
+        choice_list = choice_line_box.choice_m_row_col(image, choice_m_dict_tf, xml_path)  # 找选择题行列、分数
+        tree = ET.parse(xml_path)  # xml tree
+        for index_num, box in enumerate(choice_list):
+            if len(box['bounding_box']) > 0:
+                abcd = box['bounding_box']
+                number = str(box['number'])
+                name = '{}_{}*{}_{}_{}'.format('choice_m', box['rows'], box['cols'], box['direction'], number)
+                tree = utils.create_xml(name, tree,
+                                        abcd['xmin'], abcd['ymin'],
+                                        abcd['xmax'], abcd['ymax'])
+
+        tree.write(xml_path)
+    except Exception as e:
+        traceback.print_exc()
+        print(e)
+
+    return choice_list
+
+
+def exam_number(image, regions, xml_path):
+    exam_number_dict = {}
+    for ele in regions:
+        if ele["class_name"] == 'exam_number':
+            exam_number_dict = ele
+
+    exam_number_box = exam_number_dict['bounding_box']
+    left = exam_number_box['xmin']
+    top = exam_number_box['ymin']
+    exam_number_img = utils.crop_region(image, exam_number_box)
+
+    # exam_number_dict = resolve_exam_number_box.exam_number(left, top, exam_number_img, xml_path)
+    exam_number_dict = resolve_exam_number_box.exam_number_whole(left, top, exam_number_img, xml_path)
+
+    # print(exam_number_dict)
+    return exam_number_dict
+
+
+def exam_number_row_col(image, regions, xml_path):
+    exam_number_dict = {}
+    for ele in regions:
+        if ele["class_name"] == 'exam_number':
+            exam_number_dict = ele
+
+    exam_number_box = exam_number_dict['bounding_box']
+    left = exam_number_box['xmin']
+    top = exam_number_box['ymin']
+    exam_number_img = utils.crop_region(image, exam_number_box)
+
+    exam_number_row_col_dict = exam_number_row_column.get_exam_number_row_and_col(left, top, exam_number_img)
+
+    tree = ET.parse(xml_path)  # xml tree
+    if len(exam_number_row_col_dict) > 0:
+        exam_number_box = exam_number_row_col_dict['bounding_box']
+        name = '{}_{}*{}_{}'.format('exam_number',
+                                    exam_number_row_col_dict['rows'],
+                                    exam_number_row_col_dict['cols'],
+                                    exam_number_row_col_dict['direction'])
+        tree = utils.create_xml(name, tree,
+                                exam_number_box['xmin'], exam_number_box['ymin'],
+                                exam_number_box['xmax'], exam_number_box['ymax'])
+
+    else:
+        tree = utils.create_xml('exam_number', tree,
+                                exam_number_box['xmin'], exam_number_box['ymin'],
+                                exam_number_box['xmax'], exam_number_box['ymax'])
+        exam_number_row_col_dict = {}
+
+    tree.write(xml_path)
+
+    return [exam_number_row_col_dict]
+
+
+def cloze(image, regions, xml_path, conf_thresh, mns_thresh, cloze_sess):
+    classes = model_dict['cloze']['classes']
+    coordinate_bias_dict = model_dict['cloze']['class_coordinate_bias']
+
+    cloze_list = []
+    for ele in regions:
+        if ele["class_name"] == 'cloze':
+            cloze_box = ele['bounding_box']
+            left = cloze_box['xmin']
+            top = cloze_box['ymin']
+            cloze_img = utils.crop_region(image, cloze_box)
+            cloze_dict_tf = resolve_cloze.get_single_image_sheet_regions('cloze', cloze_img, classes,
+                                                                         cloze_sess.sess, cloze_sess.net, conf_thresh,
+                                                                         mns_thresh, coordinate_bias_dict)
+            cloze_list = cloze_list + resolve_cloze_line_box.cloze_line(left, top, cloze_img, cloze_dict_tf['regions'], xml_path)
+
+    return cloze_list
+
+
+def solve_with_mark(image, regions, xml_path):
+    solve_list = []
+    mark_list = []
+    for ele in regions.copy():
+        if 'solve' in ele["class_name"]:
+            exam_number_box = ele['bounding_box']
+            left = exam_number_box['xmin']
+            top = exam_number_box['ymin']
+            exam_number_img = utils.crop_region(image, exam_number_box)
+            solve_mark_dict = resolve_mark_box.solve_mark(left, top, exam_number_img, xml_path)
+            if len(solve_mark_dict) > 0:
+                ele['class_name'] = 'solve_'+str(solve_mark_dict['number'])
+                solve_list.append(ele)
+                mark_list.append(solve_mark_dict)
+
+    return solve_list, mark_list
+
+
+def solve(image, regions, xml_path):
+    solve_list = []
+    tree = ET.parse(xml_path)
+    for ele in regions.copy():
+        if 'solve' in ele["class_name"]:
+            exam_number_box = ele['bounding_box']
+            exam_number_img = utils.crop_region(image, exam_number_box)
+            number = resolve_mark_line_box.solve_line(exam_number_img)
+            solve_dict = {'number': number, 'location': exam_number_box, 'default_points': 12}
+            solve_list.append(solve_dict)
+
+            tree = utils.create_xml(str(number), tree,
+                                    exam_number_box['xmin'], exam_number_box['ymin'],
+                                    exam_number_box['xmax'], exam_number_box['ymax'])
+    tree.write(xml_path)
+    return solve_list
+
+
+def solve_with_number(regions, xml_path):
+    solve_list = []
+    for ele in regions:
+        if 'solve' in ele["class_name"] or 'composition' in ele["class_name"]:
+            solve_dict = {'number': -1, 'default_points': -1}
+            ele.update(solve_dict)
+            solve_list.append(ele)
+
+    tree = ET.parse(xml_path)  # xml tree
+    for index_num, box in enumerate(solve_list):
+        if len(box['bounding_box']) > 0:
+            abcd = box['bounding_box']
+            number = str(box['number'])
+            default_points = box["default_points"]
+            name = '{}_{}_{}'.format(box["class_name"], number, default_points)
+            tree = utils.create_xml(name, tree,
+                                    abcd['xmin'], abcd['ymin'],
+                                    abcd['xmax'], abcd['ymax'])
+
+    tree.write(xml_path)
+    return solve_list
+
+
+def cloze_with_number(regions, xml_path):
+    cloze_list = []
+    for ele in regions:
+        if 'cloze' == ele["class_name"] or "cloze_s" == ele["class_name"]:
+            cloze_dict = {'number': -1, 'default_points': -1}
+            ele.update(cloze_dict)
+            cloze_list.append(ele)
+
+    tree = ET.parse(xml_path)  # xml tree
+    for index_num, box in enumerate(cloze_list):
+        if len(box['bounding_box']) > 0:
+            abcd = box['bounding_box']
+            number = str(box['number'])
+            default_points = box["default_points"]
+            name = '{}_{}_{}'.format(box["class_name"], number, default_points)
+            tree = utils.create_xml(name, tree,
+                                    abcd['xmin'], abcd['ymin'],
+                                    abcd['xmax'], abcd['ymax'])
+
+    tree.write(xml_path)
+    return cloze_list
+
+
+def make_together(image_path):
+
+    sheet_sess = TfSess('sheet')
+    choice_sess = TfSess('choice')
+    cloze_sess = TfSess('cloze')
+
+    raw_img = read_single_img(image_path)
+    conf_thresh_0 = 0.7
+    mns_thresh_0 = 0.3
+
+    series_number = 123456789
+    subject = 'english'
+    sheets_dict_0, xml_save_path = sheet(series_number, image_path, raw_img, conf_thresh_0, mns_thresh_0, subject, sheet_sess)
+    # 手动修改faster_rcnn识别生成的框
+
+    sheets_dict_0 = read_xml_to_json(xml_save_path)
+    regions = sheets_dict_0['regions']
+    classes_name = str([ele['class_name'] for ele in regions])
+
+    if 'choice' in classes_name:
+        try:
+            sheets_dict_0['choice'] = choice(raw_img, regions, xml_save_path, conf_thresh_0, mns_thresh_0, choice_sess)
+        except Exception:
+            traceback.print_exc()
+
+    if 'exam_number' in classes_name:
+        try:
+            sheets_dict_0['exam_number'] = exam_number(raw_img, regions, xml_save_path)
+        except Exception:
+            traceback.print_exc()
+
+    if 'cloze' in classes_name:
+        try:
+            sheets_dict_0['cloze'] = cloze(raw_img, regions, xml_save_path, conf_thresh_0, mns_thresh_0, cloze_sess)
+        except Exception:
+            traceback.print_exc()
+
+    if 'solve' in classes_name:
+        try:
+            solve_list, mark_list = solve(raw_img, regions, xml_save_path,)
+            sheets_dict_0['solve'] = solve_list
+            sheets_dict_0['mark'] = mark_list
+        except Exception:
+            traceback.print_exc()
+
+    # print(sheets_dict_0)
+    return sheets_dict_0
+
+
+# if __name__ == '__main__':
+#     start_time = time.time()
+#
+#     image_path_0 = os.path.join(r'C:\Users\Administrator\Desktop\sheet\correct\back_sizes\template',
+#                                 '20180719004308818_0020.jpg')
+#     make_together(image_path_0)
+#     end_time = time.time()
+#     print('time cost: ', (end_time - start_time))

+ 3 - 0
segment/sheet_resolve/analysis/sheet/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py
+# @Time    : 2018/11/21 0021 下午 16:01

+ 270 - 0
segment/sheet_resolve/analysis/sheet/analysis_sheet.py

@@ -0,0 +1,270 @@
+# @Author  : lightXu
+# @File    : analysis_sheet.py
+import time
+import os
+import traceback
+
+import numpy as np
+import cv2
+
+from segment.sheet_resolve.lib.model.test import im_detect
+from segment.sheet_resolve.lib.model.nms_wrapper import nms
+from segment.sheet_resolve.lib.utils.timer import Timer
+from segment.sheet_resolve.tools import utils
+from segment.sheet_resolve.analysis.solve.optional_solve import find_contours, resolve_optional_choice
+
+
+def analysis_single_image_with_regions(analysis_type, classes,
+                                       sess, net,
+                                       im_raw, conf_thresh, mns_thresh,
+                                       coordinate_bias_dict):
+    """Detect object classes in an image using pre-computed object proposals."""
+
+    size = im_raw.shape
+
+    # Detect all object classes and regress object bounds
+    timer = Timer()
+    timer.tic()
+    if analysis_type in ['unknown_subject', 'math', 'math_zxhx', 'english', 'chinese',
+                         'physics', 'chemistry', 'biology', 'politics', 'history',
+                         'geography', 'science_comprehensive', 'arts_comprehensive'
+                         ]:
+        analysis_type = 'sheet'
+    im, ratio = utils.img_resize(analysis_type, im_raw)
+    scores, boxes = im_detect(analysis_type, sess, net, im)
+    timer.toc()
+    print('Detection took {:.3f}s for {:d} object proposals'.format(timer.total_time, boxes.shape[0]))
+
+    content_list = []
+    analysis_cls_list = []
+    qr_code_info = 'Nan'
+
+    for cls_ind, cls in enumerate(classes[1:]):  # classes
+        cls_ind += 1  # because we skipped background
+        cls_boxes = boxes[:, 4 * cls_ind:4 * (cls_ind + 1)]
+        cls_scores = scores[:, cls_ind]
+        dets = np.hstack((cls_boxes,
+                          cls_scores[:, np.newaxis])).astype(np.float32)
+        keep = nms(dets, mns_thresh)
+        dets = dets[keep, :]
+        # vis_detections(im, cls, dets, ax, thresh=conf_thresh)
+        inds = np.where(dets[:, -1] >= conf_thresh)[0]
+        if len(inds) > 0:
+            if cls in list(coordinate_bias_dict.keys()):
+                xmin_bias = coordinate_bias_dict[cls]['xmin_bias']
+                ymin_bias = coordinate_bias_dict[cls]['ymin_bias']
+                xmax_bias = coordinate_bias_dict[cls]['xmax_bias']
+                ymax_bias = coordinate_bias_dict[cls]['ymax_bias']
+            else:
+                xmin_bias = 0
+                ymin_bias = 0
+                xmax_bias = 0
+                ymax_bias = 0
+            for i in inds:
+                bbox = dets[i, :4]
+                score = '{:.4f}'.format(dets[i, -1])
+
+                xmin = int(int(bbox[0]) * ratio[0]) + xmin_bias
+                ymin = int(int(bbox[1]) * ratio[1]) + ymin_bias
+                xmax = int(int(bbox[2]) * ratio[0]) + xmax_bias
+                ymax = int(int(bbox[3]) * ratio[1]) + ymax_bias
+
+                xmin = (xmin if (xmin > 0) else 1)
+                ymin = (ymin if (ymin > 0) else 1)
+                xmax = (xmax if (xmax < size[1]) else size[1] - 1)
+                ymax = (ymax if (ymax < size[0]) else size[0] - 1)
+
+                if cls in ['solve0', ]:
+                    cls = 'solve'
+
+                bbox_dict = {"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax}
+                # class_dict = {"class_name": cls, "bounding_box": bbox_dict, "score": score}
+                class_dict = {"class_name": cls, "bounding_box": bbox_dict}
+
+                # if cls == 'qr_code':
+                #     qr_img = utils.crop_region(im_raw, bbox_dict)
+                #     qr_path = r'./qr_code.jpg'
+                #     cv2.imwrite(qr_path, qr_img)
+                #     qr_code_info = utils.check_qr_code_with_region_img(qr_path)
+                #     os.remove(qr_path)
+
+                content_list.append(class_dict)
+
+    return content_list, analysis_cls_list, qr_code_info
+
+
+def get_single_image_sheet_regions(analysis_type, img_path, img, classes,
+                                   sess, net, conf_thresh, mns_thresh,
+                                   coordinate_bias_dict):
+    start_time = time.time()
+
+    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
+    print('analysis for JPG {}'.format(img_path))
+
+    content, cls, qr_code_info = \
+        analysis_single_image_with_regions(analysis_type, classes, sess, net,
+                                           img, conf_thresh, mns_thresh,
+                                           coordinate_bias_dict)
+
+    img_dict = {"img_name": img_path,
+                # 'qr_code': qr_code_info,
+                'subject': analysis_type,
+                "regions": content,
+                }
+
+    end_time = time.time()
+    print(end_time - start_time)
+
+    return img_dict
+
+
+def question_number_format(init_number, crt_numbers, sheet_dict):
+    for region in sheet_dict['regions']:
+        numbers = region.get("number")
+        if numbers and isinstance(numbers, int):
+            if numbers <= 0 or numbers in crt_numbers:
+                numbers = init_number
+                crt_numbers.append(numbers)
+                init_number += 1
+            region.update({"number": numbers})
+            crt_numbers.append(numbers)
+        if numbers and isinstance(numbers, list):
+            for i, num in enumerate(numbers):
+                if num <= 0 or num in crt_numbers:
+                    numbers[i] = init_number
+                    crt_numbers.append(init_number)
+                    init_number += 1
+
+            region.update({"number": numbers})
+            crt_numbers.extend(numbers)
+
+    return sheet_dict, init_number, crt_numbers
+
+
+def box_region_format(sheet_dict, image, subject, shrink=True):
+    include_class = ['anchor_point',
+                     'bar_code',
+                     'choice_m',
+                     'cloze',
+                     'cloze_s',
+                     'exam_number_col_row',
+                     'optional_choice',
+                     'optional_solve',
+                     # 'qr_code',
+                     'solve',
+                     'optional_solve',
+                     'composition',
+                     # 'correction'
+                     ]
+
+    sheet_regions = sheet_dict['regions']
+    optional_solve_tmp = []
+    default_points_dict = {'choice_m': 5, "cloze": 5, 'solve': 12, 'cloze_s': 5, "composition": 60}
+    if subject == "english":
+        default_points_dict = {'choice_m': 2, "cloze": 2, 'solve': 2, 'cloze_s': 2, "composition": 25}
+
+    for i in range(len(sheet_regions) - 1, -1, -1):
+        if subject == "math":
+            if sheet_regions[i]['class_name'] == 'cloze':
+                sheet_regions[i]['class_name'] = 'cloze_big'  # math exclude cloze big
+            if sheet_regions[i]['class_name'] == 'cloze_s':
+                sheet_regions[i]['class_name'] = 'cloze'  # math exclude cloze big
+        if subject == "english":
+            if sheet_regions[i]['class_name'] == 'solve':
+                sheet_regions[i]['class_name'] = 'cloze'
+            if sheet_regions[i]['class_name'] == 'correction':
+                sheet_regions[i]['class_name'] = 'solve'
+
+    for i in range(len(sheet_regions) - 1, -1, -1):
+        if sheet_regions[i]['class_name'] in ['solve0']:
+            sheet_regions[i]['class_name'] = 'solve'
+        if sheet_regions[i]['class_name'] in ['composition0']:
+            sheet_regions[i]['class_name'] = 'composition'
+
+        if sheet_regions[i]['class_name'] == 'select_s':
+            sheet_regions[i]['class_name'] = 'optional_choice'
+            optional_solve_tmp.append(sheet_regions[i])
+            sheet_regions.pop(i)
+
+        if shrink:
+            if sheet_regions[i]['class_name'] not in include_class:
+                sheet_regions.pop(i)
+
+    for ele in sheet_regions:
+        if ele['class_name'] == 'solve':
+            solve_box = (ele['bounding_box']['xmin'], ele['bounding_box']['ymin'],
+                         ele['bounding_box']['xmax'], ele['bounding_box']['ymax'])
+            for optional_solve in optional_solve_tmp:
+                optional_solve_box = (optional_solve['bounding_box']['xmin'], optional_solve['bounding_box']['ymin'],
+                                      optional_solve['bounding_box']['xmax'], optional_solve['bounding_box']['ymax'])
+                if utils.decide_coordinate_contains(optional_solve_box, solve_box):
+                    ele['class_name'] = 'optional_solve'
+                    break
+                else:
+                    continue
+
+        if ele['class_name'] == "composition":
+            if isinstance(ele['default_points'], list):
+                for i, dp in enumerate(ele['default_points']):
+                    if dp != default_points_dict[ele['class_name']]:
+                        ele['default_points'][i] = default_points_dict[ele['class_name']]
+
+            if isinstance(ele['default_points'], int):
+                if ele['default_points'] != default_points_dict[ele['class_name']]:
+                    ele['default_points'] = default_points_dict[ele['class_name']]
+
+        if ele['class_name'] in ["choice_m", "cloze", "cloze_s", "solve"]:
+            if isinstance(ele['default_points'], list):
+                for i, dp in enumerate(ele['default_points']):
+                    if dp == -1:
+                        ele['default_points'][i] = default_points_dict[ele['class_name']]
+
+            if isinstance(ele['default_points'], int):
+                if ele['default_points'] == -1:
+                    ele['default_points'] = default_points_dict[ele['class_name']]
+
+    for ele in optional_solve_tmp:  # 选做题
+        bbox = ele['bounding_box']
+        box_region = utils.crop_region(image, bbox)
+        left = bbox['xmin']
+        top = bbox['ymin']
+        right = bbox['xmax']
+        bottom = bbox['ymax']
+
+        if (right - left) >= (bottom-top):
+            direction = 180
+        else:
+            direction = 90
+
+        # res = find_contours(left, top, box_region)
+        try:
+            res = resolve_optional_choice(left, top, direction, box_region)
+        except Exception as e:
+            res = {'rows': 1, 'cols': 2,
+                   'option': 'A, B',
+                   'single_width': (right - left) // 3,
+                   'single_height': bottom - top,
+                   'bounding_box': {'xmin': left,
+                                    'ymin': top,
+                                    'xmax': right,
+                                    'ymax': bottom}}
+        res['class_name'] = 'optional_choice'
+
+        sheet_regions.append(res)
+
+    # iou
+    sheet_tmp = sheet_regions.copy()
+    remove_index = []
+    for i, region in enumerate(sheet_tmp):
+        if i not in remove_index:
+            box = region['bounding_box']
+            for j, region_in in enumerate(sheet_tmp):
+                box_in = region_in['bounding_box']
+                iou = utils.cal_iou(box, box_in)
+                if iou[0] > 0.75 and i != j:
+                    sheet_regions.remove(region)
+                    remove_index.append(j)
+                    break
+
+    sheet_dict.update({'regions': sheet_regions})
+    return sheet_dict

+ 671 - 0
segment/sheet_resolve/analysis/sheet/choice_infer.py

@@ -0,0 +1,671 @@
+# @Author  : lightXu
+# @File    : choice_infer.py
+import os
+import traceback
+import time
+import random
+from django.conf import settings
+from segment.sheet_resolve.tools import utils, brain_api
+from itertools import chain
+import re
+import numpy as np
+import cv2
+import xml.etree.cElementTree as ET
+from segment.sheet_resolve.tools.utils import crop_region_direct, create_xml, infer_number, combine_char_in_raw_format
+from sklearn.cluster import DBSCAN
+from segment.sheet_resolve.analysis.sheet.ocr_sheet import ocr2sheet
+
+
+def get_split_index(array, dif=0):
+    array = np.array(array)
+    interval_list = np.abs(array[1:] - array[:-1])
+    split_index = [0]
+    for i, interval in enumerate(interval_list):
+        if dif:
+            split_dif = dif
+        else:
+            split_dif = np.mean(interval_list)
+        if interval > split_dif:
+            split_index.append(i + 1)
+
+    split_index.append(len(array))
+    split_index = sorted(list(set(split_index)))
+    return split_index
+
+
+def adjust_choice_m(image, xe, ye):
+    dilate = 1
+    blur = 5
+
+    # Convert to gray
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+    if blur != 0:
+        image = cv2.GaussianBlur(image, (blur, blur), 0)
+
+    # Apply threshold to get image with only b&w (binarization)
+    image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+
+    kernel = np.ones((ye, xe), np.uint8)  # y轴膨胀, x轴膨胀
+
+    dst = cv2.dilate(image, kernel, iterations=1)
+
+    (major, minor, _) = cv2.__version__.split(".")
+    contours = cv2.findContours(dst, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    cnts = contours[0] if int(major) > 3 else contours[1]
+
+    # _, cnts, hierarchy = cv2.findContours(dst, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    right_limit = 0
+    bottom_limit = 0
+    for cnt_id, cnt in enumerate(reversed(cnts)):
+        x, y, w, h = cv2.boundingRect(cnt)
+        if x + w > right_limit:
+            right_limit = x + w
+
+        if y + h > bottom_limit:
+            bottom_limit = y + h
+
+    return right_limit, bottom_limit
+
+
+def find_digital(ocr_raw_list):
+    pattern = r'\d+'
+    x_list = []
+    y_list = []
+    digital_list = list()
+    chars_list = list()
+    height_list, width_list = list(), list()
+    ocr_dict_list = combine_char_in_raw_format(ocr_raw_list)
+    for i, ele in enumerate(ocr_dict_list):
+        words = ele['words']
+        words = words.replace(' ', '').upper()  # 去除空格
+
+        digital_words_m = re.finditer(pattern, words)
+        digital_index_list = [(m.group(), m.span()) for m in digital_words_m if m]
+        chars_index = [ele for ele in range(0, len(ele['chars']))]
+        digital_index_detail_list = []
+        for letter_info in digital_index_list:
+            number = letter_info[0]
+            index_start = letter_info[1][0]
+            index_end = letter_info[1][1] - 1
+            char_start = ele['chars'][index_start]
+            char_end = ele['chars'][index_end]
+
+            if index_start == index_end:
+                digital_index_detail_list += [index_start]
+            else:
+                digital_index_detail_list += chars_index[index_start:index_end + 1]
+
+            letter_loc_xmin = int(char_start['location']['left'])
+            letter_loc_ymin = min(int(char_start['location']['top']), int(char_end['location']['top']))
+            letter_loc_xmax = int(char_end['location']['left']) + int(char_end['location']['width'])
+            letter_loc_ymax = max(int(char_start['location']['top']) + int(char_start['location']['height']),
+                                  int(char_end['location']['top']) + int(char_end['location']['height']))
+
+            mid_x = letter_loc_xmin + (letter_loc_xmax - letter_loc_xmin) // 2
+            mid_y = letter_loc_ymin + (letter_loc_ymax - letter_loc_ymin) // 2
+
+            # print(number, (mid_x, mid_y))
+            x_list.append(mid_x)
+            y_list.append(mid_y)
+
+            height_list.append(letter_loc_ymax - letter_loc_ymin)
+            width_list.append(letter_loc_xmax - letter_loc_xmin)
+
+            number_loc = (letter_loc_xmin, letter_loc_ymin, letter_loc_xmax, letter_loc_ymax, mid_x, mid_y)
+            digital_list.append({"digital": int(number), "loc": number_loc})
+
+        current_chars = [char for index, char in enumerate(ele['chars'])
+                         if index not in digital_index_detail_list and char['char'] not in ['.', ',', '。', '、']]
+
+        chars_list += current_chars
+    d_mean_height = sum(height_list) // len(height_list)
+    d_mean_width = sum(width_list) // len(width_list)
+
+    # mean_height = max(height_list)
+    # mean_width = max(width_list)
+    # print(x_list)
+    # print(y_list)
+    return digital_list, chars_list, d_mean_height, d_mean_width
+
+
+def cluster2choice_m_(cluster_list, m_h, m_w):
+    numbers = [ele['digital'] for ele in cluster_list]
+
+    loc_top_interval = (np.array([ele['loc'][3] for ele in cluster_list][1:]) -
+                        np.array([ele['loc'][3] for ele in cluster_list][:-1]))
+
+    split_index = [0]
+    for i, interval in enumerate(loc_top_interval):
+        if interval > m_h * 1.5:
+            split_index.append(i + 1)
+
+    split_index.append(len(cluster_list))
+    split_index = sorted(list(set(split_index)))
+    block_list = []
+    for i in range(len(split_index) - 1):
+        block = cluster_list[split_index[i]: split_index[i + 1]]
+
+        xmin = min([ele["loc"][0] for ele in block])
+        ymin = min([ele["loc"][1] for ele in block])
+        xmax = max([ele["loc"][2] for ele in block])
+        ymax = max([ele["loc"][3] for ele in block])
+
+        numbers = [ele['digital'] for ele in block]
+
+        choice_m = {"number": numbers, "loc": (xmin, ymin, xmax, ymax)}
+        block_list.append(choice_m)
+
+    return block_list
+
+
+def cluster2choice_m(cluster_list, mean_width):
+    # Split one cluster of detected question numbers into blocks of consecutive numbers.
+    #
+    # cluster_list: dicts with 'digital' (the number) and 'loc'; loc[0:4] are read as
+    #   xmin, ymin, xmax, ymax and loc[4] as the x midpoint (the layout built by find_digital).
+    # mean_width: threshold for the x-midpoint gap check below.
+    # Returns a list of {"numbers": [...], "loc": [xmin, ymin, xmax, ymax, mid_x, mid_y]}.
+    #
+    # Step 1: compare x coordinates and drop outliers.
+    numbers_x = [ele['loc'][4] for ele in cluster_list]
+    numbers_x_array = np.array(numbers_x)
+    numbers_x_interval = np.abs((numbers_x_array[1:] - numbers_x_array[:-1]))
+    # Positions i where the x midpoints of neighbours i and i + 1 differ by at least mean_width.
+    error_index_superset = np.where(numbers_x_interval >= mean_width)[0]
+    # Group those positions into runs of consecutive indices; t_index holds the run boundaries.
+    error_index_superset_interval = error_index_superset[1:] - error_index_superset[:-1]
+    t_index = list(np.where(error_index_superset_interval > 1)[0] + 1)
+    t_index.insert(0, 0)
+    t_index.append(len(error_index_superset))
+    error = []
+    for i in range(0, len(t_index) - 1):
+        a = t_index[i]
+        b = t_index[i + 1]
+        block = list(error_index_superset[a: b])
+        # The first position of each run is kept; the rest are treated as errors.
+        error += block[1:]
+
+    cluster_list = [ele for i, ele in enumerate(cluster_list) if i not in error]
+    numbers = [ele['digital'] for ele in cluster_list]
+    numbers_array = np.array(numbers)
+
+    # Disabled alternative: split by y-midpoint gaps instead of number gaps.
+    # numbers_y = [ele['loc'][5] for ele in cluster_list]
+    # numbers_y_array = np.array(numbers_y)
+    # numbers_y_interval = np.abs((numbers_y_array[1:] - numbers_y_array[:-1]))
+    # split_index = [0]
+    # for i, interval in enumerate(numbers_y_interval):
+    #     if interval > np.mean(numbers_y_interval):
+    #         split_index.append(i + 1)
+    #
+    # split_index.append(len(cluster_list))
+    # split_index = sorted(list(set(split_index)))
+    # for i in range(len(split_index) - 1):
+    #     block = cluster_list[split_index[i]: split_index[i + 1]]
+    #     block_numbers = numbers_array[split_index[i]: split_index[i + 1]]
+
+    # Step 2: validate the question numbers. Premise: the numbers of one block are a
+    # subset of an arithmetic progression, so numbers[i] + numbers[-1 - i] is the same
+    # for every i when nothing was misread.
+    numbers_sum = numbers_array + np.flipud(numbers_array)
+
+    # Most frequent pair sum and how often it occurs.
+    counts = np.bincount(numbers_sum)
+    mode_times = np.max(counts)
+    mode_value = np.argmax(counts)
+
+    if mode_times != len(numbers) and mode_times >= 2:
+        # Some pair sums disagree with the mode: start question-number completion.
+
+        # Most frequent step between neighbouring numbers.
+        number_interval_list = abs(numbers_array[1:] - numbers_array[:-1])
+        number_interval_counts = np.bincount(number_interval_list)
+        # number_interval_mode_times = np.max(number_interval_counts)
+        number_interval_mode_value = np.argmax(number_interval_counts)
+
+        suspect_index = np.where(numbers_sum != mode_value)[0]
+        numbers_array_len = len(numbers_array)
+        for suspect in suspect_index:
+            # A suspect is kept when at least one existing neighbour is exactly one
+            # modal step away; the first/last element only has one neighbour to check.
+            if suspect == 0:
+                cond_left = False
+                cond_right = numbers_array[suspect + 1] == numbers_array[suspect] + number_interval_mode_value
+            elif suspect == numbers_array_len - 1:
+                cond_right = False
+                cond_left = numbers_array[suspect - 1] == numbers_array[suspect] - number_interval_mode_value
+            else:
+                cond_left = numbers_array[suspect - 1] == numbers_array[suspect] - number_interval_mode_value
+                cond_right = numbers_array[suspect + 1] == numbers_array[suspect] + number_interval_mode_value
+
+            if cond_left or cond_right:
+                pass
+            else:
+                # Mark as unknown so infer_number can fill it in.
+                numbers_array[suspect] = -1
+
+        # Infer the question numbers marked -1.
+        # NOTE(review): the subtraction below needs infer_number to return an ndarray — confirm.
+        numbers_array = infer_number(numbers_array, number_interval_mode_value)
+
+    # Step 3: start a new block wherever the gap between neighbouring numbers
+    # is larger than the mean gap.
+    numbers_interval = np.abs(numbers_array[1:] - numbers_array[:-1])
+
+    split_index = [0]
+    for i, interval in enumerate(numbers_interval):
+        if interval > np.mean(numbers_interval):
+            split_index.append(i + 1)
+
+    split_index.append(len(cluster_list))
+    split_index = sorted(list(set(split_index)))
+    block_list = []
+
+    for i in range(len(split_index) - 1):
+        block = cluster_list[split_index[i]: split_index[i + 1]]
+        block_numbers = numbers_array[split_index[i]: split_index[i + 1]]
+
+        # Bounding box of the block and its midpoint.
+        xmin = min([ele["loc"][0] for ele in block])
+        ymin = min([ele["loc"][1] for ele in block])
+        xmax = max([ele["loc"][2] for ele in block])
+        ymax = max([ele["loc"][3] for ele in block])
+        mid_x = xmin + (xmax - xmin) // 2
+        mid_y = ymin + (ymax - ymin) // 2
+
+        choice_m = {"numbers": list(block_numbers), "loc": [xmin, ymin, xmax, ymax, mid_x, mid_y]}
+        block_list.append(choice_m)
+
+    return block_list
+
+
+def cluster_and_anti_abnormal(image, xml_path, digital_list, chars_list,
+                              mean_height, mean_width, choice_s_height, choice_s_width, limit_loc):
+    limit_left, limit_top, limit_right, limit_bottom = limit_loc
+    limit_width, limit_height = limit_right - limit_left, limit_bottom - limit_top
+    arr = np.ones((len(digital_list), 2))
+    for i, ele in enumerate(digital_list):
+        arr[i] = np.array([ele["loc"][-2], ele["loc"][-1]])
+
+    if choice_s_height != 0:
+        eps = int(choice_s_height * 2)
+    else:
+        eps = int(mean_height * 2.5)
+    print("eps: ", eps)
+    db = DBSCAN(eps=eps, min_samples=2, metric='chebyshev').fit(arr)
+
+    labels = db.labels_
+    # print(labels)
+
+    cluster_label = []
+    for ele in labels:
+        if ele not in cluster_label and ele != -1:
+            cluster_label.append(ele)
+
+    a_e_dict = {k: [] for k in cluster_label}
+    choice_m_numbers_list = []
+    for index, ele in enumerate(labels):
+        if ele != -1:
+            a_e_dict[ele].append(digital_list[index])
+
+    for ele in cluster_label:
+        cluster = a_e_dict[ele]
+        choice_m_numbers_list += cluster2choice_m(cluster, mean_width)
+
+    all_list_nums = [ele["numbers"] for ele in choice_m_numbers_list]
+    all_nums_len = [len(ele) for ele in all_list_nums]
+    all_nums = list(chain.from_iterable(all_list_nums))
+
+    counts = np.bincount(np.array(all_nums_len))
+    if np.max(counts) < 2:
+        mode_value = max(all_nums_len)
+    else:
+        mode_value = np.argmax(counts)
+        mode_value = all_nums_len[np.where(np.array(all_nums_len) == mode_value)[0][-1]]
+
+    if mode_value > 1:  # 缺失补全
+        error_index_list = list(np.where(np.array(all_nums_len) != mode_value)[0])
+
+        all_height = [ele["loc"][3] - ele["loc"][1] for index, ele
+                      in enumerate(choice_m_numbers_list) if index not in error_index_list]
+        choice_m_mean_height = int(sum(all_height) / len(all_height))
+
+        for e_index in list(error_index_list):
+            current_choice_m = choice_m_numbers_list[e_index]
+            current_numbers_list = list(all_list_nums[e_index])
+            current_len = all_nums_len[e_index]
+            dif = mode_value - current_len
+
+            if 1 in current_numbers_list:
+                t2 = current_numbers_list + [-1] * dif
+                infer_t1_list = infer_number(t2)  # 后补
+                infer_t2_list = infer_number(t2)  # 后补
+                cond1 = False
+                cond2 = True
+            else:
+                t1_cond = [True] * dif
+                t2_cond = [True] * dif
+
+                t1 = [-1] * dif + current_numbers_list
+                infer_t1_list = infer_number(t1)  # 前补
+                t2 = current_numbers_list + [-1] * dif
+                infer_t2_list = infer_number(t2)  # 后补
+
+                for i in range(0, dif):
+                    t1_infer = infer_t1_list[i]
+                    t2_infer = infer_t2_list[-i - 1]
+                    if t1_infer == 0 or t1_infer in all_nums:
+                        t1_cond[i] = False
+                    if t2_infer in all_nums:
+                        t2_cond[i] = False
+                cond1 = not (False in t1_cond)
+                cond2 = not (False in t2_cond)
+
+            if cond1 and not cond2:
+                current_loc = current_choice_m["loc"]
+                current_height = current_loc[3] - current_loc[1]
+
+                infer_height = max((choice_m_mean_height - current_height), int(dif * current_height / current_len))
+                choice_m_numbers_list[e_index]["loc"][1] = current_loc[1] - infer_height
+                choice_m_numbers_list[e_index]["loc"][5] = (choice_m_numbers_list[e_index]["loc"][1] +
+                                                            (choice_m_numbers_list[e_index]["loc"][3] -
+                                                             choice_m_numbers_list[e_index]["loc"][1]) // 2)
+                choice_m_numbers_list[e_index]["numbers"] = infer_t1_list
+                all_nums.extend(infer_t1_list[:dif])
+            if not cond1 and cond2:
+                current_loc = current_choice_m["loc"]
+                current_height = current_loc[3] - current_loc[1]
+
+                infer_height = max((choice_m_mean_height - current_height), int(dif * current_height / current_len))
+                infer_bottom = min(current_loc[3] + infer_height, limit_height-1)
+                if infer_bottom <= limit_height:
+                    choice_m_numbers_list[e_index]["loc"][3] = infer_bottom
+                    choice_m_numbers_list[e_index]["loc"][5] = (choice_m_numbers_list[e_index]["loc"][1] +
+                                                                (choice_m_numbers_list[e_index]["loc"][3] -
+                                                                 choice_m_numbers_list[e_index]["loc"][1]) // 2)
+                    choice_m_numbers_list[e_index]["numbers"] = infer_t2_list
+                    all_nums.extend(infer_t2_list[-dif:])
+            else:
+                # cond1 = cond2 = true, 因为infer选择题时已横向排序, 默认这种情况不会出现
+                pass
+
+    for ele in choice_m_numbers_list:
+        loc = ele["loc"]
+        if loc[3] - loc[1] >= loc[2] - loc[0]:
+            direction = 180
+        else:
+            direction = 90
+        ele.update({'direction': direction})
+    # tree = ET.parse(xml_path)
+    # for index, choice_m in enumerate(choice_m_numbers_list):
+    #     name = str(choice_m["numbers"])
+    #     xmin, ymin, xmax, ymax, _, _ = choice_m["loc"]
+    #     tree = create_xml(name, tree, str(xmin + limit_left), str(ymin + limit_top), str(xmax + limit_left), str(ymax + limit_top))
+    #
+    # tree.write(xml_path)
+
+    choice_m_numbers_list = sorted(choice_m_numbers_list, key=lambda x: x['loc'][3] - x['loc'][1], reverse=True)
+    choice_m_numbers_right_limit = max([ele['loc'][2] for ele in choice_m_numbers_list])
+    remain_len = len(choice_m_numbers_list)
+    choice_m_list = list()
+    need_revised_choice_m_list = list()
+    while remain_len > 0:
+        # 先确定属于同行的数据,然后找字母划分block
+        # random_index = random.randint(0, len(choice_m_numbers_list)-1)
+        random_index = 0
+        # print(random_index)
+        ymax_limit = choice_m_numbers_list[random_index]["loc"][3]
+        ymin_limit = choice_m_numbers_list[random_index]["loc"][1]
+        # choice_m_numbers_list.pop(random_index)
+
+        # 当前行的choice_m
+        current_row_choice_m_d = [ele for ele in choice_m_numbers_list if ymin_limit < ele["loc"][5] < ymax_limit]
+        current_row_choice_m_d = sorted(current_row_choice_m_d, key=lambda x: x["loc"][0])
+        # current_row_choice_m_d.append(choice_m_numbers_list[random_index])
+        split_pix = sorted([ele["loc"][0] for ele in current_row_choice_m_d])  # xmin排序
+        split_index = get_split_index(split_pix)
+        split_pix = [split_pix[ele] for ele in split_index[:-1]]
+
+        block_list = []
+        for i in range(len(split_index) - 1):
+            block = current_row_choice_m_d[split_index[i]: split_index[i + 1]]
+            if len(block) > 1:
+                remain_len = remain_len - (len(block) - 1)
+                numbers_new = []
+                loc_new = [[], [], [], []]
+                for blk in block:
+                    loc_old = blk["loc"]
+                    numbers_new.extend(blk["numbers"])
+                    for ii in range(4):
+                        loc_new[ii].append(loc_old[ii])
+
+                loc_new[0] = min(loc_new[0])
+                loc_new[1] = min(loc_new[1])
+                loc_new[2] = max(loc_new[2])
+                loc_new[3] = max(loc_new[3])
+
+                loc_new.append(loc_new[0] + (loc_new[2] - loc_new[0]) // 2)
+                loc_new.append(loc_new[1] + (loc_new[3] - loc_new[1]) // 2)
+
+                block = [{"numbers": sorted(numbers_new), "loc": loc_new, "direction": block[0]["direction"]}]
+
+            block_list.extend(block)
+
+        current_row_choice_m_d = block_list
+        current_row_chars = [ele for ele in chars_list
+                             if ymin_limit < (ele["location"]["top"] + ele["location"]["height"] // 2) < ymax_limit]
+
+        # if not current_row_chars:
+        #     max_char_width = choice_s_width // 4
+        #     row_chars_xmax = choice_m_numbers_right_limit + int(choice_s_width * 1.5)
+        # else:
+        #     max_char_width = max([ele["location"]["width"] for ele in current_row_chars]) // 2
+        #     row_chars_xmax = max(
+        #         [ele["location"]["left"] + ele["location"]["width"] for ele in current_row_chars]) + max_char_width * 2
+
+        # split_index.append(row_chars_xmax)  # 边界
+        split_pix.append(round(split_pix[-1] + choice_s_width * 1.2))
+        for i in range(0, len(split_index) - 1):
+            left_limit = split_index[i]
+            right_limit = split_index[i + 1]
+            block_chars = [ele for ele in current_row_chars
+                           if left_limit < (ele["location"]["left"] + ele["location"]["width"] // 2) < right_limit]
+
+            # chars_xmin = min([ele["location"]["left"] for ele in block_chars]) - max_char_width
+            # chars_xmax = max(
+            #     [ele["location"]["left"] + ele["location"]["width"] for ele in block_chars]) + max_char_width
+
+            # a_z = '_ABCD_FGH__K_MNOPQRSTUVWXYZ'  EIJL -> _
+            # a_z = '_ABCDEFGHI_K_MNOPQRSTUVWXYZ'
+            a_z = '_ABCD_FGHT'
+            # letter_text = set([ele['char'].upper() for ele in block_chars if ele['char'].upper() in a_z])
+            letter_index = [a_z.index(ele['char'].upper()) for ele in block_chars if ele['char'].upper() in a_z]
+
+            letter_index_times = {ele: 0 for ele in set(letter_index)}
+            for l_index in letter_index:
+                letter_index_times[l_index] += 1
+
+            if (a_z.index("T") in letter_index) and (a_z.index("F") in letter_index):
+                choice_option = "T, F"
+                cols = 2
+            else:
+                if len(letter_index) < 1:
+                    tmp = 4
+                    choice_option = 'A,B,C,D'
+                else:
+                    tmp = max(set(letter_index))
+                # while letter_index_times[tmp] < 2 and tmp > 3:
+                #     t_list = list(set(letter_index))
+                #     t_list.remove(tmp)
+                #     tmp = max(t_list)
+
+                    choice_option = ",".join(a_z[min(letter_index):tmp + 1])
+                cols = tmp
+
+            bias = 3  # pix
+            current_loc = current_row_choice_m_d[i]["loc"]
+            location = dict(xmin=(current_loc[2] + bias) + limit_left,  # 当前数字xmax右边
+                            # xmin=max(current_loc[2] + bias, chars_xmin) + limit_left,
+                            ymin=current_loc[1] + limit_top,
+
+                            xmax=(right_limit - bias) + limit_left,
+                            # xmax=min(chars_xmax, right_limit - bias) + limit_left,
+                            ymax=current_loc[3] + limit_top)
+
+            try:
+                choice_m_img = utils.crop_region(image, location)
+                right_loc, bottom_loc = adjust_choice_m(choice_m_img, mean_height, mean_width * 2)
+                if right_loc > 0:
+                    location.update(dict(xmax=right_loc + location['xmin']))
+                if bottom_loc > 0:
+                    location.update(dict(ymax=bottom_loc + location['ymin']))
+            except Exception as e:
+                print(e)
+                traceback.print_exc()
+
+            tmp_w, tmp_h = location['xmax'] - location['xmin'], location['ymax'] - location['ymin'],
+            numbers = current_row_choice_m_d[i]["numbers"]
+            direction = current_row_choice_m_d[i]["direction"]
+            if direction == 180:
+                choice_m = dict(class_name='choice_m',
+                                number=numbers,
+                                bounding_box=location,
+                                choice_option=choice_option,
+                                default_points=[5] * len(numbers),
+                                direction=direction,
+                                cols=cols,
+                                rows=len(numbers),
+                                single_width=tmp_w // cols,
+                                single_height=tmp_h // len(numbers))
+            else:
+                choice_m = dict(class_name='choice_m',
+                                number=numbers,
+                                bounding_box=location,
+                                choice_option=choice_option,
+                                default_points=[5] * len(numbers),
+                                direction=direction,
+                                cols=len(numbers),
+                                rows=cols,
+                                single_width=tmp_w // len(numbers),
+                                single_height=tmp_h // cols
+                                )
+
+            if tmp_w > 2 * choice_s_width:
+                need_revised_choice_m_list.append(choice_m)
+            else:
+                choice_m_list.append(choice_m)
+
+        remain_len = remain_len - len(current_row_choice_m_d)
+        for ele in choice_m_numbers_list.copy():
+            if ele in current_row_choice_m_d:
+                choice_m_numbers_list.remove(ele)
+
+        for ele in choice_m_numbers_list.copy():
+            if ele in current_row_chars:
+                choice_m_numbers_list.remove(ele)
+
+    # 单独一行不聚类
+    for i, revised_choice_m in enumerate(need_revised_choice_m_list):
+        loc = revised_choice_m['bounding_box']
+        left_part_loc = loc.copy()
+        left_part_loc.update({'xmax': loc['xmin']+choice_s_width})
+        choice_m_img = utils.crop_region(image, left_part_loc)
+        right_loc, bottom_loc = adjust_choice_m(choice_m_img, mean_height, mean_width * 2)
+        if right_loc > 0:
+            left_part_loc.update(dict(xmax=right_loc + left_part_loc['xmin']))
+        if bottom_loc > 0:
+            left_part_loc.update(dict(ymax=bottom_loc + left_part_loc['ymin']))
+
+        left_tmp_height = left_part_loc['ymax'] - left_part_loc['ymin']
+
+        right_part_loc = loc.copy()
+        # right_part_loc.update({'xmin': loc['xmax']-choice_s_width})
+        right_part_loc.update({'xmin': left_part_loc['xmax']+5})
+        choice_m_img = utils.crop_region(image, right_part_loc)
+        right_loc, bottom_loc = adjust_choice_m(choice_m_img, mean_height, mean_width * 2)
+        if right_loc > 0:
+            right_part_loc.update(dict(xmax=right_loc + right_part_loc['xmin']))
+        if bottom_loc > 0:
+            right_part_loc.update(dict(ymax=bottom_loc + right_part_loc['ymin']))
+
+        right_tmp_height = right_part_loc['ymax'] - right_part_loc['ymin']
+
+        number_len = max(1, int(revised_choice_m['rows'] // (left_tmp_height // right_tmp_height)))
+        number = [ele+revised_choice_m['number'][-1]+1 for ele in range(number_len)]
+        rows = len(number)
+
+        revised_choice_m.update({'bounding_box': left_part_loc})
+        choice_m_list.append(revised_choice_m)
+
+        tmp = revised_choice_m.copy()
+        tmp.update({'bounding_box': right_part_loc, 'number': number, 'rows': rows})
+        choice_m_list.append(tmp)
+
+    tmp = choice_m_list.copy()
+    for ele in tmp:
+        loc = ele["bounding_box"]
+        w, h = loc['xmax'] - loc['xmin'], loc['ymax'] - loc['ymin']
+        if w*h < choice_s_width*choice_s_height:
+            choice_m_list.remove(ele)
+    return choice_m_list
+
+
+def infer_choice_m(image, tf_sheet, ocr, xml=None):
+    infer_box_list = ocr2sheet(image, tf_sheet, ocr, xml)
+    # print(sheet_region_list)
+    choice_m_list = []
+
+    choice_s_h_list = [int(ele['bounding_box']['ymax']) - int(ele['bounding_box']['ymin']) for ele in tf_sheet
+                       if ele['class_name'] == 'choice_s']
+    if choice_s_h_list:
+        choice_s_height = sum(choice_s_h_list) // len(choice_s_h_list)
+    else:
+        choice_s_height = 0
+
+    choice_s_w_list = [int(ele['bounding_box']['xmax']) - int(ele['bounding_box']['xmin']) for ele in tf_sheet
+                       if ele['class_name'] == 'choice_s']
+    if choice_s_w_list:
+        choice_s_width = sum(choice_s_w_list) // len(choice_s_w_list)
+
+    else:
+        choice_s_width = 0
+
+    for infer_box in infer_box_list:
+        # {'loc': [240, 786, 1569, 1368]}
+        loc = infer_box['loc']
+        xmin, ymin, xmax, ymax = loc[0], loc[1], loc[2], loc[3]
+        choice_flag = False
+
+        for ele in tf_sheet:
+            if ele['class_name'] in ['choice_m', 'choice_s']:
+                tf_loc = ele['bounding_box']
+                tf_loc_l = tf_loc['xmin']
+                tf_loc_t = tf_loc['ymin']
+                if xmin < tf_loc_l < xmax and ymin < tf_loc_t < ymax:
+                    choice_flag = True
+                    break
+
+        if choice_flag:
+            infer_image = utils.crop_region_direct(image, loc)
+            try:
+                save_dir = os.path.join(settings.MEDIA_ROOT, 'tmp')
+                if not os.path.exists(save_dir):
+                    os.makedirs(save_dir)
+                save_path = os.path.join(save_dir, 'choice.jpeg')
+                cv2.imwrite(save_path, infer_image)
+                img_tmp = utils.read_single_img(save_path)
+                os.remove(save_path)
+                ocr = brain_api.get_ocr_text_and_coordinate(img_tmp, 'accurate', 'CHN_ENG')
+            except Exception as e:
+                print('write choice and ocr failed')
+                traceback.print_exc()
+                ocr = brain_api.get_ocr_text_and_coordinate(infer_image, 'accurate', 'CHN_ENG')
+
+            try:
+                digital_list, chars_list, digital_mean_h, digital_mean_w = find_digital(ocr)
+                choice_m = cluster_and_anti_abnormal(image, xml, digital_list, chars_list,
+                                                     digital_mean_h, digital_mean_w,
+                                                     choice_s_height, choice_s_width, loc)
+
+                choice_m_list.extend(choice_m)
+            except Exception as e:
+                traceback.print_exc()
+                print('not found choice feature')
+                pass
+
+    # print(choice_m_list)
+    # tf_choice_sheet = [ele for ele in tf_sheet if ele['class_name'] == 'choice_m']
+
+    sheet_tmp = choice_m_list.copy()
+    remove_index = []
+    for i, region in enumerate(sheet_tmp):
+        if i not in remove_index:
+            box = region['bounding_box']
+            for j, region_in in enumerate(sheet_tmp):
+                box_in = region_in['bounding_box']
+                iou = utils.cal_iou(box, box_in)
+                if iou[0] > 0.85 and i != j:
+                    choice_m_list.remove(region)
+                    remove_index.append(j)
+                    break
+
+    return choice_m_list

+ 2932 - 0
segment/sheet_resolve/analysis/sheet/ocr_key_words.py

@@ -0,0 +1,2932 @@
+import re
+
+
+def find_repeat(source, elmt):
+    """Collect every index at which ``elmt`` occurs in ``source``.
+
+    Used to re-locate a number inside the OCR string when the same digits
+    appear more than once. The scan resumes one position after each hit,
+    so overlapping occurrences are reported as well.
+
+    :param source: sequence supporting ``index(value, start, stop)`` and ``len``
+    :param elmt: value (or substring) to look for
+    :return: list of start indexes in ascending order; empty when not found
+    """
+    positions = []
+    cursor, limit = 0, len(source)
+    while cursor < limit:
+        try:
+            hit = source.index(elmt, cursor, limit)
+        except ValueError:
+            # No further occurrence in the remaining slice.
+            return positions
+        positions.append(hit)
+        cursor = hit + 1
+    return positions
+
+
+def ocr_key_words(rect, type_score_dict, margin=30):
+    """Attach the OCR characters that fall inside a detected box to that box.
+
+    :param rect: OCR result dict; ``rect['chars']`` and ``rect['coordinates']``
+        are read in parallel, e.g.
+        ``{'chars': [...], 'coordinates': [(x0, y0, x1, y1), ...], 'words': [...]}``
+    :param type_score_dict: model output for one region; its ``'type_box'``
+        entry is read as ``[xmin, ymin, xmax, ymax]``. The dict is modified
+        in place: a ``'words'`` key is added (or overwritten).
+    :param margin: how far a character box may stick out of ``type_box`` on
+        each side and still be counted as inside. Defaults to 30, the
+        value that used to be hard-coded.
+    :return: the same ``type_score_dict`` object, now carrying ``'words'``
+    """
+    box = type_score_dict['type_box']
+    xmin, ymin, xmax, ymax = box[0], box[1], box[2], box[3]
+    # zip() pairs characters with coordinates and stops at the shorter list,
+    # so an OCR result with mismatched lengths no longer raises IndexError.
+    words = [
+        char
+        for char, coord in zip(rect['chars'], rect['coordinates'])
+        if coord[0] - xmin > -margin
+        and coord[1] - ymin > -margin
+        and coord[2] - xmax < margin
+        and coord[3] - ymax < margin
+    ]
+    type_score_dict['words'] = words
+    return type_score_dict
+
+
+def key_words(type_score_dict_ocr):  # 根据OCR结果结合关键字解析
+
+    total_score = 0
+    volume_score = 0
+    volume_structure_item = 0
+    volume_structure = []
+    Score_structure_item = 0
+    Score_structure = []
+    all_structure = {}
+    keyword_volume = ['第卷', '第部']
+    keyword_type = ['选择', '非选择题', '综合题', '问答题', '主观题', '客观题', '解答题','计算题']
+    len_keyword_type = len(keyword_type)
+    keyword_item1 = ['共分', '合计分', '总共分', '总计分', '小题满分', '本小题', '满分', '共计', '共.分', '合计.分', '总共.分', '总计.分', '小题满分.','本小题.', '满分.', '共计.']
+    len_keyword_item1 = len(keyword_item1)
+    keyword_item2 = ['每题分', '每小题分', '空分', '每小题.分', '每题.分', '空.分']  # '分/题'暂未考虑
+    len_keyword_item2 = len(keyword_item2)
+    keyword_item3 = ['共题', '共小题', '分小题', '本题小题', '共个小题', '分为小题', '分个小题','本大题共小题']
+    len_keyword_item3 = len(keyword_item3)
+    keyword_item4 = ['分']
+    len_keyword_item4 = len(keyword_item4)
+    keyword_item5 = ['分/题']
+    len_keyword_item5 = len(keyword_item5)
+    keyword_item5 = ['题', '.', '、']
+
+    ocr_1 = type_score_dict_ocr['words']
+    s = ''.join((str(x) for x in ocr_1))  # 合并为一个字符串
+    if s.find('IV') != -1 or s.find('Ⅳ') != -1:
+        s = s.replace('Ⅳ', '4')
+        s = s.replace('IV', '4')
+    elif s.find('III') != -1 or s.find('Ⅲ') != -1:
+        s = s.replace('Ⅲ', '3')
+        s = s.replace('III', '3')
+    elif s.find('II') != -1 or s.find('Ⅱ') != -1:
+        s = s.replace('Ⅱ', '2')
+        s = s.replace('II', '2')
+    elif s.find('VI') != -1 or s.find('Ⅵ') != -1:
+        s = s.replace('Ⅵ', '6')
+        s = s.replace('VI', '6')
+    elif s.find('VII') != -1 or s.find('Ⅶ') != -1:
+        s = s.replace('Ⅶ', '7')
+        s = s.replace('VII', '7')
+    elif s.find('VIII') != -1 or s.find('Ⅷ') != -1:
+        s = s.replace('Ⅷ', '8')
+        s = s.replace('VIII', '8')
+    elif s.find('IX') != -1 or s.find('Ⅸ') != -1:
+        s = s.replace('Ⅸ', '9')
+        s = s.replace('IX', '9')
+    elif s.find('X') != -1 or s.find('Ⅹ') != -1:
+        s = s.replace('Ⅹ', '10')
+        s = s.replace('X', '10')
+    elif s.find('I') != -1 or s.find('Ⅰ') != -1:
+        s = s.replace('Ⅰ', '1')
+        s = s.replace('I', '1')
+    elif s.find('V') != -1 or s.find('Ⅴ') != -1:
+        s = s.replace('Ⅴ', '5')
+        s = s.replace('V', '5')
+
+    C_s = re.sub("[A-Za-z0-9\!\%\[\]\,\。]", "", s)  # 提取汉字
+    E_s = ''.join(re.findall(r'[A-Za-z]', s))  # 提取英文字符
+    N_s = re.findall('\d+', s)  # 提取阿拉伯数字
+
+    if len(N_s) == 1 and len(N_s[0]) < 6 and len(E_s) == 0 and (C_s == keyword_item5[0] or C_s == keyword_item5[1] or len(C_s) == 0):
+        type_score_dict_ocr['item_N'] = int(N_s[0])
+        type_score_dict_ocr['item_total_score'] = -1
+        type_score_dict_ocr['item_count'] = -1
+        type_score_dict_ocr['item_score'] = -1
+        Score_structure_item = type_score_dict_ocr
+        Score_structure.append(Score_structure_item)
+        all_structure = {'volume_structure': -1,
+                         'Score_structure': Score_structure}
+    elif N_s != []:
+        for iiii in range(len(keyword_volume)):
+            Score_structure_item = {}
+            if C_s.find(keyword_volume[iiii]) != -1:
+                '''
+                对应试卷中存在分卷信息的情况,根据包含数字的个数分为5类,暂定包含信息的有效数字个数小于5,并处理小题分数和总分可能包含小数点的情况
+                暂定小题个数不包含小数
+                暂定总分数中不存在有意义的小数位
+                '''
+                if len(N_s) == 1:
+                    num_index = s.index(N_s[0])
+                    num_infer = s[num_index - len(N_s[0])]
+                    num_back = s[num_index + len(N_s[0])]
+                    if num_back == '分':  # 第卷/部*分
+                        volume_score = int(N_s[0])
+                        type_score_dict_ocr['volume_N'] = -1
+                        type_score_dict_ocr['volume_total_score'] = volume_score
+                        type_score_dict_ocr['volume_count'] = -1
+                        type_score_dict_ocr['volume_score'] = -1
+                    elif num_back == '卷' or num_back == '部':  # 第*卷
+                        volume_N = int(N_s[0])
+                        type_score_dict_ocr['volume_N'] = volume_N
+                        type_score_dict_ocr['volume_total_score'] = -1
+                        type_score_dict_ocr['volume_count'] = -1
+                        type_score_dict_ocr['volume_score'] = -1
+                elif len(N_s) == 2:
+                    num_index1 = s.index(N_s[0])
+                    num_infer1 = s[num_index1 - len(N_s[0])]
+                    num_back1 = s[num_index1 + len(N_s[0])]
+                    all_1 = find_repeat(s, N_s[1])
+                    temp1 = 0
+                    for ii in range(len(N_s[0])):
+                        if N_s[0][ii] == N_s[1]:
+                            temp1 = temp1 + 1
+                    num_index2 = all_1[temp1]
+                    num_infer2 = s[num_index2 - len(N_s[1])]
+                    num_back2 = s[num_index2 + len(N_s[1])]
+                    if isinstance(N_s[0], str):
+                        N_s[0] = int(N_s[0])
+                    if isinstance(N_s[1], str):
+                        N_s[1] = int(N_s[1])
+                    for k in range(len_keyword_item1):
+                        if C_s.find(keyword_item1[k]) != -1:
+                            if (num_back1 == '卷' or num_back1 == '部') and num_back2 == '分':  # 第*卷*分
+                                volume_N = N_s[0]
+                                volume_score = N_s[1]
+                                type_score_dict_ocr['volume_N'] = volume_N
+                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                type_score_dict_ocr['volume_count'] = -1
+                                type_score_dict_ocr['volume_score'] = -1
+                                break
+                            elif num_back1 == '.' and num_infer2 == '.' and num_back2 == '分':  # 第卷,共*.*分
+                                volume_N = -1
+                                volume_score = N_s[0]
+                                type_score_dict_ocr['volume_N'] = volume_N
+                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                type_score_dict_ocr['volume_count'] = -1
+                                type_score_dict_ocr['volume_score'] = -1
+                                break
+                            else:
+                                for l in range(len_keyword_item2):
+                                    if C_s.find(keyword_item2[l]) != -1:
+                                        if (num_infer1 == '题' or num_infer1 == '空') and num_back2 == '分':  # 第卷,每小题*分,共*分
+                                            volume_score = N_s[1]
+                                            item_score = N_s[0]
+                                            item_count = int(volume_score / item_score)
+                                            type_score_dict_ocr['volume_N'] = -1
+                                            type_score_dict_ocr['volume_total_score'] = volume_score
+                                            type_score_dict_ocr['volume_count'] = item_count
+                                            type_score_dict_ocr['volume_score'] = item_score
+                                            break
+                                        elif (num_infer2 == '题' or num_infer2 == '空') and num_back1 == '分':  # 第卷,共*分 ,每小题*分
+                                            volume_score = N_s[0]
+                                            item_score = N_s[1]
+                                            item_count = int(volume_score / item_score)
+                                            type_score_dict_ocr['volume_N'] = -1
+                                            type_score_dict_ocr['volume_total_score'] = volume_score
+                                            type_score_dict_ocr['volume_count'] = item_count
+                                            type_score_dict_ocr['volume_score'] = item_score
+                                            break
+                                    elif l == len(keyword_item2) - 1:
+                                        for m in range(len_keyword_item3):
+                                            if C_s.find(keyword_item3[l]) != -1:
+                                                if num_back2 == '分':  # 第卷,共*小题,共*分
+                                                    volume_score = N_s[1]
+                                                    item_count = N_s[0]
+                                                    item_score = volume_score / item_count
+                                                    type_score_dict_ocr['volume_N'] = -1
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                elif num_back1 == '分':  # 第卷,共*分 ,共*小题
+                                                    volume_score = N_s[0]
+                                                    item_count = N_s[1]
+                                                    item_score = volume_score / item_count
+                                                    type_score_dict_ocr['volume_N'] = -1
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                    break
+                            break
+                        elif k == len_keyword_item1 - 1:
+                            for l in range(len_keyword_item2):
+                                if C_s.find(keyword_item2[l]) != -1:
+                                    for m in range(len_keyword_item3):
+                                        if C_s.find(keyword_item3[l]) != -1:
+                                            if num_back2 == '分':  # 第卷,共*小题,每小题*分
+                                                item_count = N_s[0]
+                                                item_score = N_s[1]
+                                                volume_score = item_score * item_count
+                                                type_score_dict_ocr['volume_N'] = -1
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif num_back1 == '分':  # 第卷,每小题*分 ,共*小题
+                                                item_count = N_s[1]
+                                                item_score = N_s[0]
+                                                volume_score = item_count * item_score
+                                                type_score_dict_ocr['volume_N'] = -1
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                        elif m == len_keyword_item3 - 1:
+                                            if num_back2 == '分':  # 第卷,每小题*.*分
+                                                volume_score = -1
+                                                item_count = -1
+                                                item_score = float(N_s[0] + '.' + N_s[1])
+                                                type_score_dict_ocr['volume_N'] = -1
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+
+                                break
+                            break
+                elif len(N_s) == 3:
+                    num_index1 = s.index(N_s[0])
+                    num_infer1 = s[num_index1 - len(N_s[0])]
+                    num_back1 = s[num_index1 + len(N_s[0])]
+                    all_1 = find_repeat(s, N_s[1])
+                    temp1 = 0
+                    for ii in range(len(N_s[0])):
+                        if N_s[0][ii] == N_s[1]:
+                            temp1 = temp1 + 1
+                    num_index2 = all_1[temp1]
+                    num_infer2 = s[num_index2 - len(N_s[1])]
+                    num_back2 = s[num_index2 + len(N_s[1])]
+                    all_2 = find_repeat(s, N_s[2])
+                    temp2 = 0
+                    for ii in range(len(N_s[0])):
+                        if N_s[0][ii] == N_s[2]:
+                            temp2 = temp2 + 1
+                    for jj in range(len(N_s[1])):
+                        if N_s[1][jj] == N_s[2]:
+                            temp2 = temp2 + 1
+                    num_index3 = all_2[temp2]
+                    num_infer3 = s[num_index3 - len(N_s[2])]
+                    num_back3 = s[num_index3 + len(N_s[2])]
+                    if isinstance(N_s[0], str):
+                        N_s[0] = int(N_s[0])
+                    if isinstance(N_s[1], str):
+                        N_s[1] = int(N_s[1])
+                    if isinstance(N_s[2], str):
+                        N_s[2] = int(N_s[2])
+                    for l in range(len_keyword_item3):
+                        if C_s.find(keyword_item3[l]) != -1:
+                            for m in range(len_keyword_item2):
+                                if C_s.find(keyword_item2[m]) != -1:
+                                    if (num_back1 == '卷' or num_back1 == '部') and num_back3 == '分':  # 第*卷,共*题,每题*分
+                                        volume_N = N_s[0]
+                                        item_count = N_s[1]
+                                        item_score = N_s[2]
+                                        volume_score = item_count * item_score
+                                        type_score_dict_ocr['volume_N'] = volume_N
+                                        type_score_dict_ocr['volume_total_score'] = volume_score
+                                        type_score_dict_ocr['volume_count'] = item_count
+                                        type_score_dict_ocr['volume_score'] = item_score
+                                        break
+                                    elif (num_back1 == '卷' or num_back1 == '部') and num_back2 == '分':  # 第*卷,每题*分,共*题
+                                        volume_N = N_s[0]
+                                        item_count = N_s[2]
+                                        item_score = N_s[1]
+                                        volume_score = item_count * item_score
+                                        type_score_dict_ocr['volume_N'] = volume_N
+                                        type_score_dict_ocr['volume_total_score'] = volume_score
+                                        type_score_dict_ocr['volume_count'] = item_count
+                                        type_score_dict_ocr['volume_score'] = item_score
+                                        break
+                                    elif (num_back1 == '卷' or num_back1 == '部') and num_back2 == '分':  # 第卷,每题*.*分,共*题
+                                        volume_N = -1
+                                        item_score = float(N_s[0] + '.' + N_s[1])
+                                        item_count = N_s[2]
+                                        volume_score = item_score * item_count
+                                        type_score_dict_ocr['volume_N'] = volume_N
+                                        type_score_dict_ocr['volume_total_score'] = volume_score
+                                        type_score_dict_ocr['volume_count'] = item_count
+                                        type_score_dict_ocr['volume_score'] = item_score
+                                        break
+                                    elif (num_back1 == '卷' or num_back1 == '部') and num_back3 == '分':  # 第卷,共*题,每题*.*分
+                                        volume_N = -1
+                                        item_score = float(N_s[1] + '.' + N_s[2])
+                                        item_count = N_s[0]
+                                        volume_score = item_score * item_count
+                                        type_score_dict_ocr['volume_N'] = volume_N
+                                        type_score_dict_ocr['volume_total_score'] = volume_score
+                                        type_score_dict_ocr['volume_count'] = item_count
+                                        type_score_dict_ocr['volume_score'] = item_score
+                                        break
+                                    elif num_back1 == '分' and (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分':  # 第卷,共*分,每题*分,共*题
+                                        volume_score = N_s[0]
+                                        item_count = N_s[2]
+                                        item_score = N_s[1]
+                                        type_score_dict_ocr['volume_N'] = -1
+                                        type_score_dict_ocr['volume_total_score'] = volume_score
+                                        type_score_dict_ocr['volume_count'] = item_count
+                                        type_score_dict_ocr['volume_score'] = item_score
+                                        break
+                                    elif num_back1 == '分' and (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分':  # 第卷,共*分,共*题,每题*分
+                                        volume_score = N_s[0]
+                                        item_count = N_s[1]
+                                        item_score = N_s[2]
+                                        type_score_dict_ocr['volume_N'] = -1
+                                        type_score_dict_ocr['volume_total_score'] = volume_score
+                                        type_score_dict_ocr['volume_count'] = item_count
+                                        type_score_dict_ocr['volume_score'] = item_score
+                                        break
+                                    elif num_back2 == '分' and (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分':  # 第卷,共*题,共*分,每题*分
+                                        volume_score = N_s[1]
+                                        item_count = N_s[0]
+                                        item_score = N_s[2]
+                                        type_score_dict_ocr['volume_N'] = -1
+                                        type_score_dict_ocr['volume_total_score'] = volume_score
+                                        type_score_dict_ocr['volume_count'] = item_count
+                                        type_score_dict_ocr['volume_score'] = item_score
+                                        break
+                                    elif num_infer3 == '分' and (num_infer1 == '题' or num_infer1 == '空') and num_back1 == '分':  # 第卷,每题*分,共*题,共*分
+                                        volume_score = N_s[2]
+                                        item_count = N_s[1]
+                                        item_score = N_s[0]
+                                        type_score_dict_ocr['volume_N'] = -1
+                                        type_score_dict_ocr['volume_total_score'] = volume_score
+                                        type_score_dict_ocr['volume_count'] = item_count
+                                        type_score_dict_ocr['volume_score'] = item_score
+                                        break
+                                    elif num_back3 == '分' and (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分':  # 第卷,共*题,每题*分,共*分
+                                        volume_score = N_s[2]
+                                        item_count = N_s[0]
+                                        item_score = N_s[1]
+                                        type_score_dict_ocr['volume_N'] = -1
+                                        type_score_dict_ocr['volume_total_score'] = volume_score
+                                        type_score_dict_ocr['volume_count'] = item_count
+                                        type_score_dict_ocr['volume_score'] = item_score
+                                        break
+                                    elif num_back3 == '分' and (num_infer1 == '题' or num_infer1 == '空') and num_back1 == '分':  # 第卷,每题*分,共*题,共*分
+                                        volume_score = N_s[2]
+                                        item_count = N_s[1]
+                                        item_score = N_s[0]
+                                        type_score_dict_ocr['volume_N'] = -1
+                                        type_score_dict_ocr['volume_total_score'] = volume_score
+                                        type_score_dict_ocr['volume_count'] = item_count
+                                        type_score_dict_ocr['volume_score'] = item_score
+                                        break
+                                elif m == len_keyword_item2 - 1:
+                                    for n in range(len_keyword_item1):
+                                        if C_s.find(keyword_item1[n]) != -1:
+                                            if (num_back1 == '卷' or num_back1 == '部') and (num_back2 == '题' or num_back2 == '小') and num_back3 == '分':  # 第*卷,共*题,共*分
+                                                volume_N = N_s[0]
+                                                volume_score = N_s[2]
+                                                item_count = N_s[1]
+                                                item_score = volume_score / item_count
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_back3 == '题' or num_back3 == '小') and num_back2 == '分':  # 第*卷,共*分,共*题
+                                                volume_N = N_s[0]
+                                                volume_score = N_s[1]
+                                                item_count = N_s[2]
+                                                item_score = volume_score / item_count
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif num_back1 == '.' and num_infer2 == '.' and (num_back3 == '题' or num_back3 == '小') and num_back2 == '分':  # 第卷,共*.*分,共*题
+                                                volume_N = -1
+                                                volume_score = N_s[0]
+                                                item_count = N_s[2]
+                                                item_score = volume_score / item_count
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif num_back2 == '.' and num_infer3 == '.' and (num_back1 == '题' or num_back1 == '小') and num_back3 == '分':  # 第卷,共*题,共*.*分
+                                                volume_N = -1
+                                                volume_score = N_s[1]
+                                                item_count = N_s[0]
+                                                item_score = volume_score / item_count
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+
+                                break
+                            break
+                        elif l == len_keyword_item3 - 1:
+                            for p in range(len_keyword_item1):
+                                if C_s.find(keyword_item1[p]) != -1:
+                                    for q in range(len_keyword_item2):
+                                        if C_s.find(keyword_item2[q]) != -1:
+                                            if (num_back1 == '卷' or num_back1 == '部') and num_back2 == '分' and num_back3 == '分':  # 第*卷,共*分,每题*分 /  第*卷,每题*分,共*分
+                                                volume_N = int(N_s[0])
+                                                if N_s[1] > N_s[2]:
+                                                    volume_score = N_s[1]
+                                                    item_score = N_s[2]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[2]
+                                                    item_score = N_s[1]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif num_back1 == '.' and num_infer2 == '.' and num_back2 == '分' and num_back3 == '分':  # 第卷,共*.*分,每题*分 /  第卷,每题*.*分,共*分
+                                                volume_N = -1
+                                                if N_s[0] > N_s[2]:
+                                                    volume_score = N_s[0]
+                                                    item_score = N_s[2]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[2]
+                                                    item_score = float(N_s[0] + '.' + N_s[1])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif num_back2 == '.' and num_infer3 == '.' and num_back1 == '分' and num_back3 == '分':  # 第卷,共*分,每题*.*分 /  第卷,每题*分,共*.*分
+                                                volume_N = -1
+                                                if N_s[0] > N_s[1]:
+                                                    volume_score = N_s[0]
+                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[1]
+                                                    item_score = N_s[0]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+
+
+                                    break
+                            break
+                elif len(N_s) == 4:
+                    num_index1 = s.index(N_s[0])
+                    num_infer1 = s[num_index1 - len(N_s[0])]
+                    num_back1 = s[num_index1 + len(N_s[0])]
+                    all_1 = find_repeat(s, N_s[1])
+                    temp1 = 0
+                    for ii in range(len(N_s[0])):
+                        if N_s[0][ii] == N_s[1]:
+                            temp1 = temp1 + 1
+                    num_index2 = all_1[temp1]
+                    num_infer2 = s[num_index2 - len(N_s[1])]
+                    num_back2 = s[num_index2 + len(N_s[1])]
+                    all_2 = find_repeat(s, N_s[2])
+                    temp2 = 0
+                    for ii in range(len(N_s[0])):
+                        if N_s[0][ii] == N_s[2]:
+                            temp2 = temp2 + 1
+                    for jj in range(len(N_s[1])):
+                        if N_s[1][jj] == N_s[2]:
+                            temp2 = temp2 + 1
+                    num_index3 = all_2[temp2]
+                    num_infer3 = s[num_index3 - len(N_s[2])]
+                    num_back3 = s[num_index3 + len(N_s[2])]
+                    all_3 = find_repeat(s, N_s[3])
+                    temp3 = 0
+                    for ii in range(len(N_s[0])):
+                        if N_s[0][ii] == N_s[3]:
+                            temp3 = temp3 + 1
+                    for jj in range(len(N_s[1])):
+                        if N_s[1][jj] == N_s[3]:
+                            temp3 = temp3 + 1
+                    for kk in range(len(N_s[2])):
+                        if N_s[2][kk] == N_s[3]:
+                            temp3 = temp3 + 1
+                    num_index4 = all_3[temp3]
+                    num_infer4 = s[num_index4 - len(N_s[3])]
+                    num_back4 = s[num_index4 + len(N_s[3])]
+                    if isinstance(N_s[0], str):
+                        N_s[0] = int(N_s[0])
+                    if isinstance(N_s[1], str):
+                        N_s[1] = int(N_s[1])
+                    if isinstance(N_s[2], str):
+                        N_s[2] = int(N_s[2])
+                    if isinstance(N_s[3], str):
+                        N_s[3] = int(N_s[3])
+                    for l in range(len_keyword_item1):
+                        if C_s.find(keyword_item1[l]) != -1:
+                            for m in range(len_keyword_item2):
+                                if C_s.find(keyword_item2[m]) != -1:
+                                    for n in range(len_keyword_item3):
+                                        if C_s.find(keyword_item3[n]) != -1:
+                                            if (num_back1 == '卷' or num_back1 == '部') and (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back4 == '分':  # 第*卷,每题*分,共*题,共*分
+                                                volume_N = N_s[0]
+                                                volume_score = N_s[3]
+                                                item_count = N_s[2]
+                                                item_score = N_s[1]
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分' and num_back4 == '分':  # 第*卷,共*题,每题*分,共*分
+                                                volume_N = N_s[0]
+                                                volume_score = N_s[3]
+                                                item_count = N_s[1]
+                                                item_score = N_s[2]
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_infer4 == '题' or num_infer4 == '空') and num_back4 == '分' and num_back2 == '分':  # 第*卷,共*分,共*题,每题*分
+                                                volume_N = N_s[0]
+                                                volume_score = N_s[1]
+                                                item_count = N_s[2]
+                                                item_score = N_s[3]
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_infer4 == '题' or num_infer4 == '空') and num_back4 == '分' and num_back3 == '分':  # 第*卷,共*题,共*分,每题*分
+                                                volume_N = N_s[0]
+                                                volume_score = N_s[2]
+                                                item_count = N_s[1]
+                                                item_score = N_s[3]
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分' and num_back2 == '分':  # 第*卷,共*分,每题*分,共*题
+                                                volume_N = N_s[0]
+                                                volume_score = N_s[1]
+                                                item_count = N_s[3]
+                                                item_score = N_s[2]
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back3 == '分':  # 第*卷,每题*分,共*分,共*题
+                                                volume_N = N_s[0]
+                                                volume_score = N_s[2]
+                                                item_count = N_s[3]
+                                                item_score = N_s[1]
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+
+                                            elif (num_back1 == '.' and num_infer2 == '.') and num_back2 == '分' and num_back3 == '分':  # 第卷,每题*.*分,共*分,共*题/第卷,共*.*分,每题*分,共*题
+                                                volume_N = -1
+                                                if int(N_s[0]) > int(N_s[2]):
+                                                    volume_score = N_s[0]
+                                                    item_score = N_s[2]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[2]
+                                                    item_score = float(N_s[0] + '.' + N_s[1])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back2 == '.' and num_infer3 == '.') and num_back1 == '分' and num_back3 == '分':  # 第卷,每题*分,共*.*分,共*题/第卷,共*分,每题*.*分,共*题
+                                                volume_N = -1
+                                                if int(N_s[0]) > int(N_s[1]):
+                                                    volume_score = N_s[0]
+                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[1]
+                                                    item_score = N_s[0]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back2 == '.' and num_infer3 == '.') and num_back3 == '分' and num_back4 == '分':  # 第卷,共*题,共*.*分,每题*分/第卷,共*题,每题*.*分,共*分
+                                                volume_N = -1
+                                                if N_s[1] > N_s[3]:
+                                                    volume_score = N_s[1]
+                                                    item_score = N_s[3]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[4]
+                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back3 == '.' and num_infer4 == '.') and num_back2 == '分' and num_back4 == '分':  # 第卷,共*题,共*分,每题*.*分/第卷,共*题,每题*分,共*.*分
+                                                volume_N = -1
+                                                if int(N_s[1]) > int(N_s[2]):
+                                                    volume_score = N_s[1]
+                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[2]
+                                                    item_score = N_s[1]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back1 == '.' and num_infer2 == '.') and num_back2 == '分' and num_back4 == '分':  # 第卷,共*.*分,共*题,每题*分/第卷,每题*.*分,共*题,共*分
+                                                volume_N = -1
+                                                if int(N_s[0]) > int(N_s[3]):
+                                                    volume_score = N_s[0]
+                                                    item_score = N_s[3]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[3]
+                                                    item_score = float(N_s[0] + '.' + N_s[1])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back3 == '.' and num_infer4 == '.') and num_back1 == '分' and num_back4 == '分':  # 第卷,共*分,共*题,每题*.*分/第卷,每题*分,共*题,共*.*分
+                                                volume_N = -1
+                                                if int(N_s[0]) > int(N_s[2]):
+                                                    volume_score = N_s[0]
+                                                    item_score = N_s[2] + '.' + N_s[3]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[2]
+                                                    item_score = N_s[0]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                        elif n == len_keyword_item3 - 1:
+                                            if (num_back1 == '卷' or num_back1 == '部') and num_back2 == '.' and num_infer3 == '.' and num_back3 == '分' and num_back4 == '分':  # 第*卷,每题*.*分,共*分/第*卷,共*.*分,每题*分
+                                                volume_N = int(N_s[0])
+                                                if N_s[1] > N_s[3]:
+                                                    volume_score = N_s[1]
+                                                    item_score = N_s[3]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[3]
+                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and num_back3 == '.' and num_infer4 == '.' and num_back2 == '分' and num_back4 == '分':  # 第*卷,每题*分,共*.*分/第*卷,共*分,每题*.*分
+                                                volume_N = int(N_s[0])
+                                                if int(N_s[1]) > int(N_s[2]):
+                                                    volume_score = N_s[1]
+                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[2]
+                                                    item_score = N_s[1]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif num_back1 == '.' and num_infer2 == '.' and num_back2 == '分' and num_back3 == '.' and num_infer4 == '.' and num_back4 == '分':  # 第卷,每题*.*分,共*.*分/第卷,共*.*分,每题*.*分
+                                                volume_N = -1
+                                                if N_s[0] > N_s[2]:
+                                                    volume_score = float(N_s[2] + '.' + N_s[3])
+                                                    item_score = N_s[3]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[3]
+                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back3 == '分':  # 第*卷,每题*分,共*分
+                                                volume_N = N_s[0]
+                                                volume_score = N_s[2]
+                                                item_count = -1
+                                                item_score = N_s[1]
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_infer3 == '题' or num_infer3 == '空') and num_back2 == '分' and num_back3 == '分':  # 第*卷,共*分,每题*分
+                                                volume_N = N_s[0]
+                                                volume_score = N_s[1]
+                                                item_count = -1
+                                                item_score = N_s[2]
+                                                type_score_dict_ocr['volume_N'] = volume_N
+                                                type_score_dict_ocr['volume_total_score'] = volume_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+
+                                    break
+                            break
+                elif len(N_s) == 5:
+                    num_index1 = s.index(N_s[0])
+                    num_infer1 = s[num_index1 - len(N_s[0])]
+                    num_back1 = s[num_index1 + len(N_s[0])]
+                    all_1 = find_repeat(s, N_s[1])
+                    temp1 = 0
+                    for ii in range(len(N_s[0])):
+                        if N_s[0][ii] == N_s[1]:
+                            temp1 = temp1 + 1
+                    num_index2 = all_1[temp1]
+                    num_infer2 = s[num_index2 - len(N_s[1])]
+                    num_back2 = s[num_index2 + len(N_s[1])]
+                    all_2 = find_repeat(s, N_s[2])
+                    temp2 = 0
+                    for ii in range(len(N_s[0])):
+                        if N_s[0][ii] == N_s[2]:
+                            temp2 = temp2 + 1
+                    for jj in range(len(N_s[1])):
+                        if N_s[1][jj] == N_s[2]:
+                            temp2 = temp2 + 1
+                    num_index3 = all_2[temp2]
+                    num_infer3 = s[num_index3 - len(N_s[2])]
+                    num_back3 = s[num_index3 + len(N_s[2])]
+                    all_3 = find_repeat(s, N_s[3])
+                    temp3 = 0
+                    for ii in range(len(N_s[0])):
+                        if N_s[0][ii] == N_s[3]:
+                            temp3 = temp3 + 1
+                    for jj in range(len(N_s[1])):
+                        if N_s[1][jj] == N_s[3]:
+                            temp3 = temp3 + 1
+                    for kk in range(len(N_s[2])):
+                        if N_s[2][kk] == N_s[3]:
+                            temp3 = temp3 + 1
+                    num_index4 = all_3[temp3]
+                    num_infer4 = s[num_index4 - len(N_s[3])]
+                    num_back4 = s[num_index4 + len(N_s[3])]
+                    all_4 = find_repeat(s, N_s[4])
+                    temp4 = 0
+                    for ii in range(len(N_s[0])):
+                        if N_s[0][ii] == N_s[4]:
+                            temp4 = temp4 + 1
+                    for jj in range(len(N_s[1])):
+                        if N_s[1][jj] == N_s[4]:
+                            temp4 = temp4 + 1
+                    for kk in range(len(N_s[2])):
+                        if N_s[2][kk] == N_s[4]:
+                            temp4 = temp4 + 1
+                    for ll in range(len(N_s[3])):
+                        if N_s[3][ll] == N_s[4]:
+                            temp4 = temp4 + 1
+                    num_index5 = all_4[temp4]
+                    num_infer5 = s[num_index5 - len(N_s[4])]
+                    num_back5 = s[num_index5 + len(N_s[4])]
+                    if isinstance(N_s[0], str):
+                        N_s[0] = int(N_s[0])
+                    if isinstance(N_s[1], str):
+                        N_s[1] = int(N_s[1])
+                    if isinstance(N_s[2], str):
+                        N_s[2] = int(N_s[2])
+                    if isinstance(N_s[3], str):
+                        N_s[3] = int(N_s[3])
+                    if isinstance(N_s[4], str):
+                        N_s[4] = int(N_s[4])
+                    for l in range(len_keyword_item1):
+                        if C_s.find(keyword_item1[l]) != -1:
+                            for m in range(len_keyword_item2):
+                                if C_s.find(keyword_item2[m]) != -1:
+                                    for n in range(len_keyword_item3):
+                                        if C_s.find(keyword_item3[n]) != -1:
+                                            if (num_back1 == '卷' or num_back1 == '部') and (num_back2 == '.' and num_infer3 == '.') and num_back3 == '分' and num_back4 == '分':  # 第*卷,每题*.*分,共*分,共*题/第卷,共*.*分,每题*分,共*题
+                                                volume_N = N_s[0]
+                                                if N_s[1] > N_s[3]:
+                                                    volume_score = N_s[1]
+                                                    item_score = N_s[3]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[3]
+                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_back3 == '.' and num_infer4 == '.') and num_back2 == '分' and num_back4 == '分':  # 第*卷,每题*分,共*.*分,共*题/第卷,共*分,每题*.*分,共*题
+                                                volume_N = N_s[0]
+                                                if N_s[1] > N_s[2]:
+                                                    volume_score = N_s[1]
+                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[2]
+                                                    item_score = N_s[1]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_back3 == '.' and num_infer4 == '.') and num_back4 == '分' and num_back5 == '分':  # 第卷,共*题,共*.*分,每题*分/第卷,共*题,每题*.*分,共*分
+                                                volume_N = N_s[0]
+                                                if N_s[2] > N_s[4]:
+                                                    volume_score = N_s[2]
+                                                    item_score = N_s[4]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[4]
+                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_back4 == '.' and num_infer5 == '.') and num_back3 == '分' and num_back5 == '分':  # 第*卷,共*题,共*分,每题*.*分/第卷,共*题,每题*分,共*.*分
+                                                volume_N = N_s[0]
+                                                if N_s[2] > N_s[3]:
+                                                    volume_score = N_s[2]
+                                                    item_score = float(N_s[3] + '.' + N_s[4])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[3]
+                                                    item_score = N_s[2]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_back2 == '.' and num_infer3 == '.') and num_back3 == '分' and num_back5 == '分':  # 第*卷,共*.*分,共*题,每题*分/第*卷,每题*.*分,共*题,共*分
+                                                volume_N = N_s[0]
+                                                if N_s[1] > N_s[4]:
+                                                    volume_score = N_s[1]
+                                                    item_score = N_s[4]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[4]
+                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                            elif (num_back1 == '卷' or num_back1 == '部') and (num_back4 == '.' and num_infer5 == '.') and num_back2 == '分' and num_back5 == '分':  # 第*卷,共*分,共*题,每题*.*分/第卷,每题*分,共*题,共*.*分
+                                                volume_N = N_s[0]
+                                                if N_s[1] > N_s[3]:
+                                                    volume_score = N_s[0]
+                                                    item_score = float(N_s[4] + '.' + N_s[4])
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                                else:
+                                                    volume_score = N_s[3]
+                                                    item_score = N_s[1]
+                                                    item_count = int(volume_score / item_score)
+                                                    type_score_dict_ocr['volume_N'] = volume_N
+                                                    type_score_dict_ocr['volume_total_score'] = volume_score
+                                                    type_score_dict_ocr['volume_count'] = item_count
+                                                    type_score_dict_ocr['volume_score'] = item_score
+                                                    break
+                                    break
+                            break
+                if 'volume_N' not in type_score_dict_ocr.keys():
+                    all_structure = {'volume_structure': -1,
+                                     'Score_structure': -1}
+                    break
+                else:
+                    for i in range(len_keyword_type):
+                        if C_s.find(keyword_type[i]) != -1 and C_s.find('非') != -1:
+                            type_score_dict_ocr['keyword_type'] = keyword_type[1]
+                            break
+                        elif C_s.find(keyword_type[0]) != -1:
+                            type_score_dict_ocr['keyword_type'] = keyword_type[0]
+                            Score_structure_item = type_score_dict_ocr
+                            Score_structure.append(Score_structure_item)
+                            break
+                        elif C_s.find(keyword_type[i]) != -1:
+                            type_score_dict_ocr['keyword_type'] = keyword_type[i]
+                            break
+                        elif i == len_keyword_type - 1:
+                            type_score_dict_ocr['keyword_type'] = keyword_type[0]
+                            Score_structure_item = type_score_dict_ocr
+                            Score_structure.append(Score_structure_item)
+                            break
+                    volume_structure_item = type_score_dict_ocr
+                    volume_structure.append(volume_structure_item)
+                    if Score_structure == []:
+                        all_structure = {'volume_structure': volume_structure,
+                                         'Score_structure': -1}
+                    else:
+                        all_structure = {'volume_structure': volume_structure,
+                                         'Score_structure': Score_structure}
+                    break
+            elif iiii == len(keyword_volume) - 1:
+                '''
+                对应试卷中不存在分卷信息的情况,根据包含数字的个数分为4类,暂定包含信息的有效数字个数小于4,并处理小题分数和总分可能包含小数点的情况
+                暂定小题个数不包含小数
+                暂定总分数中不存在有意义的小数位
+                '''
+                for xxx in range(len_keyword_type):
+                    if C_s.find(keyword_type[xxx]) != -1:
+                        for x in range(len_keyword_item1):
+                            if C_s.find(keyword_item1[x]) != -1:
+                                if len(N_s) == 1:
+                                    num_index = s.index(N_s[0])
+                                    num_infer = s[num_index - len(N_s[0])]
+                                    num_back = s[num_index + len(N_s[0])]
+                                    if isinstance(N_s[0], str):
+                                        N_s[0] = int(N_s[0])
+                                    if num_back == '分':  # 选择题/主观题,共*分
+                                        item_total_score = N_s[0]
+                                        type_score_dict_ocr['volume_N'] = -1
+                                        type_score_dict_ocr['volume_total_score'] = int(item_total_score)
+                                        type_score_dict_ocr['volume_count'] = -1
+                                        type_score_dict_ocr['volume_score'] = -1
+                                    break
+                                elif len(N_s) == 2:
+                                    num_index1 = s.index(N_s[0])
+                                    num_infer1 = s[num_index1 - len(N_s[0])]
+                                    num_back1 = s[num_index1 + len(N_s[0])]
+                                    all_1 = find_repeat(s, N_s[1])
+                                    temp1 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[1]:
+                                            temp1 = temp1 + 1
+                                    num_index2 = all_1[temp1]
+                                    num_infer2 = s[num_index2 - len(N_s[1])]
+                                    num_back2 = s[num_index2 + len(N_s[1])]
+                                    if isinstance(N_s[0], str):
+                                        N_s[0] = int(N_s[0])
+                                    if isinstance(N_s[1], str):
+                                        N_s[1] = int(N_s[1])
+                                    for y in range(len_keyword_item2):
+                                        if C_s.find(keyword_item2[y]) != -1:
+                                            if num_back1 == '分' and (num_infer2 == '题' or num_infer2 == '空'):  # 选择题/主观题/客观题,共*分,每题*分
+                                                item_total_score = int(N_s[0])
+                                                item_count = int(N_s[0] / N_s[1])
+                                                item_score = N_s[1]
+                                                type_score_dict_ocr['volume_N'] = -1
+                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif ( num_infer1 == '题' or num_infer1 == '空') and num_back2 == '分':  # 选择题/主观题,每题*分,共*分
+                                                item_total_score = N_s[1]
+                                                item_count = int(N_s[1] / N_s[0])
+                                                item_score = N_s[0]
+                                                type_score_dict_ocr['volume_N'] = -1
+                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                        elif y == len_keyword_item2 - 1:
+                                            for u in range(len_keyword_item3):
+                                                if C_s.find(keyword_item3[u]) != -1:
+                                                    if num_back1 == '分':  # 选择题/主观题,共*分,共*题
+                                                        item_total_score = N_s[0]
+                                                        item_count = N_s[1]
+                                                        item_score = N_s[0] / N_s[1]
+                                                        type_score_dict_ocr['volume_N'] = -1
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif num_back2 == '分':  # 选择题/主观题,共*题,共*分
+                                                        item_total_score = N_s[1]
+                                                        item_count = N_s[0]
+                                                        item_score = N_s[1] / N_s[0]
+                                                        type_score_dict_ocr['volume_N'] = -1
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                elif u == len_keyword_item3 - 1:
+                                                    if num_back1 == '.' and num_infer2 == '.' and num_back2 == '分':   # *.*分
+                                                        item_N = -1
+                                                        item_total_score = N_s[0]
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = -1
+                                                        type_score_dict_ocr['volume_score'] = -1
+                                                        break
+                                                    elif num_back2 == '分':  # *,*分
+                                                        item_N = N_s[0]
+                                                        item_total_score = int(N_s[1])
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = -1
+                                                        type_score_dict_ocr['volume_score'] = -1
+                                                        break
+                                            break
+                                    break
+                                elif len(N_s) == 3:
+                                    num_index1 = s.index(N_s[0])
+                                    num_infer1 = s[num_index1 - len(N_s[0])]
+                                    num_back1 = s[num_index1 + len(N_s[0])]
+                                    all_1 = find_repeat(s, N_s[1])
+                                    temp1 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[1]:
+                                            temp1 = temp1 + 1
+                                    num_index2 = all_1[temp1]
+                                    num_infer2 = s[num_index2 - len(N_s[1])]
+                                    num_back2 = s[num_index2 + len(N_s[1])]
+                                    all_2 = find_repeat(s, N_s[2])
+                                    temp2 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[2]:
+                                            temp2 = temp2 + 1
+                                    for jj in range(len(N_s[1])):
+                                        if N_s[1][jj] == N_s[2]:
+                                            temp2 = temp2 + 1
+                                    num_index3 = all_2[temp2]
+                                    num_infer3 = s[num_index3 - len(N_s[2])]
+                                    num_back3 = s[num_index3 + len(N_s[2])]
+                                    if isinstance(N_s[0], str):
+                                        N_s[0] = int(N_s[0])
+                                    if isinstance(N_s[1], str):
+                                        N_s[1] = int(N_s[1])
+                                    if isinstance(N_s[2], str):
+                                        N_s[2] = int(N_s[2])
+                                    for v in range(len_keyword_item2):
+                                        if C_s.find(keyword_item2[v]) != -1:
+                                            for w in range(len_keyword_item3):
+                                                if C_s.find(keyword_item3[w]) != -1:
+                                                    if (num_infer1 == '题' or num_infer1 == '空') and num_back1 == '分' and num_back3 == '分':  # 每题*分,共*题,共*分
+                                                        item_total_score = N_s[2]
+                                                        item_count = N_s[1]
+                                                        item_score = N_s[0]
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['volume_N'] = -1
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif (num_infer1 == '题' or num_infer1 == '空') and num_back1 == '分' and num_back2 == '分':  # 每题*分,共*分,共*题
+                                                        item_total_score = N_s[1]
+                                                        item_count = N_s[2]
+                                                        item_score = N_s[0]
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['volume_N'] = -1
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back3 == '分':  # 共*题,每题*分,共*分
+                                                        item_total_score = N_s[2]
+                                                        item_count = N_s[0]
+                                                        item_score = N_s[1]
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['volume_N'] = -1
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back1 == '分':  # 共*分,每题*分,共*题
+                                                        item_total_score = N_s[0]
+                                                        item_count = N_s[2]
+                                                        item_score = N_s[1]
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['volume_N'] = -1
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分' and num_back2 == '分':  # 共*题,共*分,每题*分
+                                                        item_total_score = N_s[1]
+                                                        item_count = N_s[0]
+                                                        item_score = N_s[2]
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['volume_N'] = -1
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分' and num_back1 == '分':  # 共*分,共*题, 每题*分
+                                                        item_total_score = N_s[0]
+                                                        item_count = N_s[1]
+                                                        item_score = N_s[2]
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['volume_N'] = -1
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                elif w == len_keyword_item3 - 1:
+                                                    if num_back1 != '.' and num_back2 == '分' and (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分':  # *,共*分,每题*分
+                                                        item_N = N_s[0]
+                                                        item_total_score = N_s[1]
+                                                        item_count = int(N_s[1] / N_s[2])
+                                                        item_score = N_s[2]
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif num_back1 != '.' and num_back3 == '分' and (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分':  # *,每题*分,共*分,
+                                                        item_N = N_s[0]
+                                                        item_total_score = N_s[2]
+                                                        item_count = int(N_s[2] / N_s[1])
+                                                        item_score = N_s[1]
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif num_back1 == '.' and num_infer2 == '.' and num_back3 == '分' and num_back2 == '分':  # 每题*.*分,共*分/共*.*分,每题*分
+                                                        item_N = -1
+                                                        if int(N_s[0]) > int(N_s[2]):
+                                                            item_total_score = N_s[0]
+                                                            item_score = N_s[2]
+                                                            item_count = int(item_total_score/item_score)
+                                                        else:
+                                                            item_total_score = N_s[2]
+                                                            item_score = float(N_s[0]+'.'+N_s[1])
+                                                            item_count = int(item_total_score / item_score)
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif num_back2 == '.' and num_infer3 == '.' and num_back1 == '分' and num_back3 == '分':  # 每题*分,共*.*分/共*分,每题*.*分
+                                                        item_N = -1
+                                                        if int(N_s[0]) > int(N_s[1]):
+                                                            item_total_score = N_s[0]
+                                                            item_score = float(N_s[1]+'.'+N_s[2])
+                                                            item_count = int(item_total_score/item_score)
+                                                        else:
+                                                            item_total_score = N_s[1]
+                                                            item_score = N_s[0]
+                                                            item_count = int(item_total_score / item_score)
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                            break
+                                        elif v == len_keyword_item2 - 1:
+                                            for w in range(len_keyword_item3):
+                                                if C_s.find(keyword_item3[w]) != -1:
+                                                    if num_back3 == '分' and num_infer3 =='.' and num_back2 =='.':  # *小题,共*.*分,
+                                                        item_N = -1
+                                                        item_total_score = N_s[1]
+                                                        item_count = N_s[0]
+                                                        item_score = N_s[1]/N_s[0]
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif num_back2 == '分' and num_infer2 =='.'and num_back1 =='.':  # 共*.*分,*小题
+                                                        item_N = -1
+                                                        item_total_score = N_s[0]
+                                                        item_count = N_s[2]
+                                                        item_score = N_s[0]/N_s[2]
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif num_back3 == '分' and num_infer3 !='.':  # *,*小题,共*分,
+                                                        item_N = N_s[0]
+                                                        item_total_score = N_s[2]
+                                                        item_count = N_s[1]
+                                                        item_score = N_s[2]/N_s[1]
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif num_back2 == '分' and num_infer2 !='.':  # *,共*分,共*小题
+                                                        item_N = N_s[0]
+                                                        item_total_score = N_s[1]
+                                                        item_count = N_s[2]
+                                                        item_score = N_s[1] / N_s[2]
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                elif w == len_keyword_item3-1:
+                                                    if num_back3 == '分' and num_infer3 =='.' and num_back2 =='.':  # *,共*.*分,
+                                                        item_N = N_s[0]
+                                                        item_total_score = N_s[1]
+                                                        item_count = -1
+                                                        item_score = -1
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif num_back3 == '分':
+                                                        item_total_score = N_s[1]
+                                                        item_N = -1
+                                                        item_count = -1
+                                                        item_score = -1
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                            break
+                                    break
+                                elif len(N_s) == 4:
+                                    num_index1 = s.index(N_s[0])
+                                    num_infer1 = s[num_index1 - len(N_s[0])]
+                                    num_back1 = s[num_index1 + len(N_s[0])]
+                                    all_1 = find_repeat(s, N_s[1])
+                                    temp1 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[1]:
+                                            temp1 = temp1 + 1
+                                    num_index2 = all_1[temp1]
+                                    num_infer2 = s[num_index2 - len(N_s[1])]
+                                    num_back2 = s[num_index2 + len(N_s[1])]
+                                    all_2 = find_repeat(s, N_s[2])
+                                    temp2 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[2]:
+                                            temp2 = temp2 + 1
+                                    for jj in range(len(N_s[1])):
+                                        if N_s[1][jj] == N_s[2]:
+                                            temp2 = temp2 + 1
+                                    num_index3 = all_2[temp2]
+                                    num_infer3 = s[num_index3 - len(N_s[2])]
+                                    num_back3 = s[num_index3 + len(N_s[2])]
+                                    all_3 = find_repeat(s, N_s[3])
+                                    temp3 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[3]:
+                                            temp3 = temp3 + 1
+                                    for jj in range(len(N_s[1])):
+                                        if N_s[1][jj] == N_s[3]:
+                                            temp3 = temp3 + 1
+                                    for kk in range(len(N_s[2])):
+                                        if N_s[2][kk] == N_s[3]:
+                                            temp3 = temp3 + 1
+                                    num_index4 = all_3[temp3]
+                                    num_infer4 = s[num_index4 - len(N_s[3])]
+                                    num_back4 = s[num_index4 + len(N_s[3])]
+                                    if isinstance(N_s[0], str):
+                                        N_s[0] = int(N_s[0])
+                                    if isinstance(N_s[1], str):
+                                        N_s[1] = int(N_s[1])
+                                    if isinstance(N_s[2], str):
+                                        N_s[2] = int(N_s[2])
+                                    if isinstance(N_s[3], str):
+                                        N_s[3] = int(N_s[3])
+                                    for y in range(len_keyword_item1):
+                                        if C_s.find(keyword_item1[y]) != -1:
+                                            for z in range(len_keyword_item2):
+                                                if C_s.find(keyword_item2[z]) != -1:
+                                                    for u in range(len_keyword_item3):
+                                                        if C_s.find(keyword_item3[u]) != -1:
+                                                            if (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back4 == '分':  # *,每题*分,共*题,共*分
+                                                                item_N = N_s[0]
+                                                                item_total_score = N_s[3]
+                                                                item_count = N_s[2]
+                                                                item_score = N_s[1]
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr[
+                                                                    'volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back3 == '分':  # *,每题*分,共*分,共*题
+                                                                item_N = N_s[0]
+                                                                item_total_score = N_s[2]
+                                                                item_count = N_s[3]
+                                                                item_score = N_s[1]
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                if item_total_score < item_count * item_score:
+                                                                    item_total_score = item_count * item_score
+                                                                type_score_dict_ocr[
+                                                                    'volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分' and num_back4 == '分':  # *,共*题,每题*分,共*分
+                                                                item_N = N_s[0]
+                                                                item_total_score = N_s[3]
+                                                                item_count = N_s[1]
+                                                                item_score = N_s[2]
+                                                                if item_total_score < item_count * item_score:
+                                                                    item_total_score = item_count * item_score
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr[
+                                                                    'volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分' and num_back2 == '分':  # *,共*分,每题*分,共*题
+                                                                item_N = N_s[0]
+                                                                item_total_score = N_s[1]
+                                                                item_count = N_s[3]
+                                                                item_score = N_s[2]
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr[
+                                                                    'volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif (num_infer4 == '题' or num_infer4 == '空') and num_back4 == '分' and num_back3 == '分':  # *,共*题,共*分,每题*分
+                                                                item_N = N_s[0]
+                                                                item_total_score = N_s[2]
+                                                                item_count = N_s[1]
+                                                                item_score = N_s[3]
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                if item_total_score < item_count * item_score:
+                                                                    item_total_score = item_count * item_score
+                                                                type_score_dict_ocr[
+                                                                    'volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif (num_infer4 == '题' or num_infer4 == '空') and num_back4 == '分' and num_back2 == '分':  # *,共*分,共*题, 每题*分
+                                                                item_N = N_s[0]
+                                                                item_total_score = N_s[1]
+                                                                item_count = N_s[2]
+                                                                item_score = N_s[3]
+                                                                type_score_dict_ocr['item_N'] = item_N
+                                                                if item_total_score < item_count * item_score:
+                                                                    item_total_score = item_count * item_score
+                                                                type_score_dict_ocr[
+                                                                    'item_total_score'] = item_total_score
+                                                                type_score_dict_ocr['item_count'] = item_count
+                                                                type_score_dict_ocr['item_score'] = item_score
+                                                                break
+                                                            elif num_back1== '.' and num_infer2== '.' and num_back2 == '分'and num_back4 == '分' :  # 共*.*分,共*题, 每题*分/每题*.*分,共*题,共*分
+                                                                item_N = -1
+                                                                if N_s[0] > N_s[3]:
+                                                                    item_total_score = N_s[0]
+                                                                    item_score = N_s[3]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[3]
+                                                                    item_score = float(N_s[0] + '.' + N_s[1])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr[ 'item_total_score'] = item_total_score
+                                                                type_score_dict_ocr['item_count'] = item_count
+                                                                type_score_dict_ocr['item_score'] = item_score
+                                                                break
+                                                            elif num_back3== '.' and num_infer4== '.' and num_back1 == '分'and num_back4 == '分' :  # 共*分,共*题, 每题*.*分/每题*分,共*题,共*.*分
+                                                                item_N = -1
+                                                                if N_s[0] > N_s[2]:
+                                                                    item_total_score = N_s[0]
+                                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[2]
+                                                                    item_score = N_s[0]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr[ 'item_total_score'] = item_total_score
+                                                                type_score_dict_ocr['item_count'] = item_count
+                                                                type_score_dict_ocr['item_score'] = item_score
+                                                                break
+                                                            elif num_back2== '.' and num_infer3== '.' and num_back3 == '分'and num_back4 == '分' :  # 共*题,共*.*分,每题*分/共*题,每题*.*分,共*分
+                                                                item_N = -1
+                                                                if N_s[1] > N_s[3]:
+                                                                    item_total_score = N_s[1]
+                                                                    item_score = N_s[3]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[3]
+                                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif num_back3== '.' and num_infer4== '.' and num_back4 == '分'and num_back2 == '分' :  # 共*题,共*分,每题*.*分/共*题,每题*分,共*.*分
+                                                                item_N = -1
+                                                                if N_s[1] > N_s[2]:
+                                                                    item_total_score = N_s[1]
+                                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[2]
+                                                                    item_score = N_s[1]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif num_back1== '.' and num_infer2== '.' and num_back2 == '分'and num_back3 == '分' :  # 每题*.*分,共*分,共*题/共*.*分,每题*分,共*题
+                                                                item_N = -1
+                                                                if N_s[0] > N_s[2]:
+                                                                    item_total_score = N_s[0]
+                                                                    item_score = N_s[2]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[2]
+                                                                    item_score = float(N_s[0] + '.' + N_s[1])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif num_back2== '.' and num_infer3== '.' and num_back3 == '分'and num_back1 == '分' :  # 每题*分,共*.*分,共*题/共*分,每题*.*分,共*题
+                                                                item_N = -1
+                                                                if N_s[0] > N_s[1]:
+                                                                    item_total_score = N_s[0]
+                                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[1]
+                                                                    item_score = N_s[0]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                        elif u == len_keyword_item3-1:
+                                                            if num_back2== '.' and num_infer3== '.' and num_back3 == '分'and num_back4 == '分' :  # *,共*.*分, 每题*分/每题*.*分,共*分
+                                                                item_N = N_s[0]
+                                                                if N_s[1] > N_s[3]:
+                                                                    item_total_score = N_s[1]
+                                                                    item_score = N_s[3]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[3]
+                                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr[ 'item_total_score'] = item_total_score
+                                                                type_score_dict_ocr['item_count'] = item_count
+                                                                type_score_dict_ocr['item_score'] = item_score
+                                                                break
+                                                            elif num_back3== '.' and num_infer4== '.' and num_back2 == '分'and num_back4 == '分' :  # *,共*分, 每题*.*分/*,每题*分,共*.*分
+                                                                item_N = int(N_s[0])
+                                                                if N_s[1] > N_s[2]:
+                                                                    item_total_score = N_s[1]
+                                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[2]
+                                                                    item_score = N_s[1]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr[ 'item_total_score'] = item_total_score
+                                                                type_score_dict_ocr['item_count'] = item_count
+                                                                type_score_dict_ocr['item_score'] = item_score
+                                                                break
+                                                            elif num_back2== '.' and num_infer3== '.' and num_back3 == '分'and num_back4 == '分' :  # *,共*.*分,每题*分/*,每题*.*分,共*分
+                                                                item_N = N_s[0]
+                                                                if N_s[1] > N_s[3]:
+                                                                    item_total_score = N_s[1]
+                                                                    item_score = N_s[3]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[3]
+                                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif num_back3== '.' and num_infer4== '.' and num_back4 == '分'and num_back2 == '分' :  # *,共*分,每题*.*分/*,每题*分,共*.*分
+                                                                item_N = N_s[0]
+                                                                if N_s[1] > N_s[2]:
+                                                                    item_total_score = N_s[1]
+                                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[2]
+                                                                    item_score = N_s[1]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif num_back1== '.' and num_infer2== '.' and num_back2 == '分'and num_back3 == '分' :  # *,每题*.*分,共*分/*,共*.*分,每题*分
+                                                                item_N = N_s[0]
+                                                                if N_s[1] > N_s[3]:
+                                                                    item_total_score = N_s[1]
+                                                                    item_score = N_s[3]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[3]
+                                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+                                                            elif num_back2== '.' and num_infer3== '.' and num_back3 == '分'and num_back1 == '分' :  # *,每题*分,共*.*分/*,共*分,每题*.*分
+                                                                item_N = N_s[0]
+                                                                if N_s[1] > N_s[2]:
+                                                                    item_total_score = N_s[1]
+                                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = N_s[2]
+                                                                    item_score = N_s[1]
+                                                                    item_count = int(item_total_score / item_score)
+                                                                type_score_dict_ocr['volume_N'] = item_N
+                                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                                type_score_dict_ocr['volume_count'] = item_count
+                                                                type_score_dict_ocr['volume_score'] = item_score
+                                                                break
+
+                                                    break
+                                            break
+                                        elif y == len_keyword_item1 - 1 and num_back4 == '分':
+                                            item_total_score = N_s[3]
+                                            item_N = -1
+                                            item_score = -1
+                                            item_count = -1
+                                            type_score_dict_ocr['volume_N'] = item_N
+                                            type_score_dict_ocr['volume_total_score'] = item_total_score
+                                            type_score_dict_ocr['volume_count'] = item_count
+                                            type_score_dict_ocr['volume_score'] = item_score
+                                            break
+                                    break
+                                break
+                            elif x == len_keyword_item1 - 1:
+                                for y in range(len_keyword_item2):
+                                    if C_s.find(keyword_item2[y]) != -1:
+                                        if len(N_s) == 1:
+                                            num_index1 = s.index(N_s[0])
+                                            num_infer1 = s[num_index1 - len(N_s[0])]
+                                            num_back1 = s[num_index1 + len(N_s[0])]
+                                            if isinstance(N_s[0], str):
+                                                N_s[0] = int(N_s[0])
+                                            if num_back1 == '分':  # 每题*分
+                                                item_score = N_s[0]
+                                                type_score_dict_ocr['volume_N'] = -1
+                                                type_score_dict_ocr['volume_total_score'] = -1
+                                                type_score_dict_ocr['volume_count'] = -1
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                            break
+                                        if len(N_s) == 2:
+                                            num_index1 = s.index(N_s[0])
+                                            num_infer1 = s[num_index1 - len(N_s[0])]
+                                            num_back1 = s[num_index1 + len(N_s[0])]
+                                            all_1 = find_repeat(s, N_s[1])
+                                            temp1 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[1]:
+                                                    temp1 = temp1 + 1
+                                            num_index2 = all_1[temp1]
+                                            num_infer2 = s[num_index2 - len(N_s[1])]
+                                            num_back2 = s[num_index2 + len(N_s[1])]
+                                            if isinstance(N_s[0], str):
+                                                N_s[0] = int(N_s[0])
+                                            if isinstance(N_s[1], str):
+                                                N_s[1] = int(N_s[1])
+                                            for z in range(len(keyword_item3)):
+                                                if C_s.find(keyword_item3[z]) != -1:
+                                                    if num_back2 == '分':  # 共*题,每题*分
+                                                        item_total_score = N_s[0] * N_s[1]
+                                                        item_count = N_s[0]
+                                                        item_score = N_s[1]
+                                                        type_score_dict_ocr['volume_N'] = -1
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif num_back1 == '分':  # 每题*分,共*题
+                                                        item_total_score = int(N_s[0]) * int(N_s[1])
+                                                        item_count = int(N_s[1])
+                                                        item_score = int(N_s[0])
+                                                        type_score_dict_ocr['volume_N'] = -1
+                                                        type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                        type_score_dict_ocr['volume_count'] = item_count
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                elif z == len(keyword_item3) - 1:
+                                                    if num_back2 == '分' and num_back1 == '.' and num_infer2 == '.':  # *.*分
+                                                        item_N = -1
+                                                        item_score = float(N_s[0]+'.'+N_s[1])
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = -1
+                                                        type_score_dict_ocr['volume_count'] = -1
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                                    elif num_back2 == '分':  # *,*分
+                                                        item_N = int(N_s[0])
+                                                        item_score = int(N_s[1])
+                                                        type_score_dict_ocr['volume_N'] = item_N
+                                                        type_score_dict_ocr['volume_total_score'] = -1
+                                                        type_score_dict_ocr['volume_count'] = -1
+                                                        type_score_dict_ocr['volume_score'] = item_score
+                                                        break
+                                            break
+                                        if len(N_s) == 3:
+                                            num_index1 = s.index(N_s[0])
+                                            num_infer1 = s[num_index1 - len(N_s[0])]
+                                            num_back1 = s[num_index1 + len(N_s[0])]
+                                            all_1 = find_repeat(s, N_s[1])
+                                            temp1 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[1]:
+                                                    temp1 = temp1 + 1
+                                            num_index2 = all_1[temp1]
+                                            num_infer2 = s[num_index2 - len(N_s[1])]
+                                            num_back2 = s[num_index2 + len(N_s[1])]
+                                            all_2 = find_repeat(s, N_s[2])
+                                            temp2 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[2]:
+                                                    temp2 = temp2 + 1
+                                            for jj in range(len(N_s[1])):
+                                                if N_s[1][jj] == N_s[2]:
+                                                    temp2 = temp2 + 1
+                                            num_index3 = all_2[temp2]
+                                            num_infer3 = s[num_index3 - len(N_s[2])]
+                                            num_back3 = s[num_index3 + len(N_s[2])]
+                                            if isinstance(N_s[0], str):
+                                                N_s[0] = int(N_s[0])
+                                            if isinstance(N_s[1], str):
+                                                N_s[1] = int(N_s[1])
+                                            if isinstance(N_s[2], str):
+                                                N_s[2] = int(N_s[2])
+                                            if num_back3 == '分' and (num_back2 == '题' or num_back2 == '小' or num_back2 == '空') and num_back1 != '分':  # *,共*题,每题*分
+                                                item_N = int(N_s[0])
+                                                item_total_score = int(N_s[1]) * int(N_s[2])
+                                                item_count = int(N_s[1])
+                                                item_score = int(N_s[2])
+                                                type_score_dict_ocr['volume_N'] = item_N
+                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif num_back2 == '分' and (num_back3 == '题' or num_back3 == '小' or num_back3 == '空') and num_back1 != '分':  # *,每题*分,共*题
+                                                item_N = int(N_s[0])
+                                                item_total_score = int(N_s[1]) * int(N_s[2])
+                                                item_count = int(N_s[2])
+                                                item_score = int(N_s[1])
+                                                type_score_dict_ocr['volume_N'] = item_N
+                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif num_infer3 == '.' and num_back3 == '分' and num_back2 == '.':  # 共*题,每题*.*分
+                                                item_N = -1
+                                                item_count = int(N_s[0])
+                                                item_score = float(N_s[1]+'.'+N_s[2])
+                                                item_total_score = int(item_count * item_score)
+                                                type_score_dict_ocr['volume_N'] = item_N
+                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif num_infer2 == '.' and num_back2 == '分' and num_back1 == '.' :  # 每题*.*分,共*题
+                                                item_N = -1
+                                                item_count = int(N_s[2])
+                                                item_score = float(N_s[0] + '.' + N_s[1])
+                                                item_total_score = int(item_count * item_score)
+                                                type_score_dict_ocr['volume_N'] = item_N
+                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+                                            elif num_back3 == '分': # * * ,每题*分
+                                                item_N = -1
+                                                item_count = -1
+                                                item_score = -1
+                                                item_total_score = int(N_s[2])
+                                                type_score_dict_ocr['volume_N'] = item_N
+                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                                break
+
+                                            break
+                                        if len(N_s) == 4:
+                                            num_index1 = s.index(N_s[0])
+                                            num_infer1 = s[num_index1 - len(N_s[0])]
+                                            num_back1 = s[num_index1 + len(N_s[0])]
+                                            all_1 = find_repeat(s, N_s[1])
+                                            temp1 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[1]:
+                                                    temp1 = temp1 + 1
+                                            num_index2 = all_1[temp1]
+                                            num_infer2 = s[num_index2 - len(N_s[1])]
+                                            num_back2 = s[num_index2 + len(N_s[1])]
+                                            all_2 = find_repeat(s, N_s[2])
+                                            temp2 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[2]:
+                                                    temp2 = temp2 + 1
+                                            for jj in range(len(N_s[1])):
+                                                if N_s[1][jj] == N_s[2]:
+                                                    temp2 = temp2 + 1
+                                            num_index3 = all_2[temp2]
+                                            num_infer3 = s[num_index3 - len(N_s[2])]
+                                            num_back3 = s[num_index3 + len(N_s[2])]
+                                            all_3 = find_repeat(s, N_s[3])
+                                            temp3 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[3]:
+                                                    temp3 = temp3 + 1
+                                            for jj in range(len(N_s[1])):
+                                                if N_s[1][jj] == N_s[3]:
+                                                    temp3 = temp3 + 1
+                                            num_index4 = all_3[temp3]
+                                            num_infer4 = s[num_index4 - len(N_s[3])]
+                                            num_back4 = s[num_index4 + len(N_s[3])]
+                                            if isinstance(N_s[0], str):
+                                                N_s[0] = int(N_s[0])
+                                            if isinstance(N_s[1], str):
+                                                N_s[1] = int(N_s[1])
+                                            if isinstance(N_s[2], str):
+                                                N_s[2] = int(N_s[2])
+                                            if isinstance(N_s[3], str):
+                                                N_s[3] = int(N_s[3])
+                                            if num_back3 == '.' and num_infer4 == '.' and num_back4 == '分' and (num_back2 == '题' or num_back2 == '小' or num_back2 == '空') and num_back1 != '分':  # *,共*题,每题*.*分
+                                                item_N = int(N_s[0])
+                                                item_count = int(N_s[1])
+                                                item_score = float(N_s[2] + '.' + N_s[3])
+                                                item_total_score = int(item_count * item_score)
+                                                type_score_dict_ocr['volume_N'] = item_N
+                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                            elif num_back2 == '.' and num_infer3 == '.' and num_back3 == '分' and (num_back4 == '题' or num_back4 == '小' or num_back4 == '空') and num_back1 != '分':  # *,每题*.*分,共*题
+                                                item_N = int(N_s[0])
+                                                item_count = int(N_s[3])
+                                                item_score = float(N_s[1] + '.' + N_s[2])
+                                                item_total_score = int(item_count * item_score)
+                                                type_score_dict_ocr['volume_N'] = item_N
+                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                type_score_dict_ocr['volume_count'] = item_count
+                                                type_score_dict_ocr['volume_score'] = item_score
+                                            break
+                                    elif y == len_keyword_item2 - 1:
+                                        if C_s.find(keyword_item4[0]) != -1:
+                                            if len(N_s) == 2:  # *,*分
+                                                num_index1 = s.index(N_s[0])
+                                                num_infer1 = s[num_index1 - len(N_s[0])]
+                                                num_back1 = s[num_index1 + len(N_s[0])]
+                                                all_1 = find_repeat(s, N_s[1])
+                                                temp1 = 0
+                                                for ii in range(len(N_s[0])):
+                                                    if N_s[0][ii] == N_s[1]:
+                                                        temp1 = temp1 + 1
+                                                num_index2 = all_1[temp1]
+                                                num_infer2 = s[num_index2 - len(N_s[1])]
+                                                num_back2 = s[num_index2 + len(N_s[1])]
+                                                if isinstance(N_s[0], str):
+                                                    N_s[0] = int(N_s[0])
+                                                if isinstance(N_s[1], str):
+                                                    N_s[1] = int(N_s[1])
+                                                if num_back1 == '分':
+                                                    item_N = -1
+                                                    item_total_score = int(N_s[0])
+                                                    type_score_dict_ocr['volume_N'] = item_N
+                                                    type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                    type_score_dict_ocr['volume_count'] = -1
+                                                    type_score_dict_ocr['volume_score'] = -1
+                                                elif num_back2 == '分':
+                                                    item_N = int(N_s[0])
+                                                    item_total_score = int(N_s[1])
+                                                    type_score_dict_ocr['volume_N'] = item_N
+                                                    type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                    type_score_dict_ocr['volume_count'] = -1
+                                                    type_score_dict_ocr['volume_score'] = -1
+                                                break
+                                            elif len(N_s) == 1:
+                                                num_index1 = s.index(N_s[0])
+                                                num_infer1 = s[num_index1 - len(N_s[0])]
+                                                num_back1 = s[num_index1 + len(N_s[0])]
+                                                item_total_score = int(N_s[0])
+                                                type_score_dict_ocr['volume_N'] = -1
+                                                type_score_dict_ocr['volume_total_score'] = item_total_score
+                                                type_score_dict_ocr['volume_count'] = -1
+                                                type_score_dict_ocr['volume_score'] = -1
+                                                # if num_back1 == '分':  # *分
+                                                #     test_9_23=0
+                                                break
+                                break
+                            else:
+                                type_score_dict_ocr['volume_N'] = -1
+                                type_score_dict_ocr['volume_total_score'] = -1
+                                type_score_dict_ocr['volume_count'] = -1
+                                type_score_dict_ocr['volume_score'] = -1
+
+                        if 'volume_N' not in type_score_dict_ocr.keys():
+                            all_structure = {'volume_structure': -1,
+                                             'Score_structure': -1}
+                            break
+                        else:
+                            if C_s.find(keyword_type[1]) != -1:
+                                type_score_dict_ocr['keyword_type'] = keyword_type[1]
+                            elif C_s.find(keyword_type[0]) != -1:
+                                type_score_dict_ocr['keyword_type'] = keyword_type[0]
+                                Score_structure_item = type_score_dict_ocr
+                                Score_structure.append(Score_structure_item)
+                            elif C_s.find(keyword_type[xxx]) != -1:
+                                type_score_dict_ocr['keyword_type'] = keyword_type[xxx]
+                            elif xxx == len_keyword_type - 1:
+                                type_score_dict_ocr['keyword_type'] = -2
+                                type_score_dict_ocr['item_N'] = type_score_dict_ocr.pop(
+                                    'volume_N')
+                                type_score_dict_ocr[
+                                    'item_total_score'] = type_score_dict_ocr.pop(
+                                    'volume_total_score')
+                                type_score_dict_ocr['item_count'] = type_score_dict_ocr.pop(
+                                    'volume_count')
+                                type_score_dict_ocr['item_score'] = type_score_dict_ocr.pop(
+                                    'volume_score')
+                                Score_structure_item = type_score_dict_ocr
+                                Score_structure.append(Score_structure_item)
+                            volume_structure_item = type_score_dict_ocr
+                            volume_structure.append(volume_structure_item)
+                            if Score_structure == []:
+                                all_structure = {'volume_structure': volume_structure,
+                                                 'Score_structure': -1}
+                            elif Score_structure[0]['keyword_type'] != -2:
+                                all_structure = {'volume_structure': volume_structure,
+                                                 'Score_structure': Score_structure}
+                            else:
+                                all_structure = {'volume_structure': -1,
+                                                 'Score_structure': Score_structure}
+                            break
+
+                    elif xxx == len_keyword_type - 1:
+                        for x in range(len_keyword_item1):
+                            if C_s.find(keyword_item1[x]) != -1:
+                                if len(N_s) == 1:
+                                    num_index = s.index(N_s[0])
+                                    num_infer = s[num_index - len(N_s[0])]
+                                    num_back = s[num_index + len(N_s[0])]
+                                    if isinstance(N_s[0], str):
+                                        N_s[0] = int(N_s[0])
+                                    if num_back == '分':  # 共*分
+                                        item_total_score = N_s[0]
+                                        type_score_dict_ocr['item_N'] = -1
+                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                        type_score_dict_ocr['item_count'] = -1
+                                        type_score_dict_ocr['item_score'] = -1
+                                        Score_structure_item = type_score_dict_ocr
+                                        Score_structure.append(Score_structure_item)
+                                        all_structure = {'volume_structure': -1,
+                                                         'Score_structure': Score_structure}
+                                        break
+                                    break
+                                elif len(N_s) == 2:
+                                    num_index1 = s.index(N_s[0])
+                                    num_infer1 = s[num_index1 - len(N_s[0])]
+                                    num_back1 = s[num_index1 + len(N_s[0])]
+                                    all_1 = find_repeat(s, N_s[1])
+                                    temp1 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[1]:
+                                            temp1 = temp1 + 1
+                                    num_index2 = all_1[temp1]
+                                    num_infer2 = s[num_index2 - len(N_s[1])]
+                                    num_back2 = s[num_index2 + len(N_s[1])]
+                                    if isinstance(N_s[0], str):
+                                        N_s[0] = int(N_s[0])
+                                    if isinstance(N_s[1], str):
+                                        N_s[1] = int(N_s[1])
+                                    for y in range(len_keyword_item2):
+                                        if C_s.find(keyword_item2[y]) != -1:
+                                            if num_back1 == '分' and (num_infer2 == '题' or num_infer2 == '空'):  # 共*分,每题*分
+                                                item_total_score = int(N_s[0])
+                                                item_count = int(N_s[0]) / int(N_s[1])
+                                                item_score = int(N_s[1])
+                                                type_score_dict_ocr['item_N'] = -1
+                                                type_score_dict_ocr['item_total_score'] = item_total_score
+                                                type_score_dict_ocr['item_count'] = item_count
+                                                type_score_dict_ocr['item_score'] = item_score
+                                                Score_structure_item = type_score_dict_ocr
+                                                Score_structure.append(Score_structure_item)
+                                                all_structure = {'volume_structure': -1,
+                                                                 'Score_structure': Score_structure}
+                                                break
+                                            elif (num_infer1 == '题' or num_infer1 == '空') and num_back2 == '分':  # 每题*分,共*分
+                                                item_total_score = int(N_s[1])
+                                                item_count = int(N_s[1]) / int(N_s[0])
+                                                item_score = int(N_s[0])
+                                                type_score_dict_ocr['item_N'] = -1
+                                                type_score_dict_ocr['item_total_score'] = item_total_score
+                                                type_score_dict_ocr['item_count'] = item_count
+                                                type_score_dict_ocr['item_score'] = item_score
+                                                Score_structure_item = type_score_dict_ocr
+                                                Score_structure.append(Score_structure_item)
+                                                all_structure = {'volume_structure': -1,
+                                                                 'Score_structure': Score_structure}
+                                                break
+                                            break
+                                        elif y == len_keyword_item2 - 1:
+                                            for u in range(len_keyword_item3):
+                                                if C_s.find(keyword_item3[u]) != -1:
+                                                    if num_back1 == '分':  # 共*分,共*题
+                                                        item_total_score = int(N_s[0])
+                                                        item_count = int(N_s[1])
+                                                        item_score = int(N_s[0]) / int(N_s[1])
+                                                        type_score_dict_ocr['item_N'] = -1
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif num_back2 == '分':  # 共*题,共*分
+                                                        item_total_score = int(N_s[1])
+                                                        item_count = int(N_s[0])
+                                                        item_score = int(N_s[1]) / int(N_s[0])
+                                                        type_score_dict_ocr['item_N'] = -1
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                elif u == len_keyword_item3 - 1:
+                                                    if num_back2 == '分' and num_infer2 == '.'and num_back1 == '.':  # *.*分
+                                                        item_N = -1
+                                                        item_total_score = int(N_s[0])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = -1
+                                                        type_score_dict_ocr['item_score'] = -1
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif num_back2 == '分':
+                                                        item_N = int(N_s[0])
+                                                        item_total_score = int(N_s[1])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = -1
+                                                        type_score_dict_ocr['item_score'] = -1
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+
+                                    break
+                                elif len(N_s) == 3:
+                                    num_index1 = s.index(N_s[0])
+                                    num_infer1 = s[num_index1 - len(N_s[0])]
+                                    num_back1 = s[num_index1 + len(N_s[0])]
+                                    all_1 = find_repeat(s, N_s[1])
+                                    temp1 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[1]:
+                                            temp1 = temp1 + 1
+                                    num_index2 = all_1[temp1]
+                                    num_infer2 = s[num_index2 - len(N_s[1])]
+                                    num_back2 = s[num_index2 + len(N_s[1])]
+                                    all_2 = find_repeat(s, N_s[2])
+                                    temp2 = 0
+
+                                    if len(N_s[0]) == len(N_s[2]):
+                                        if N_s[0] == N_s[2]:
+                                            temp2 = temp2 + 1
+                                    else:
+                                        for ii in range(len(N_s[0])):
+                                            if N_s[0][ii] == N_s[2]:
+                                                temp2 = temp2 + 1
+                                    if len(N_s[1]) == len(N_s[2]):
+                                        if N_s[1] == N_s[2]:
+                                            temp2 = temp2 + 1
+                                    else:
+                                        for jj in range(len(N_s[1])):
+                                            if N_s[1][jj] == N_s[2]:
+                                                temp2 = temp2 + 1
+                                    num_index3 = all_2[temp2]
+                                    num_infer3 = s[num_index3 - len(N_s[2])]
+                                    if num_index3 + len(N_s[2]) < len(s):
+                                        num_back3 = s[num_index3 + len(N_s[2])]
+                                    else:
+                                        num_back3 = []
+                                    if isinstance(N_s[0], str):
+                                        N_s[0] = int(N_s[0])
+                                    if isinstance(N_s[1], str):
+                                        N_s[1] = int(N_s[1])
+                                    if isinstance(N_s[2], str):
+                                        N_s[2] = int(N_s[2])
+                                    for v in range(len_keyword_item2):
+                                        if C_s.find(keyword_item2[v]) != -1:
+                                            for w in range(len_keyword_item3):
+                                                if C_s.find(keyword_item3[w]) != -1:
+                                                    if (num_infer1 == '题' or num_infer1 == '空') and num_back1 == '分' and num_back3 == '分':  # 每题*分,共*题,共*分
+                                                        item_total_score = int(N_s[2])
+                                                        item_count = int(N_s[1])
+                                                        item_score = int(N_s[0])
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['item_N'] = -1
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif (num_infer1 == '题' or num_infer1 == '空') and num_back1 == '分' and num_back2 == '分':  # 每题*分,共*分,共*题
+                                                        item_total_score = int(N_s[1])
+                                                        item_count = int(N_s[2])
+                                                        item_score = int(N_s[0])
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['item_N'] = -1
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back3 == '分':  # 共*题,每题*分,共*分
+                                                        item_total_score = int(N_s[2])
+                                                        item_count = int(N_s[0])
+                                                        item_score = int(N_s[1])
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['item_N'] = -1
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back1 == '分':  # 共*分,每题*分,共*题
+                                                        item_total_score = int(N_s[0])
+                                                        item_count = int(N_s[2])
+                                                        item_score = int(N_s[1])
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['item_N'] = -1
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分' and num_back2 == '分':  # 共*题,共*分,每题*分
+                                                        item_total_score = int(N_s[1])
+                                                        item_count = int(N_s[0])
+                                                        item_score = int(N_s[2])
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['item_N'] = -1
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分' and num_back1 == '分':  # 共*分,共*题, 每题*分
+                                                        item_total_score = int(N_s[0])
+                                                        item_count = int(N_s[1])
+                                                        item_score = int(N_s[2])
+                                                        if item_total_score < item_count * item_score:
+                                                            item_total_score = item_count * item_score
+                                                        type_score_dict_ocr['item_N'] = -1
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                elif w == len_keyword_item3 - 1:
+                                                    if num_back2 == '分' and (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分':  # *,共*分,每题*分
+                                                        item_N = int(N_s[0])
+                                                        item_total_score = int(N_s[1])
+                                                        item_count = int(N_s[1]) / int(N_s[2])
+                                                        item_score = int(N_s[2])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif num_back3 == '分' and (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分':  # *,每题*分,共*分,
+                                                        item_N = int(N_s[0])
+                                                        item_total_score = int(N_s[2])
+                                                        item_count = int(N_s[2]) / int(N_s[1])
+                                                        item_score = int(N_s[1])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif num_back3 == '分' and num_infer2 == '.' and num_back1 == '.' and num_back2 == '分':  # 每题*.*分,共*分/共*.*分,每题*分
+                                                        item_N = -1
+                                                        if int(N_s[0]) > int(N_s[2]):
+                                                            item_total_score = int(N_s[0])
+                                                            item_score = int(N_s[2])
+                                                            item_count = int(item_total_score / item_score)
+                                                        else:
+                                                            item_total_score = int(N_s[2])
+                                                            item_score = float(N_s[0] + '.' + N_s[1])
+                                                            item_count = int(item_total_score / item_score)
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif num_back3 == '分' and num_infer3 == '.' and num_back2 == '.' and num_back1 == '分':  # 每题*分,共*.*分/共*分,每题*.*分
+                                                        item_N = -1
+                                                        if int(N_s[0]) > int(N_s[2]):
+                                                            item_total_score = int(N_s[0])
+                                                            item_score = float(N_s[1] + '.' + N_s[2])
+                                                            item_count = int(item_total_score / item_score)
+                                                        else:
+                                                            item_total_score = int(N_s[1])
+                                                            item_score = int(N_s[0])
+                                                            item_count = int(item_total_score / item_score)
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                            break
+                                        elif v == len_keyword_item2 - 1:
+                                            for w in range(len_keyword_item3):
+                                                if C_s.find(keyword_item3[w]) != -1:
+                                                    if num_back3 == '分' and num_infer3 == '.'and num_back2 == '.':  # *小题,共*.*分,
+                                                        item_N = -1
+                                                        item_total_score = int(N_s[1])
+                                                        item_count = int(N_s[0])
+                                                        item_score = int(N_s[1]) / int(N_s[0])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif num_back2 == '分' and num_infer2 == '.'and num_back1 == '.':  # 共*.*分,*小题
+                                                        item_N = -1
+                                                        item_total_score = int(N_s[0])
+                                                        item_count = int(N_s[1])
+                                                        item_score = int(N_s[0]) / int(N_s[2])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif num_back2 == '分':  # *,*小题,共*分,
+                                                        item_N = int(N_s[0])
+                                                        item_total_score = int(N_s[1])
+                                                        item_count = int(N_s[2])
+                                                        item_score = int(N_s[1]) / int(N_s[2])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif num_back3 == '分':  # *,共*分,共*小题
+                                                        item_N = int(N_s[0])
+                                                        item_total_score = int(N_s[2])
+                                                        item_count = int(N_s[1])
+                                                        item_score = int(N_s[2]) / int(N_s[1])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                elif w == len_keyword_item3 - 1:
+                                                    if num_back3 == '分' and num_infer3 == '.'and num_back2 == '.':  # *,共*.*分,
+                                                        item_N = int(N_s[0])
+                                                        item_total_score = int(N_s[1])
+                                                        item_count = -1
+                                                        item_score = -1
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                            break
+                                    break
+                                elif len(N_s) == 4:
+                                    num_index1 = s.index(N_s[0])
+                                    num_infer1 = s[num_index1 - len(N_s[0])]
+                                    num_back1 = s[num_index1 + len(N_s[0])]
+                                    all_1 = find_repeat(s, N_s[1])
+                                    temp1 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[1]:
+                                            temp1 = temp1 + 1
+                                    num_index2 = all_1[temp1]
+                                    num_infer2 = s[num_index2 - len(N_s[1])]
+                                    num_back2 = s[num_index2 + len(N_s[1])]
+                                    all_2 = find_repeat(s, N_s[2])
+                                    temp2 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[2]:
+                                            temp2 = temp2 + 1
+                                    for jj in range(len(N_s[1])):
+                                        if N_s[1][jj] == N_s[2]:
+                                            temp2 = temp2 + 1
+                                    num_index3 = all_2[temp2]
+                                    num_infer3 = s[num_index3 - len(N_s[2])]
+                                    num_back3 = s[num_index3 + len(N_s[2])]
+                                    all_3 = find_repeat(s, N_s[3])
+                                    temp3 = 0
+                                    for ii in range(len(N_s[0])):
+                                        if N_s[0][ii] == N_s[3]:
+                                            temp3 = temp3 + 1
+                                    for jj in range(len(N_s[1])):
+                                        if N_s[1][jj] == N_s[3]:
+                                            temp3 = temp3 + 1
+                                    for kk in range(len(N_s[2])):
+                                        if N_s[2][kk] == N_s[3]:
+                                            temp3 = temp3 + 1
+                                    num_index4 = all_3[temp3]
+                                    num_infer4 = s[num_index4 - len(N_s[3])]
+                                    num_back4 = s[num_index4 + len(N_s[3])]
+                                    if isinstance(N_s[0], str):
+                                        N_s[0] = int(N_s[0])
+                                    if isinstance(N_s[1], str):
+                                        N_s[1] = int(N_s[1])
+                                    if isinstance(N_s[2], str):
+                                        N_s[2] = int(N_s[2])
+                                    if isinstance(N_s[3], str):
+                                        N_s[3] = int(N_s[3])
+                                    for y in range(len_keyword_item1):
+                                        if C_s.find(keyword_item1[y]) != -1:
+                                            for z in range(len_keyword_item2):
+                                                if C_s.find(keyword_item2[z]) != -1:
+                                                    for u in range(len_keyword_item3):
+                                                        if C_s.find(keyword_item3[u]) != -1:
+                                                            if (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back4 == '分':  # *,每题*分,共*题,共*分
+                                                                item_N = int(N_s[0])
+                                                                item_total_score = int(N_s[3])
+                                                                item_count = int(N_s[2])
+                                                                item_score = int(N_s[1])
+                                                            elif (num_infer2 == '题' or num_infer2 == '空') and num_back2 == '分' and num_back3 == '分':  # *,每题*分,共*分,共*题
+                                                                item_N = int(N_s[0])
+                                                                item_total_score = int(N_s[2])
+                                                                item_count = int(N_s[3])
+                                                                item_score = int(N_s[1])
+                                                            elif (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分' and num_back4 == '分':  # *,共*题,每题*分,共*分
+                                                                item_N = int(N_s[0])
+                                                                item_total_score = int(N_s[3])
+                                                                item_count = int(N_s[1])
+                                                                item_score = int(N_s[2])
+                                                            elif (num_infer3 == '题' or num_infer3 == '空') and num_back3 == '分' and num_back2 == '分':  # *,共*分,每题*分,共*题
+                                                                item_N = int(N_s[0])
+                                                                item_total_score = int(N_s[1])
+                                                                item_count = int(N_s[3])
+                                                                item_score = int(N_s[2])
+                                                            elif (num_infer4 == '题' or num_infer4 == '空') and num_back4 == '分' and num_back3 == '分':  # *,共*题,共*分,每题*分
+                                                                item_N = int(N_s[0])
+                                                                item_total_score = int(N_s[2])
+                                                                item_count = int(N_s[1])
+                                                                item_score = int(N_s[3])
+                                                                type_score_dict_ocr['item_N'] = item_N
+                                                            elif (num_infer4 == '题' or num_infer4 == '空') and num_back4 == '分' and num_back2 == '分':  # *,共*分,共*题, 每题*分
+                                                                item_N = int(N_s[0])
+                                                                item_total_score = int(N_s[1])
+                                                                item_count = int(N_s[2])
+                                                                item_score = int(N_s[3])
+
+                                                            elif num_back1== '.' and num_infer2== '.' and num_back2 == '分'and num_back4 == '分' :  # 共*.*分,共*题, 每题*分/每题*.*分,共*题,共*分
+                                                                item_N = -1
+                                                                if int(N_s[0]) > int(N_s[3]):
+                                                                    item_total_score = int(N_s[0])
+                                                                    item_score = int(N_s[3])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = int(N_s[3])
+                                                                    item_score = float(N_s[0] + '.' + N_s[1])
+                                                                    item_count = int(item_total_score / item_score)
+                                                            elif num_back3== '.' and num_infer4== '.' and num_back1 == '分'and num_back4 == '分' :  # 共*分,共*题, 每题*.*分/每题*分,共*题,共*.*分
+                                                                item_N = -1
+                                                                if int(N_s[0]) > int(N_s[2]):
+                                                                    item_total_score = int(N_s[0])
+                                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = int(N_s[2])
+                                                                    item_score = int(N_s[0])
+                                                                    item_count = int(item_total_score / item_score)
+                                                            elif num_back2== '.' and num_infer3== '.' and num_back3 == '分'and num_back4 == '分' :  # 共*题,共*.*分,每题*分/共*题,每题*.*分,共*分
+                                                                item_N = -1
+                                                                if int(N_s[1]) > int(N_s[3]):
+                                                                    item_total_score = int(N_s[1])
+                                                                    item_score = int(N_s[3])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = int(N_s[3])
+                                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                                    item_count = int(item_total_score / item_score)
+                                                            elif num_back3== '.' and num_infer4== '.' and num_back4 == '分'and num_back2 == '分' :  # 共*题,共*分,每题*.*分/共*题,每题*分,共*.*分
+                                                                item_N = -1
+                                                                if int(N_s[1]) > int(N_s[2]):
+                                                                    item_total_score = int(N_s[1])
+                                                                    item_score = float(N_s[2] + '.' + N_s[3])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = int(N_s[2])
+                                                                    item_score = int(N_s[1])
+                                                                    item_count = int(item_total_score / item_score)
+                                                            elif num_back1== '.' and num_infer2== '.' and num_back2 == '分'and num_back3 == '分' :  # 每题*.*分,共*分,共*题/共*.*分,每题*分,共*题
+                                                                item_N = -1
+                                                                if int(N_s[0]) > int(N_s[2]):
+                                                                    item_total_score = int(N_s[0])
+                                                                    item_score = int(N_s[2])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = int(N_s[2])
+                                                                    item_score = float(N_s[0] + '.' + N_s[1])
+                                                                    item_count = int(item_total_score / item_score)
+                                                            elif num_back2== '.' and num_infer3== '.' and num_back3 == '分'and num_back1 == '分' :  # 每题*分,共*.*分,共*题/共*分,每题*.*分,共*题
+                                                                item_N = -1
+                                                                if int(N_s[0]) > int(N_s[1]):
+                                                                    item_total_score = int(N_s[0])
+                                                                    item_score = float(N_s[1] + '.' + N_s[2])
+                                                                    item_count = int(item_total_score / item_score)
+                                                                else:
+                                                                    item_total_score = int(N_s[1])
+                                                                    item_score = int(N_s[0])
+                                                                    item_count = int(item_total_score / item_score)
+                                                            else:
+                                                                break
+                                                            if item_total_score < item_count * item_score:
+                                                                item_total_score = item_count * item_score
+                                                            type_score_dict_ocr['item_N'] = item_N
+                                                            type_score_dict_ocr['item_total_score'] = item_total_score
+                                                            type_score_dict_ocr['item_count'] = item_count
+                                                            type_score_dict_ocr['item_score'] = item_score
+                                                            Score_structure_item = type_score_dict_ocr
+                                                            Score_structure.append(Score_structure_item)
+                                                            all_structure = {'volume_structure': -1,
+                                                                             'Score_structure': Score_structure}
+                                                            break
+
+
+
+                                                    break
+                                            break
+                                    break
+                            elif x == len_keyword_item1 - 1:
+                                for y in range(len_keyword_item2):
+                                    if C_s.find(keyword_item2[y]) != -1:
+                                        if len(N_s) == 1:
+                                            num_index1 = s.index(N_s[0])
+                                            num_infer1 = s[num_index1 - len(N_s[0])]
+                                            num_back1 = s[num_index1 + len(N_s[0])]
+                                            if isinstance(N_s[0], str):
+                                                N_s[0] = int(N_s[0])
+                                            if num_back1 == '分':  # 每题*分
+                                                item_score = int(N_s[0])
+                                                type_score_dict_ocr['item_N'] = -1
+                                                type_score_dict_ocr['item_total_score'] = -1
+                                                type_score_dict_ocr['item_count'] = -1
+                                                type_score_dict_ocr['item_score'] = item_score
+                                                Score_structure_item = type_score_dict_ocr
+                                                Score_structure.append(Score_structure_item)
+                                                all_structure = {'volume_structure': -1,
+                                                                 'Score_structure': Score_structure}
+                                                break
+                                        elif len(N_s) == 2:
+                                            num_index1 = s.index(N_s[0])
+                                            num_infer1 = s[num_index1 - len(N_s[0])]
+                                            num_back1 = s[num_index1 + len(N_s[0])]
+                                            all_1 = find_repeat(s, N_s[1])
+                                            temp1 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[1]:
+                                                    temp1 = temp1 + 1
+                                            num_index2 = all_1[temp1]
+                                            num_infer2 = s[num_index2 - len(N_s[1])]
+                                            num_back2 = s[num_index2 + len(N_s[1])]
+                                            if isinstance(N_s[0], str):
+                                                N_s[0] = int(N_s[0])
+                                            if isinstance(N_s[1], str):
+                                                N_s[1] = int(N_s[1])
+                                            for z in range(len(keyword_item3)):
+                                                if C_s.find(keyword_item3[z]) != -1:
+                                                    if num_back2 == '分':  # 共*题,每题*分
+                                                        item_total_score = int(N_s[0]) * int(N_s[1])
+                                                        item_count = int(N_s[0])
+                                                        item_score = int(N_s[1])
+                                                        type_score_dict_ocr['item_N'] = -1
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif num_back1 == '分':  # 每题*分,共*题
+                                                        item_total_score = int(N_s[0]) * int(N_s[1])
+                                                        item_count = int(N_s[1])
+                                                        item_score = int(N_s[0])
+                                                        type_score_dict_ocr['item_N'] = -1
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = item_count
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                elif z == len(keyword_item3) - 1:
+                                                    if num_back2 == '分' and num_infer2 == '.' and num_back1 == '.':  # *.*分
+                                                        item_N = -1
+                                                        item_score = float(N_s[0] + '.'+N_s[1])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = -1
+                                                        type_score_dict_ocr['item_count'] = -1
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                                    elif num_back2 == '分':  # *,*分
+                                                        item_N = int(N_s[0])
+                                                        item_score = int(N_s[1])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = -1
+                                                        type_score_dict_ocr['item_count'] = -1
+                                                        type_score_dict_ocr['item_score'] = item_score
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                            break
+                                        elif len(N_s) == 3:  # 9月16号修改
+                                            num_index1 = s.index(N_s[0])
+                                            num_infer1 = s[num_index1 - len(N_s[0])]
+                                            num_back1 = s[num_index1 + len(N_s[0])]
+                                            all_1 = find_repeat(s, N_s[1])
+                                            temp1 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[1]:
+                                                    temp1 = temp1 + 1
+                                            num_index2 = all_1[temp1]
+                                            num_infer2 = s[num_index2 - len(N_s[1])]
+                                            num_back2 = s[num_index2 + len(N_s[1])]
+                                            all_2 = find_repeat(s, N_s[2])
+                                            temp2 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[2]:
+                                                    temp2 = temp2 + 1
+                                            for jj in range(len(N_s[1])):
+                                                if N_s[1][jj] == N_s[2]:
+                                                    temp2 = temp2 + 1
+                                            num_index3 = all_2[temp2]
+                                            num_infer3 = s[num_index3 - len(N_s[2])]
+                                            num_back3 = s[num_index3 + len(N_s[2])]
+                                            if isinstance(N_s[0], str):
+                                                N_s[0] = int(N_s[0])
+                                            if isinstance(N_s[1], str):
+                                                N_s[1] = int(N_s[1])
+                                            if isinstance(N_s[2], str):
+                                                N_s[2] = int(N_s[2])
+
+                                            if num_back3 == '分' and num_back2 != '分' and num_infer3 != '.':  # *,共*题,每题*分
+                                                item_N = int(N_s[0])
+                                                item_total_score = int(N_s[1]) * int(N_s[2])
+                                                item_count = int(N_s[1])
+                                                item_score = int(N_s[2])
+                                            elif num_back2 == '分' and num_back3 != '分' and num_infer2 != '.':  # *,每题*分,共*题
+                                                item_N = int(N_s[0])
+                                                item_total_score = int(N_s[1]) * int(N_s[2])
+                                                item_count = int(N_s[2])
+                                                item_score = int(N_s[1])
+                                                type_score_dict_ocr['item_N'] = item_N
+                                                type_score_dict_ocr['item_total_score'] = item_total_score
+                                                type_score_dict_ocr['item_count'] = item_count
+                                                type_score_dict_ocr['item_score'] = item_score
+                                                Score_structure_item = type_score_dict_ocr
+                                                Score_structure.append(Score_structure_item)
+                                                all_structure = {'volume_structure': -1,
+                                                                 'Score_structure': Score_structure}
+                                            elif num_back2 == '分' and num_back3 == '分'and num_infer3 != '.' and num_infer2 != '.':  # *,*分,每题*分
+                                                item_N = int(N_s[0])
+                                                item_total_score = int(N_s[1])
+                                                item_count = int(N_s[1]) / int(N_s[2])
+                                                item_score = int(N_s[2])
+                                                type_score_dict_ocr['item_N'] = item_N
+                                                type_score_dict_ocr['item_total_score'] = item_total_score
+                                                type_score_dict_ocr['item_count'] = item_count
+                                                type_score_dict_ocr['item_score'] = item_score
+                                                Score_structure_item = type_score_dict_ocr
+                                                Score_structure.append(Score_structure_item)
+                                                all_structure = {'volume_structure': -1,
+                                                                 'Score_structure': Score_structure}
+                                            elif num_back3 == '分' and num_back2 == '.' and num_infer3 == '.' and (num_back1 =='题' or num_back1 =='小'or num_back1 =='空'):  # 共*题,每题*.*分
+                                                item_N = -1
+                                                item_count = int(N_s[0])
+                                                item_score = float(N_s[1]+'.'+N_s[2])
+                                                item_total_score = int(item_score * item_count)
+                                                type_score_dict_ocr['item_N'] = item_N
+                                                type_score_dict_ocr['item_total_score'] = item_total_score
+                                                type_score_dict_ocr['item_count'] = item_count
+                                                type_score_dict_ocr['item_score'] = item_score
+                                                Score_structure_item = type_score_dict_ocr
+                                                Score_structure.append(Score_structure_item)
+                                                all_structure = {'volume_structure': -1,
+                                                                 'Score_structure': Score_structure}
+                                            elif num_back2 == '分' and num_back1 == '.' and num_infer2 == '.'and (num_back3 =='题' or num_back3 =='小'or num_back3 =='空'):  # 每题*.*分,共*题
+                                                item_N = -1
+                                                item_count = int(N_s[2])
+                                                item_score = float(N_s[0] + '.' + N_s[1])
+                                                item_total_score = int(item_score * item_count)
+                                                type_score_dict_ocr['item_N'] = item_N
+                                                type_score_dict_ocr['item_total_score'] = item_total_score
+                                                type_score_dict_ocr['item_count'] = item_count
+                                                type_score_dict_ocr['item_score'] = item_score
+                                                Score_structure_item = type_score_dict_ocr
+                                                Score_structure.append(Score_structure_item)
+                                                all_structure = {'volume_structure': -1,
+                                                                 'Score_structure': Score_structure}
+                                            elif num_back3 == '分' and num_back2 == '.' and num_infer3 == '.':  # *,每题*.*分
+                                                item_N = N_s[0]
+                                                item_count = -1
+                                                item_score = float(N_s[1] + '.' + N_s[2])
+                                                item_total_score = -1
+                                                type_score_dict_ocr['item_N'] = item_N
+                                                type_score_dict_ocr['item_total_score'] = item_total_score
+                                                type_score_dict_ocr['item_count'] = item_count
+                                                type_score_dict_ocr['item_score'] = item_score
+                                                Score_structure_item = type_score_dict_ocr
+                                                Score_structure.append(Score_structure_item)
+                                                all_structure = {'volume_structure': -1,
+                                                                 'Score_structure': Score_structure}
+                                        elif len(N_s) == 4:  # 9月16号修改
+                                            num_index1 = s.index(N_s[0])
+                                            num_infer1 = s[num_index1 - len(N_s[0])]
+                                            num_back1 = s[num_index1 + len(N_s[0])]
+                                            all_1 = find_repeat(s, N_s[1])
+                                            temp1 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[1]:
+                                                    temp1 = temp1 + 1
+                                            num_index2 = all_1[temp1]
+                                            num_infer2 = s[num_index2 - len(N_s[1])]
+                                            num_back2 = s[num_index2 + len(N_s[1])]
+                                            all_2 = find_repeat(s, N_s[2])
+                                            temp2 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[2]:
+                                                    temp2 = temp2 + 1
+                                            for jj in range(len(N_s[1])):
+                                                if N_s[1][jj] == N_s[2]:
+                                                    temp2 = temp2 + 1
+                                            num_index3 = all_2[temp2]
+                                            num_infer3 = s[num_index3 - len(N_s[2])]
+                                            num_back3 = s[num_index3 + len(N_s[2])]
+                                            all_3 = find_repeat(s, N_s[3])
+                                            temp3 = 0
+                                            for ii in range(len(N_s[0])):
+                                                if N_s[0][ii] == N_s[3]:
+                                                    temp3 = temp3 + 1
+                                            for jj in range(len(N_s[1])):
+                                                if N_s[1][jj] == N_s[2]:
+                                                    temp3 = temp3 + 1
+                                            num_index4 = all_3[temp3]
+                                            num_infer4 = s[num_index4 - len(N_s[3])]
+                                            num_back4 = s[num_index4 + len(N_s[3])]
+                                            if isinstance(N_s[0], str):
+                                                N_s[0] = int(N_s[0])
+                                            if isinstance(N_s[1], str):
+                                                N_s[1] = int(N_s[1])
+                                            if isinstance(N_s[2], str):
+                                                N_s[2] = int(N_s[2])
+                                            if isinstance(N_s[3], str):
+                                                N_s[3] = int(N_s[3])
+
+                                            if num_back4 == '分' and num_back2 != '分' and num_infer4 == '.'and num_back3 == '.':  # *,共*题,每题*.*分
+                                                item_N = int(N_s[0])
+                                                item_count = int(N_s[1])
+                                                item_score = float(N_s[2]+'.'+N_s[3])
+                                                item_total_score = int(item_score * item_count)
+                                                type_score_dict_ocr['item_N'] = item_N
+                                                type_score_dict_ocr['item_total_score'] = item_total_score
+                                                type_score_dict_ocr['item_count'] = item_count
+                                                type_score_dict_ocr['item_score'] = item_score
+                                                Score_structure_item = type_score_dict_ocr
+                                                Score_structure.append(Score_structure_item)
+                                                all_structure = {'volume_structure': -1,
+                                                                 'Score_structure': Score_structure}
+
+                                            elif num_back3 == '分' and num_back4 != '分' and num_infer3 == '.'and num_back2 == '.':  # *,每题*.*分,共*题
+                                                item_N = int(N_s[0])
+                                                item_count = int(N_s[3])
+                                                item_score = float(N_s[1] + '.' + N_s[2])
+                                                item_total_score = int(item_score * item_count)
+                                                type_score_dict_ocr['item_N'] = item_N
+                                                type_score_dict_ocr['item_total_score'] = item_total_score
+                                                type_score_dict_ocr['item_count'] = item_count
+                                                type_score_dict_ocr['item_score'] = item_score
+                                                Score_structure_item = type_score_dict_ocr
+                                                Score_structure.append(Score_structure_item)
+                                                all_structure = {'volume_structure': -1,
+                                                                 'Score_structure': Score_structure}
+                                        break
+                                    elif y == len_keyword_item2 - 1:
+                                        if C_s.find(keyword_item4[0]) != -1:
+                                            if len(N_s) == 2:  # *,*分
+                                                num_index1 = s.index(N_s[0])
+                                                num_infer1 = s[num_index1 - len(N_s[0])]
+                                                num_back1 = s[num_index1 + len(N_s[0])]
+                                                if num_infer1 == '( ' or num_back1 == ')' or num_infer1 == '( ' or num_back1 == ')':
+                                                    break
+                                                else:
+                                                    all_1 = find_repeat(s, N_s[1])
+                                                    temp1 = 0
+                                                    for ii in range(len(N_s[0])):
+                                                        if N_s[0][ii] == N_s[1]:
+                                                            temp1 = temp1 + 1
+                                                    num_index2 = all_1[temp1]
+                                                    num_infer2 = s[num_index2 - len(N_s[1])]
+                                                    num_back2 = s[num_index2 + len(N_s[1])]
+                                                    if isinstance(N_s[0], str):
+                                                        N_s[0] = int(N_s[0])
+                                                    if isinstance(N_s[1], str):
+                                                        N_s[1] = int(N_s[1])
+                                                    if int(N_s[0]) > 1000:
+                                                        item_N =0
+                                                        item_N1 = int(N_s[0][-4] + N_s[0][-3])
+                                                        item_N2 = int(N_s[0][-2] + N_s[0][-1])
+                                                        if item_N2 - item_N1 == 1:
+                                                            item_N = [0, 0]
+                                                            item_N = [item_N1, item_N2]
+                                                        elif item_N2 - item_N1 == 2:
+                                                            item_N = [0, 0, 0]
+                                                            item_N = [item_N1, item_N1 + 1, item_N2]
+                                                        elif item_N2 - item_N1 == 3:
+                                                            item_N = [0, 0, 0, 0]
+                                                            item_N = [item_N1, item_N1 + 1, item_N1 + 2, item_N2]
+                                                        type_score_dict_ocr['item_N'] = item_N
+
+                                                    else:
+                                                        item_N = int(N_s[0])
+                                                        item_total_score = int(N_s[1])
+                                                        type_score_dict_ocr['item_N'] = item_N
+                                                        type_score_dict_ocr['item_total_score'] = item_total_score
+                                                        type_score_dict_ocr['item_count'] = -1
+                                                        type_score_dict_ocr['item_score'] = -1
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                    break
+                                            elif len(N_s) == 3:  # *,*分
+                                                num_index1 = s.index(N_s[0])
+                                                num_infer1 = s[num_index1 - len(N_s[0])]
+                                                num_back1 = s[num_index1 + len(N_s[0])]
+                                                all_1 = find_repeat(s, N_s[1])
+                                                temp1 = 0
+                                                for ii in range(len(N_s[0])):
+                                                    if N_s[0][ii] == N_s[1]:
+                                                        temp1 = temp1 + 1
+                                                num_index2 = all_1[temp1]
+                                                num_infer2 = s[num_index2 - len(N_s[1])]
+                                                num_back2 = s[num_index2 + len(N_s[1])]
+                                                all_2 = find_repeat(s, N_s[2])
+                                                temp2 = 0
+                                                for ii in range(len(N_s[0])):
+                                                    if N_s[0][ii] == N_s[2]:
+                                                        temp2 = temp2 + 1
+                                                for jj in range(len(N_s[1])):
+                                                    if N_s[1][jj] == N_s[2]:
+                                                        temp2 = temp2 + 1
+                                                num_index3 = all_2[temp2]
+                                                num_infer3 = s[num_index3 - len(N_s[2])]
+
+                                                if num_index3 + len(N_s[2]) < len(s):
+                                                    num_back3 = s[num_index3 + len(N_s[2])]
+                                                else:
+                                                    num_back3 = []
+                                                if isinstance(N_s[0], str):
+                                                    N_s[0] = int(N_s[0])
+                                                if isinstance(N_s[1], str):
+                                                    N_s[1] = int(N_s[1])
+                                                if isinstance(N_s[2], str):
+                                                    N_s[2] = int(N_s[2])
+
+                                                if num_back2 == '分':
+                                                    item_N = N_s[0]
+                                                    item_total_score = int(N_s[1])
+                                                    type_score_dict_ocr['item_total_score'] = item_total_score
+                                                    item_total_score = int(N_s[2])
+                                                    type_score_dict_ocr['item_total_score'] = item_total_score
+                                                    type_score_dict_ocr['item_N'] = item_N
+                                                    type_score_dict_ocr['item_count'] = -1
+                                                    type_score_dict_ocr['item_score'] = -1
+                                                    Score_structure_item = type_score_dict_ocr
+                                                    Score_structure.append(Score_structure_item)
+                                                    all_structure = {'volume_structure': -1,
+                                                                     'Score_structure': Score_structure}
+                                                    break
+                                                elif num_back3 == '分':
+                                                    if int(N_s[1]) - int(N_s[0]) == 1:
+                                                        item_N = [0, 0]
+                                                        item_N = [int(N_s[0]), int(N_s[1])]
+                                                    elif int(N_s[1]) - int(N_s[0]) == 2:
+                                                        item_N = [0, 0, 0]
+                                                        item_N = [int(N_s[0]), int(N_s[0]) + 1, int(N_s[1])]
+                                                    elif int(N_s[1]) - int(N_s[0]) == 3:
+                                                        item_N = [0, 0, 0, 0]
+                                                        item_N = [int(N_s[0]), int(N_s[0]) + 1, int(N_s[0]) + 2,
+                                                                  int(N_s[1])]
+                                                    else:
+                                                        break
+                                                    item_total_score = int(N_s[2])
+                                                    type_score_dict_ocr['item_total_score'] = item_total_score
+                                                    type_score_dict_ocr['item_N'] = item_N
+                                                    type_score_dict_ocr['item_count'] = -1
+                                                    type_score_dict_ocr['item_score'] = -1
+                                                    Score_structure_item = type_score_dict_ocr
+                                                    Score_structure.append(Score_structure_item)
+                                                    all_structure = {'volume_structure': -1,
+                                                                     'Score_structure': Score_structure}
+                                                    break
+                                            elif len(N_s) == 1:
+                                                num_index1 = s.index(N_s[0])
+                                                num_infer1 = s[num_index1 - len(N_s[0])]
+                                                if num_index1 + len(N_s[0]) < len(s):
+                                                    num_back1 = s[num_index1 + len(N_s[0])]
+                                                    item_total_score = int(N_s[0])
+                                                    type_score_dict_ocr['item_N'] = -1
+                                                    type_score_dict_ocr['item_total_score'] = item_total_score
+                                                    type_score_dict_ocr['item_count'] = -1
+                                                    type_score_dict_ocr['item_score'] = -1
+                                                    if num_back1 == '分':  # *分
+                                                        Score_structure_item = type_score_dict_ocr
+                                                        Score_structure.append(Score_structure_item)
+                                                        all_structure = {'volume_structure': -1,
+                                                                         'Score_structure': Score_structure}
+                                                        break
+                                break
+
+    return all_structure

+ 218 - 0
segment/sheet_resolve/analysis/sheet/ocr_sheet.py

@@ -0,0 +1,218 @@
+# @Author  : lightXu
+# @File    : ocr_sheet.py
+import re
+import numpy as np
+import xml.etree.cElementTree as ET
+from segment.sheet_resolve.tools.utils import create_xml
+from segment.sheet_resolve.analysis.sheet.sheet_adjust import adjust_item_edge_by_gray_image
+
+
+def subfield_answer_sheet(img0, answer_sheet):
+    """Estimate up to two x coordinates that separate the columns of an answer sheet.
+
+    :param img0: sheet image; only ``img0.shape[:2]`` (height, width) is read.
+    :param answer_sheet: iterable of region dicts, each carrying a ``class_name``
+        and a ``bounding_box`` dict with ``xmin`` / ``xmax`` entries.
+    :return: ``(line_xmax_1, line_xmax_2)``. A value of 0 means that boundary was
+        not found. Portrait images (h > w) and sheets without any key module
+        return ``(0, 0)``.
+    """
+    h, w = img0.shape[:2]
+    line_xmax_1 = 0
+    line_xmax_2 = 0
+
+    # Provisional rule: a sheet that is taller than wide is a single column.
+    if h > w:
+        return line_xmax_1, line_xmax_2
+
+    w_int_3 = round(w / 3)
+    w_int_4 = round(w / 4)
+    w_int_8 = max(round(w / 8), 50)  # overlap tolerance, never below 50 px
+
+    key_modules_classes = ['choice', 'cloze', 'solve', 'solve0', 'composition0', 'composition', 'correction',
+                           'ban_area', ]
+    modules = [ele for ele in answer_sheet if ele["class_name"] in key_modules_classes]
+    if not modules:
+        # Nothing to measure; previously this fell through to modules_xmax[-1]
+        # and raised IndexError.
+        return line_xmax_1, line_xmax_2
+
+    modules_xmin = sorted(modules, key=lambda x: (x['bounding_box']['xmin']))
+    modules_xmax = sorted(modules, key=lambda x: (x['bounding_box']['xmax']))
+
+    found_first = False   # a first boundary has been fixed (or assumed)
+    found_second = False  # a second boundary has been fixed
+    pending_xmax = []     # right edges of modules that overlap their successor heavily
+
+    for i in range(len(modules_xmin) - 1):
+        if i == 0 and modules_xmin[0]['bounding_box']['xmin'] - 0 > w_int_4:
+            # The left-most module starts beyond a quarter of the width;
+            # presumably the first column holds no key module. The first pair is
+            # not inspected in this case (kept from the original logic).
+            found_first = True
+            continue
+
+        cur_xmax = modules_xmin[i]['bounding_box']['xmax']
+        next_xmin = modules_xmin[i + 1]['bounding_box']['xmin']
+        gap = next_xmin - cur_xmax
+
+        if gap > w_int_4:
+            # A gap wider than a quarter of the sheet: both boundaries hug the gap.
+            if pending_xmax == []:
+                line_xmax_1 = cur_xmax + 20
+            else:
+                pending_xmax.append(cur_xmax)
+                line_xmax_1 = max(pending_xmax) + 20
+                pending_xmax = []
+            line_xmax_2 = next_xmin - 20
+            found_first = True
+            found_second = True
+            break
+        elif gap > -w_int_8:
+            # Modules are adjacent (small gap or small overlap): put the boundary
+            # midway between the right-most edge seen so far and the next module.
+            if pending_xmax == []:
+                left_edge = cur_xmax
+                had_pending = False
+            else:
+                pending_xmax.append(cur_xmax)
+                left_edge = max(pending_xmax)
+                had_pending = True
+            midpoint = int((next_xmin + left_edge) / 2)
+            if not found_first:
+                line_xmax_1 = midpoint
+                if had_pending:
+                    pending_xmax = []
+                found_first = True
+            else:
+                # NOTE(review): the pending list is intentionally not cleared here,
+                # matching the original behaviour.
+                line_xmax_2 = midpoint
+                found_second = True
+        else:
+            # Heavy horizontal overlap: treated as the same column, remember the edge.
+            pending_xmax.append(cur_xmax)
+
+    # Fallbacks based on how far the right-most module is from the right border.
+    rightmost = modules_xmax[-1]['bounding_box']['xmax']
+    if not found_first and not found_second:
+        if rightmost - w < -(2 * w_int_4):
+            line_xmax_1 = rightmost + 20
+            line_xmax_2 = 2 * w_int_3
+        elif rightmost - w < -w_int_4:
+            line_xmax_1 = rightmost + 20
+    elif found_first and not found_second:
+        if rightmost - w < -w_int_4:
+            line_xmax_2 = 2 * w_int_3
+
+    return line_xmax_1, line_xmax_2
+
+
+def tell_columns(image, sheet_dict):
+    """Return the x coordinates at which the sheet is split into columns.
+
+    The two boundaries reported by ``subfield_answer_sheet`` are kept when they
+    are non-zero; if neither is set, the single split ``image width - 1`` is
+    returned. Edge adjustment via ``adjust_item_edge_by_gray_image`` is
+    currently disabled.
+    """
+    height, width = image.shape[0], image.shape[1]
+    first_x, second_x = subfield_answer_sheet(image, sheet_dict)
+
+    split_x = []
+    for boundary in (first_x, second_x):
+        if boundary != 0:
+            split_x.append(boundary)
+
+    if len(split_x) == 0:
+        return [width - 1]
+    return split_x
+
+
+def ocr2sheet(image, sheet_dict, raw_ocr, xml_path=None):
+    """Group OCR text lines into rectangular blocks, column by column.
+
+    :param image: sheet image, forwarded to ``tell_columns``.
+    :param sheet_dict: detected regions, forwarded to ``tell_columns``.
+    :param raw_ocr: list of OCR line dicts with ``words`` (str) and ``location``
+        (dict with ``top``, ``left``, ``width``, ``height``). Each ``location``
+        is updated in place with ``right``, ``bottom``, ``mid_x`` and ``mid_y``.
+    :param xml_path: unused; kept for interface compatibility (the XML export
+        that used it is disabled).
+    :return: list of ``{'loc': [left, top, right, bottom]}`` dicts.
+    """
+    col_split_list = tell_columns(image, sheet_dict)
+    chn_char_p = '[\u4e00-\u9fa5]'  # chinese
+    pattern_list = [chn_char_p]
+
+    # Add derived edges and centre points to every OCR location.
+    for words_line in raw_ocr:
+        loc = words_line['location']
+        top = int(loc['top'])
+        left = int(loc['left'])
+        width = int(loc['width'])
+        height = int(loc['height'])
+        loc.update({'right': left + width, 'bottom': top + height,
+                    'mid_x': left + width // 2, 'mid_y': top + height // 2})
+
+    # Distribute the lines over columns by comparing mid_x with the split positions.
+    raw_ocr = sorted(raw_ocr, key=lambda x: x['location']['mid_x'])
+    mid_x_list = [ele['location']['mid_x'] for ele in raw_ocr]
+    col_list = []
+    for split in col_split_list:
+        mid_x_list.append(split)
+        mid_x_list = sorted(mid_x_list)
+        split_index = mid_x_list.index(split)  # number of lines left of the split
+        col_list.append(raw_ocr[:split_index])
+        raw_ocr = raw_ocr[split_index:]
+        mid_x_list = mid_x_list[split_index+1:]
+
+    if raw_ocr:
+        col_list.append(raw_ocr)
+
+    block_list = []
+    for ocr_res in col_list:
+        ocr_res = sorted(ocr_res, key=lambda x: x['location']['top'])
+
+        # Indices of wide (width >= height) lines containing at least one
+        # Chinese character; these act as separators between blocks.
+        raw_chn_index = []
+        for i, words_line in enumerate(ocr_res):
+            words = words_line['words']
+            loc = words_line['location']
+            if int(loc['width']) >= int(loc['height']):
+                if any(re.search(p, words) for p in pattern_list):
+                    raw_chn_index.append(i)
+
+        wide_locs = [ele['location'] for ele in ocr_res
+                     if ele['location']['width'] >= ele['location']['height']]
+        if not wide_locs:
+            # Empty column, or only tall lines: no block can be derived, and
+            # min()/max() below would raise ValueError on an empty sequence.
+            continue
+        left_limit = min(loc['left'] for loc in wide_locs) - 10
+        right_limit = max(loc['right'] for loc in wide_locs) + 10
+
+        # Append this column's last line as a sentinel unless it is already a
+        # separator. The original compared against the length of the whole OCR
+        # list, which duplicated the last index on multi-column sheets.
+        chn_index = raw_chn_index.copy()
+        last_index = len(ocr_res) - 1
+        if last_index not in raw_chn_index:
+            chn_index.append(last_index)
+        split_index_arr = np.array(chn_index)
+
+        numbers_interval = np.abs(split_index_arr[1:] - split_index_arr[:-1])
+        mean_interval = np.mean(numbers_interval) if numbers_interval.size else 0
+
+        # A block lies between two separators that are unusually far apart.
+        block_starts = [i for i, interval in enumerate(numbers_interval)
+                        if interval > mean_interval and interval > 2]
+
+        for start in block_starts:
+            top_limit = raw_chn_index[start]
+            if top_limit == len(ocr_res) - 1:
+                break
+
+            # Line just above the lower separator.
+            bottom_limit = chn_index[start + 1]
+            if bottom_limit in raw_chn_index:
+                # Walk upwards past tall lines so the bottom edge comes from a wide line.
+                while int(ocr_res[bottom_limit - 1]["location"]['height']) >= int(
+                        ocr_res[bottom_limit - 1]["location"]['width']):
+                    bottom_limit = bottom_limit - 1
+
+                bottom = int(
+                    ocr_res[bottom_limit - 1]["location"]["top"] +
+                    ocr_res[bottom_limit - 1]["location"]["height"] * 1.2)
+            else:
+                bottom_limit = chn_index[-1]
+                bottom = int(
+                    ocr_res[bottom_limit]["location"]["top"] +
+                    ocr_res[bottom_limit]["location"]["height"] * 1.2)
+
+            # Line just below the upper separator.
+            top = int(ocr_res[top_limit + 1]["location"]["top"] - 0.2 * ocr_res[top_limit + 1]["location"]["height"])
+
+            block_list.append({'loc': [left_limit, top, right_limit, bottom]})
+
+    # Disabled XML export of the blocks:
+    # tree = ET.parse(xml_path)
+    # for index, choice_m in enumerate(block_list):
+    #     xmin, ymin, xmax, ymax = choice_m["loc"]
+    #     tree = create_xml(f'block_{index}', tree, str(xmin), str(ymin), str(xmax), str(ymax))
+    #
+    # tree.write(xml_path)
+
+    return block_list

+ 485 - 0
segment/sheet_resolve/analysis/sheet/sheet_adjust.py

@@ -0,0 +1,485 @@
+# @Author  : mbq
+# @File    : sheet_adjust.py
+# @Time    : 2019/9/26 0026 上午 10:12
+import copy
+import json
+import os
+
+import cv2
+import numpy as np
+
+''' 根据CV检测矩形框 调整模型输出框'''
+''' LSD直线检测 暂时改用 霍夫曼检测'''
+
+
+# 用户自己计算阈值
+def custom_threshold(gray, type_inv=cv2.THRESH_BINARY):
+    """Binarise ``gray`` with a threshold of ``min(230, mean pixel value)``.
+
+    :param gray: single-channel image (the input is not converted to grey here).
+    :param type_inv: OpenCV threshold type passed to ``cv2.threshold``.
+    :return: the thresholded image.
+    """
+    rows, cols = gray.shape[:2]
+    pixel_count = cols * rows
+    flattened = np.reshape(gray, [1, pixel_count])
+    level = min(230, flattened.sum() / pixel_count)
+    _, thresholded = cv2.threshold(gray, level, 255, type_inv)
+    return thresholded
+
+
+# 开运算
+def open_img(image_bin, kera=(5, 5)):
+    """Apply a morphological opening with a rectangular kernel of size ``kera``."""
+    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kera)
+    return cv2.morphologyEx(image_bin, cv2.MORPH_OPEN, rect_kernel)
+
+
+# 闭运算
+def close_img(image_bin, kera=(5, 5)):
+    """Apply a morphological closing with a rectangular kernel of size ``kera``."""
+    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kera)
+    return cv2.morphologyEx(image_bin, cv2.MORPH_CLOSE, rect_kernel)
+
+
+# 腐蚀
+def erode_img(image, kernel_size):
+    """Erode ``image`` with a square all-ones kernel of side ``kernel_size``."""
+    square = np.ones((kernel_size, kernel_size), np.uint8)
+    return cv2.erode(image, square)
+
+
+# 膨胀
+def dilation_img(image, kernel_size):
+    """Dilate ``image`` with a square all-ones kernel of side ``kernel_size``."""
+    square = np.ones((kernel_size, kernel_size), np.uint8)
+    return cv2.dilate(image, square)
+
+
+# 图像padding
+def image_padding(image, padding_w, padding_h):
+    """Place ``image`` on a larger zero-filled uint8 canvas.
+
+    The canvas is ``padding_h`` rows taller and ``padding_w`` columns wider than
+    the input; the image is pasted at offset ``(int(padding_h / 2),
+    int(padding_w / 2))``. Inputs with three dimensions get a 3-channel canvas.
+    """
+    src_h, src_w = image.shape[:2]
+    canvas_shape = (src_h + padding_h, src_w + padding_w)
+    if len(image.shape) == 3:
+        canvas_shape = canvas_shape + (3,)
+    canvas = np.zeros(canvas_shape, np.uint8)
+
+    row0 = int(padding_h / 2)
+    col0 = int(padding_w / 2)
+    canvas[row0:row0 + src_h, col0:col0 + src_w] = image
+    return canvas
+
+
+def horizontal_projection(img_bin, mut=0):
+    '''Project a binary image onto the x axis.
+
+    Counts the non-zero pixels of every column. Columns whose count does not
+    exceed ``mut`` are reported as 0.
+
+    :param img_bin: 2-D array (rows x columns).
+    :param mut: minimum count (exclusive) for a column to be kept.
+    :return: list of ``w`` ints, one per column.
+    '''
+    # Vectorised replacement for the per-pixel Python double loop.
+    column_counts = np.count_nonzero(img_bin, axis=0)
+    return [int(count) if count > mut else 0 for count in column_counts]
+
+
+def vertical_projection(img_bin, mut=0):
+    """Project a binary image onto the y axis.
+
+    Counts the non-zero pixels of every row. Rows whose count does not exceed
+    ``mut`` are reported as 0.
+
+    :param img_bin: 2-D array (rows x columns).
+    :param mut: minimum count (exclusive) for a row to be kept.
+    :return: list of ``h`` ints, one per row.
+    """
+    # Vectorised replacement for the per-pixel Python double loop.
+    row_counts = np.count_nonzero(img_bin, axis=1)
+    return [int(count) if count > mut else 0 for count in row_counts]
+
+
+def get_white_blok_pos(arry, blok_w=0):
+    '''Locate the runs of non-zero values in a projection histogram.
+
+    Each run is reported as ``(start, end)`` where ``end`` is the index of the
+    first zero after the run. A run that reaches the end of ``arry`` is closed
+    at ``len(arry) - 1`` instead. Only runs with ``end - start > blok_w`` are
+    returned.
+    '''
+    runs = []
+    inside_run = False
+    run_start = 0
+    for position, value in enumerate(arry):
+        if not inside_run:
+            if value:
+                run_start = position
+                inside_run = True
+        elif 0 == value:
+            inside_run = False
+            if position - run_start > blok_w:
+                runs.append((run_start, position))
+
+    if inside_run:
+        tail = len(arry) - 1
+        if tail - run_start > blok_w:
+            runs.append((run_start, tail))
+    return runs
+
+
+def get_decide_boberLpa(itemRe, itemGT):
+    '''
+    Intersection-over-union of two axis-aligned boxes.
+
+    Both boxes are indexed as ``[x0, y0, x1, y1]``. ``itemRe`` values are passed
+    through ``int``; ``itemGT`` values through ``int(float(...))``.
+    Returns 0.0 when the boxes do not overlap.
+    '''
+    re_x0, re_y0, re_x1, re_y1 = [int(itemRe[k]) for k in range(4)]
+    gt_x0, gt_y0, gt_x1, gt_y1 = [int(float(itemGT[k])) for k in range(4)]
+
+    re_w, re_h = re_x1 - re_x0, re_y1 - re_y0
+    gt_w, gt_h = gt_x1 - gt_x0, gt_y1 - gt_y0
+
+    # Overlap extent = sum of the two extents minus the extent of their hull.
+    overlap_w = re_w + gt_w - (max(re_x1, gt_x1) - min(re_x0, gt_x0))
+    overlap_h = re_h + gt_h - (max(re_y1, gt_y1) - min(re_y0, gt_y0))
+
+    if overlap_w <= 0 or overlap_h <= 0:
+        return 0.0
+
+    intersection = overlap_w * overlap_h
+    union = gt_w * gt_h + re_w * re_h - intersection
+    return float(intersection) / float(union)
+
+
+# 查找连通区域 微调专用 不通用
+def get_contours(image):
+    """Collect candidate frame rectangles from the connected components of ``image``.
+
+    Written for the box fine-tuning step only; not a general-purpose routine.
+    The image is thresholded with ``custom_threshold``, opened with a (5, 1)
+    and then a (1, 5) kernel, and labelled with
+    ``cv2.connectedComponentsWithStats``. Components are returned as
+    ``(x0, y0, x1, y1)`` tuples after size and fill filtering.
+    """
+    binary = custom_threshold(image)
+    opened = open_img(binary, kera=(5, 1))
+    opened = open_img(opened, kera=(1, 5))
+    _, labels, stats, centers = cv2.connectedComponentsWithStats(opened)
+
+    img_h, img_w = image.shape[:2]
+    is_landscape = img_w > img_h
+    rects = []
+    for stat in stats:
+        left, top, box_w, box_h, area = [int(stat[k]) for k in range(5)]
+
+        # Usual frame size limits.
+        plausible_size = (img_w / 5 <= box_w <= img_w - 10) and (50 <= box_h <= img_h - 10)
+        if not plausible_size:
+            continue
+        # On multi-column (landscape) sheets a box wider than half the image
+        # is almost certainly wrong.
+        if is_landscape and box_w > img_w / 2:
+            continue
+        # A big frame enclosing smaller ones leaves a sparse hollow component; drop it.
+        if area < box_w * box_h / 3:
+            continue
+        rects.append((left, top, left + box_w, top + box_h))
+    return rects
+
+
+def adjust_alarm_info(image, box):
+    '''
+    Tighten a predicted box using projection histograms.
+
+    The top/bottom coordinates are adjusted to exclude frame lines that ended up inside
+    the box. The left/right coordinates are only adjusted in the unambiguous case:
+    scanning from each edge stops at the first non-zero column (anything more aggressive
+    produced too many false hits).
+    Background from the original author: the LSD detector could not be ported, Hough
+    line detection was unreliable, and connected components were rejected after testing
+    because of too many false hits - so projection is used instead.
+
+    image: grayscale image, NOT a binarised one
+    box  : coordinates ``[xmin, ymin, xmax, ymax]``; mutated in place and also returned
+    '''
+    # debug
+    # debug = 0
+
+    if (image is None):
+        print("error image")
+        return box
+    img_box = image[box[1]:box[3], box[0]:box[2]]
+    h, w = img_box.shape[:2]
+
+    # debug
+    # if debug: ia.imshow(img_box)
+
+    img_bin = custom_threshold(img_box, type_inv=cv2.THRESH_BINARY_INV)
+    img_padding = image_padding(img_bin, 100, 100)
+    img_close = close_img(img_padding, kera=(30, 3))
+    # Crop back to the original h x w window at a fixed offset of 50 px.
+    # NOTE(review): assumes image_padding(…, 100, 100) adds 50 px per side - confirm.
+    img_back = img_close[50:50 + h, 50:50 + w]
+
+    # debug
+    # if debug: ia.imshow(img_back)
+
+    # Per-row projection, used to find the top/bottom limits.
+    # NOTE(review): the noise floor is h / 4 although per-row counts range up to w - confirm intent.
+    hist_vert = vertical_projection(img_back, mut=h / 4)
+
+    # debug
+    # if debug:
+    #     print(hist_vert)
+    #     black_img_h = np.zeros_like(img_back)
+    #     for idx, val in enumerate(hist_vert):
+    #         if (val == 0):
+    #             continue
+    #         for x in range(val):
+    #             black_img_h[idx][x] = 255
+    #     ia.imshow(black_img_h)
+
+    y_pos = get_white_blok_pos(hist_vert, 2)
+    if (len(y_pos) == 0):
+        # No usable row run: leave the box untouched.
+        return box
+
+    # Take the longest row run as the region (first one wins on ties).
+    max_id = 0
+    max_len = 0
+    for idx, pos_tmp in enumerate(y_pos):
+        pos_len = abs(pos_tmp[1] - pos_tmp[0])
+        if (pos_len > max_len):
+            max_id = idx
+            max_len = pos_len
+
+    # debug to show
+    # if debug:
+    #     img_show = cv2.cvtColor(img_box, cv2.COLOR_GRAY2BGR)
+    #     cv2.line(img_show, (0, y_pos[max_id][0]), (w - 1, y_pos[max_id][0]), (0, 0, 255), 2)
+    #     cv2.line(img_show, (0, y_pos[max_id][1]), (w - 1, y_pos[max_id][1]), (0, 0, 255), 2)
+    #     ia.imshow(img_show)
+
+    # Left/right fine adjustment, restricted to the selected rows.
+    # Note: the slice 0:w - 1 leaves out the last column of the crop.
+    img_next = img_bin[y_pos[max_id][0]:y_pos[max_id][1], 0:w - 1]
+    img_lr_close = open_img(img_next, kera=(1, 1))
+    img_lr_close = close_img(img_lr_close, kera=(3, 1))
+
+    # debug
+    # if debug: ia.imshow(img_lr_close)
+
+    hist_proj = horizontal_projection(img_lr_close, mut=1)
+    w_len = len(hist_proj)
+    # Defaults cover the case where the histogram is zero everywhere.
+    new_left = 0
+    new_right = w_len - 1
+    # b_flag[0] / b_flag[1]: left / right edge already found.
+    b_flag = [0, 0]
+    for idx, val in enumerate(hist_proj):
+        # Scan from the left for the first non-zero entry ...
+        if (0 == b_flag[0]):
+            if (val != 0):
+                new_left = idx
+                b_flag[0] = 1
+        # ... and simultaneously from the right via the mirrored index.
+        if (0 == b_flag[1]):
+            if (hist_proj[w_len - 1 - idx] != 0):
+                new_right = w_len - idx - 1
+                b_flag[1] = 1
+        if (b_flag[0] and b_flag[1]):
+            break
+
+    # Convert crop-relative offsets back to image coordinates and write them into box.
+    new_top = box[1] + y_pos[max_id][0]
+    new_bottom = box[1] + y_pos[max_id][1]
+    new_left += box[0]
+    new_right += box[0]
+    box[1] = new_top
+    box[3] = new_bottom
+    box[0] = new_left
+    box[2] = new_right
+
+    return box
+
+
+def adjust_zg_info(image, box, cv_boxes):
+    '''
+    调整大区域的box
+    1、cvbox要与box纵坐标有交叉
+    2、IOU值大于0。8时 默认相等拷贝区域坐标
+    '''
+    if (image is None):
+        return box
+
+    min_rotio = 0.5
+    img_box = image[box[1]:box[3], box[0]:box[2]]
+    h, w = img_box.shape[:2]
+
+    jc_boxes = []  # 记录与box存在交叉的 cv_boxes
+    tmp_rotio = 0
+    rc_mz = box
+    for idx, cv_box in enumerate(cv_boxes):
+        if ((box[1] - 10) > (cv_box[3])):  # 首先要保证纵坐标有交叉
+            continue
+        if ((box[3] + 10) < cv_box[1]):
+            continue
+
+        jc_x = max(box[0], cv_box[0])
+        jc_y = min(box[2], cv_box[2])
+        bj_x = min(box[0], cv_box[0])
+        bj_y = max(box[2], cv_box[2])
+
+        rt = abs(jc_y - jc_x) * 1.0 / abs(bj_y - bj_x) * 1.0
+        if (rt < min_rotio):
+            continue
+        jc_boxes.append(cv_box)
+        if (rt > tmp_rotio):
+            rc_mz = cv_box
+            tmp_rotio = rt
+    # 判断 调整
+    if (len(jc_boxes) != 0):
+        box[0] = rc_mz[0]
+        box[2] = rc_mz[2]
+        b_find = 0
+        frotio = 0.0
+        rc_biggst = rc_mz
+        for mz_box in jc_boxes:
+            iou = get_decide_boberLpa(mz_box, box)
+            if (iou > 0.8):
+                b_find = 1
+                frotio = iou
+                rc_biggst = mz_box
+        if (b_find):
+            box[1] = rc_biggst[1]
+            box[3] = rc_biggst[3]
+    return box
+
+
+def adjust_item_edge(img_path, reback_json):
+    '''
+    根据图像的CV分析结果和 模型直接输出结果 对模型输出的边框做微调
+    1、外接矩形查找
+    2、LSD直线检测 替换方法 霍夫曼直线检测
+    3、只处理有把握的情况 任何含有不确定因素的一律不作任何处理
+    img_path: 待处理图像绝对路径
+    re_json : 模型输出结果
+    '''
+    debug = 1
+    # 存放新的结果
+    re_json = copy.deepcopy(reback_json)
+    if (not os.path.exists(img_path) or 0 == len(re_json)):
+        return
+    image = cv2.imread(img_path, 0)
+    # 获取CV连通区域结果
+    cv_boxes = get_contours(image)
+
+    if debug:
+        print(len(cv_boxes))
+        image_draw = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+        # for item in cv_boxes:
+        #     cv2.rectangle(image_draw, (item[0], item[1]), (item[2], item[3]), (0, 0, 250), 2)
+        # cv2.imwrite(os.path.join(file_dir, "show.jpg"), image_draw)
+    # 循环处理指定的box
+    for idx, item in enumerate(re_json):
+        name = item["class_name"]
+        box = [item["bounding_box"]["xmin"], item["bounding_box"]["ymin"], item["bounding_box"]["xmax"],
+               item["bounding_box"]["ymax"]]
+        # print(name ,box)
+        if (name == "alarm_info" or name == "page" or name == "type_score"):
+            if debug:
+                cv2.rectangle(image_draw, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
+            new_box = adjust_alarm_info(image, box)
+            if debug:
+                cv2.rectangle(image_draw, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 2)
+            item["bounding_box"]["xmin"] = box[0]
+            item["bounding_box"]["xmax"] = box[2]
+            item["bounding_box"]["ymin"] = box[1]
+            item["bounding_box"]["ymax"] = box[3]
+        elif (name == "solve" or name == "solve0"
+              or name == "cloze" or name == "choice"
+              or name == "composition" or name == "composition0"):
+            if debug:
+                cv2.rectangle(image_draw, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
+            new_box = adjust_zg_info(image, box, cv_boxes)
+            if debug:
+                cv2.rectangle(image_draw, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 2)
+            item["bounding_box"]["xmin"] = box[0]
+            item["bounding_box"]["xmax"] = box[2]
+            item["bounding_box"]["ymin"] = box[1]
+            item["bounding_box"]["ymax"] = box[3]
+        else:
+            pass
+    if debug:
+        cv2.imwrite(os.path.join(r"E:\data\aug_img\adjust", "show.jpg"), image_draw)
+    return re_json
+
+
+def adjust_item_edge_by_gray_image(image, reback_json):
+    '''
+    根据图像的CV分析结果和 模型直接输出结果 对模型输出的边框做微调
+    1、外接矩形查找
+    2、LSD直线检测 替换方法 霍夫曼直线检测
+    3、只处理有把握的情况 任何含有不确定因素的一律不作任何处理
+    img_path: 待处理图像绝对路径
+    re_json : 模型输出结果
+    '''
+    debug = 0
+    re_json = copy.deepcopy(reback_json)
+    # 存放新的结果
+    # 获取CV连通区域结果
+    if len(image.shape) > 2:
+        image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+    cv_boxes = get_contours(image)
+
+    if debug:
+        print(len(cv_boxes))
+        image_draw = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+        # for item in cv_boxes:
+        #     cv2.rectangle(image_draw, (item[0], item[1]), (item[2], item[3]), (0, 0, 250), 2)
+        # cv2.imwrite(os.path.join(file_dir, "show.jpg"), image_draw)
+    # 循环处理指定的box
+    for idx, item in enumerate(re_json):
+        name = item["class_name"]
+        box = [item["bounding_box"]["xmin"], item["bounding_box"]["ymin"], item["bounding_box"]["xmax"],
+               item["bounding_box"]["ymax"]]
+        # print(name ,box)
+        if (name == "alarm_info" or name == "page" or name == "type_score"):
+            if debug:
+                cv2.rectangle(image_draw, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
+            new_box = adjust_alarm_info(image, box)
+            if debug:
+                cv2.rectangle(image_draw, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 2)
+            item["bounding_box"]["xmin"] = box[0]
+            item["bounding_box"]["xmax"] = box[2]
+            item["bounding_box"]["ymin"] = box[1]
+            item["bounding_box"]["ymax"] = box[3]
+        elif (name == "solve" or name == "solve0"
+              or name == "cloze" or name == "choice"
+              or name == "composition" or name == "composition0"):
+            if debug:
+                cv2.rectangle(image_draw, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
+            new_box = adjust_zg_info(image, box, cv_boxes)
+            if debug:
+                cv2.rectangle(image_draw, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 2)
+            item["bounding_box"]["xmin"] = box[0]
+            item["bounding_box"]["xmax"] = box[2]
+            item["bounding_box"]["ymin"] = box[1]
+            item["bounding_box"]["ymax"] = box[3]
+        else:
+            pass
+    if debug:
+        cv2.imwrite(os.path.join(r"E:\data\aug_img\adjust", "show.jpg"), image_draw)
+    return re_json
+
+# if __name__ == '__main__':
+#     '''服务端传入数据为json内数据 和图像
+#     使用方法:
+#     new_json = adjust_item_edge(img_path, key_json)
+#     key_json : regions 数组
+#     new_json : 调整后的结果 size == key_json.size
+#     '''
+#
+#     print("前置解析")
+#     file_dir = r"E:\data\aug_img\adjust"
+#     img_path = os.path.join(file_dir, "7642572.jpg")
+#     json_path = os.path.join(file_dir, "7642572.json")
+#     print(img_path, json_path)
+#     # 读取json
+#     output_ios = open(json_path).read()
+#     output_json = json.loads(output_ios)
+#     for item in output_json:
+#         # print(item,output_json[item])
+#         if (item == "regions"):
+#             key_json = output_json[item]
+#     # print(len(key_json))
+#     for idx, item in enumerate(key_json):
+#         # print(key_json[idx])
+#         if (item["class_name"] == "alarm_info"):
+#             key_json[idx]["bounding_box"]["ymin"] -= 10
+#             key_json[idx]["bounding_box"]["ymax"] += 10
+#         # print(key_json[idx])
+#
+#     new_json = adjust_item_edge(img_path, key_json)
+#     for idx, val in enumerate(key_json):
+#         print(key_json[idx])
+#         print(new_json[idx])

+ 1192 - 0
segment/sheet_resolve/analysis/sheet/sheet_infer.py

@@ -0,0 +1,1192 @@
+# @Author  : lightXu
+# @File    : sheet_infer.py
+# @Time    : 2019/9/26 0026 上午 10:18
+import itertools
+import os
+import re
+import traceback
+import xml.etree.cElementTree as ET
+from itertools import combinations
+
+import cv2
+import numpy as np
+from shapely.geometry import LineString, Polygon
+
+from segment.sheet_resolve.tools.utils import create_xml, crop_region_direct, crop_region, image_hash_detection_simple
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate
+
+# Module-level tuning constants. Their consumers lie outside this excerpt, so the exact
+# meaning and units of each value should be confirmed at the use sites (TODO).
+ASPECT_FLAG = 4.0
+REMAIN_RATIO = 0.1
+PIX_VALUE_LOW = 15.0
+PIX_VALUE_HIGH = 245
+TYPE_SCORE_MNS = 0.5
+
+
+def _get_char_near_img(char_location, near):
+    left = char_location['left']
+    top = char_location['top']
+    width = char_location['width']
+    height = char_location['height']
+
+    next_location = char_location
+
+    if near == 'left':
+        next_location = {'left': int(left - 1.5 * width), 'top': top, 'width': width, 'height': height}
+    if near == 'right':
+        next_location = {'left': int(left + 1.5 * width), 'top': top, 'width': width, 'height': height}
+    if near == 'up':
+        next_location = {'left': left, 'top': int(top - 1.5 * height), 'width': width, 'height': height}
+    if near == 'down':
+        next_location = {'left': left, 'top': int(top + 1.5 * height), 'width': width, 'height': height}
+
+    return next_location
+
+
+def _get_board(image, location, direction):
+    std = 0
+    next_location = location
+    while std < 10:
+        next_location = _get_char_near_img(next_location, direction)
+        box = (next_location['left'], next_location['top'],
+               next_location['left'] + next_location['width'],
+               next_location['top'] + next_location['height'],)
+        region = crop_region_direct(image, box)
+        std = np.var(region)
+
+    return next_location
+
+
+def infer_bar_code(image, ocr_dict_list, attention_region):
+    attention_polygon_list = []
+    for attention in attention_region:
+        coordinates = attention['bounding_box']
+        xmin = coordinates['xmin']
+        ymin = coordinates['ymin']
+        xmax = coordinates['xmax']
+        ymax = coordinates['ymax']
+        attention_polygon = Polygon([(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)])
+        attention_polygon_list.append(attention_polygon)
+
+    img_cols, img_rows = image.shape[0], image.shape[1]
+    pattern = r'条形码|条码|条形|形码'
+    bar_code_dict_list = []
+
+    for index, ele in enumerate(ocr_dict_list):
+        words = ele['words'].replace(' ', '')
+        chars_list = ele['chars']
+        length = len(chars_list)
+        match_list = [(m.group(), m.span()) for m in re.finditer(pattern, words) if m]
+        if match_list:  # 不为空
+            for match in match_list:
+                start_index = match[1][0]
+                end_index = match[1][1] - 1
+                for i in range(start_index - 1, -1, -1):
+                    xmin_start = chars_list[start_index]['location']['left']
+                    start_tmp = chars_list[i]['location']['left'] + 2 * chars_list[i]['location']['width']
+                    if xmin_start <= start_tmp:
+                        start_index = i
+
+                for i in range(end_index, length):
+                    xmax_end = chars_list[end_index]['location']['left'] + 2 * chars_list[i]['location']['width']
+                    end_tmp = chars_list[i]['location']['left']
+                    if xmax_end >= end_tmp:
+                        end_index = i
+
+                bar_code_char_xmin = chars_list[start_index]['location']["left"]
+                bar_code_char_xmax = chars_list[end_index]['location']["left"]+chars_list[end_index]['location']["width"]
+                bar_code_char_ymin = chars_list[start_index]['location']["top"]
+                bar_code_char_ymax = chars_list[end_index]['location']["top"]+chars_list[end_index]['location']["height"]
+                bar_code_char_polygon = Polygon([(bar_code_char_xmin, bar_code_char_ymin),
+                                                 (bar_code_char_xmax, bar_code_char_ymin),
+                                                 (bar_code_char_xmax, bar_code_char_ymax),
+                                                 (bar_code_char_xmin, bar_code_char_ymax)])
+
+                contain_cond = [False]*len(attention_polygon_list)
+                for i, attention_ele in enumerate(attention_polygon_list):
+                    if attention_ele.contains(bar_code_char_polygon):
+                        contain_cond[i] = True
+
+                if True not in contain_cond:  # 条形码文字不在attention里面
+                    left_board_location = _get_board(image, chars_list[start_index]['location'], 'left')
+                    right_board_location = _get_board(image, chars_list[end_index]['location'], 'right')
+                    up_board_location = _get_board(image, chars_list[start_index]['location'], 'up')
+                    down_board_location = _get_board(image, chars_list[end_index]['location'], 'down')
+
+                    xmin = left_board_location['left']
+                    ymin = up_board_location['top']
+                    xmax = right_board_location['left'] + right_board_location['width']
+                    ymax = down_board_location['top'] + down_board_location['height']
+
+                    xmin = int(xmin) if xmin >= 1 else 1
+                    ymin = int(ymin) if ymin >= 1 else 1
+                    xmax = int(xmax) if xmax <= img_cols - 1 else img_cols - 1
+                    ymax = int(ymax) if ymax <= img_rows - 1 else img_rows - 1
+
+                    bar_code_dict = {'class_name': 'bar_code',
+                                     'bounding_box': {'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax}}
+                    bar_code_dict_list.append(bar_code_dict)
+                    # print(bar_code_dict)
+                    break  # 默认只有一个条形码
+                else:
+                    continue
+
+    # 过滤attention 区域存在条形码的文字
+    for bar_code in bar_code_dict_list.copy():
+        coordinates = bar_code['bounding_box']
+        xmin = coordinates['xmin']
+        ymin = coordinates['ymin']
+        xmax = coordinates['xmax']
+        ymax = coordinates['ymax']
+        bar_code_polygon = Polygon([(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)])
+        for attention_polygon in attention_polygon_list:
+            cond1 = bar_code_polygon.within(attention_polygon) or bar_code_polygon.contains(attention_polygon)
+            cond2 = False
+            cond3 = bar_code_polygon.overlaps(attention_polygon)
+            if cond3:
+                intersection_poly = bar_code_polygon.intersection(attention_polygon)
+                cond2 = intersection_poly.area / bar_code_polygon.area >= 0.01
+                cond3 = intersection_poly.area / attention_polygon.area >= 0.01
+            if cond1 or cond2 or cond3:
+                bar_code_dict_list.remove(bar_code)
+                break
+
+    return bar_code_dict_list
+
+
+def infer_exam_number(image, ocr_dict_list, existed_regions, times_threshold=5):
+    # existed_polygon_list = []
+    # for region in existed_regions:
+    #     coordinates = region['bounding_box']
+    #     xmin = coordinates['xmin']
+    #     ymin = coordinates['ymin']
+    #     xmax = coordinates['xmax']
+    #     ymax = coordinates['ymax']
+    #     existed_polygon = Polygon([(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)])
+    #     existed_polygon_list.append(existed_polygon)
+
+    img_rows, img_cols = image.shape[0], image.shape[1]
+    exam_number_dict_list = []
+    xmin, ymin, xmax, ymax = 9999, 9999, 0, 0
+    pattern = r'[0oO]|[2-9]'  # 除去1,避免[]被识别为1
+    exclude = r'分|题|[ABD]'
+    key_digital = []
+    all_height = []
+    cols = []
+    for index, ele in enumerate(ocr_dict_list):
+        words = ele['words'].replace(' ', '')
+        match_list = [(m.group(), m.span()) for m in re.finditer(pattern, words) if m]
+        exclude_list = [(m.group(), m.span()) for m in re.finditer(exclude, words, re.I) if m]
+        match_digital_arr = np.asarray([int(char[0].replace('o', '0').replace('O', '0')) for char in match_list])
+
+        if len(match_digital_arr) > 0:
+            counts = np.bincount(match_digital_arr)
+            mode_times = np.max(counts)
+            if mode_times >= times_threshold and len(exclude_list) < 1:
+                mode_value = np.argmax(counts)  # 众数,避免考号末尾出现的其他数字
+                key_index = np.where(match_digital_arr == mode_value)[0]
+                cols.append(len(key_index))
+                start_index = match_list[key_index[0]][1][0]
+                end_index = match_list[key_index[-1]][1][0]
+
+                xmin_t = ele['chars'][start_index]['location']['left']
+                ymin_t = ele['chars'][start_index]['location']['top']
+                xmax_t = ele['chars'][end_index]['location']['left'] + ele['chars'][end_index]['location']['width']
+                ymax_t = ele['chars'][end_index]['location']['top'] + ele['chars'][end_index]['location']['height']
+
+                mean_width = sum([int(ele['chars'][match_list[i][1][0]]['location']['width'])
+                                  for i in key_index]) // len(key_index)
+
+                mean_height = sum([int(ele['chars'][match_list[i][1][0]]['location']['height'])
+                                  for i in key_index]) // len(key_index)
+
+                all_height.append(mean_height)
+
+                xmin = min(xmin, xmin_t-mean_width)
+                ymin = min(ymin, ymin_t)
+                xmax = max(xmax, xmax_t+mean_width)
+                ymax = max(ymax, ymax_t)
+
+                xmin = int(xmin) if xmin >= 1 else 1
+                ymin = int(ymin) if ymin >= 1 else 1
+                xmax = int(xmax) if xmax <= img_cols - 1 else img_cols - 1
+                ymax = int(ymax) if ymax <= img_rows - 1 else img_rows - 1
+
+                key_digital.append(mode_value)
+                if 9 in key_digital:
+                    break
+
+    if 0 in key_digital and 9 in key_digital:
+        mean_height = sum(all_height)//10
+        exam_number_dict = {'class_name': 'exam_number',
+                            'bounding_box': {'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax+mean_height},
+                            'rows': 10,
+                            'cols': max(cols)
+                            }
+        exam_number_dict_list.append(exam_number_dict)
+
+        return exam_number_dict_list
+    else:
+        if len(key_digital) > 1:
+            dgt_min = min(key_digital)
+            dgt_max = max(key_digital)
+            mean_height = sum(all_height)//len(all_height)
+            dif = dgt_max - dgt_min
+            blank_height = ymax - ymin - mean_height * (dif+1)
+            mean_blank = blank_height // dif
+
+            upper_height = dgt_min * (mean_blank + mean_height) + mean_blank//2
+            downward_height = (9-dgt_max) * (mean_blank + mean_height) + mean_blank
+            exam_number_dict = {'class_name': 'exam_number',
+                                'bounding_box': {'xmin': xmin, 'ymin': ymin-upper_height,
+                                                 'xmax': xmax, 'ymax': ymax+downward_height},
+                                'rows': 10,
+                                'cols': max(cols)}
+            exam_number_dict_list.append(exam_number_dict)
+
+        if len(key_digital) == 1:
+            dgt_min = dgt_max = min(key_digital)
+            eval_height = sum(all_height)//len(all_height) * 1.5
+
+            upper_height = dgt_min * eval_height
+            downward_height = (9-dgt_max) * eval_height
+            exam_number_dict = {'class_name': 'exam_number',
+                                'bounding_box': {'xmin': xmin, 'ymin': ymin-upper_height,
+                                                 'xmax': xmax, 'ymax': ymax+downward_height},
+                                'rows': 10,
+                                'cols': max(cols)}
+            exam_number_dict_list.append(exam_number_dict)
+
+        iou_cond = True
+        exam_number_dict_list_check = []
+        for exam_number_dict in exam_number_dict_list:
+            exam_number_polygon = Polygon([(exam_number_dict["xmin"], exam_number_dict["ymin"]),
+                                           (exam_number_dict["xmax"], exam_number_dict["ymin"]),
+                                           (exam_number_dict["xmax"], exam_number_dict["ymax"]),
+                                           (exam_number_dict["xmin"], exam_number_dict["ymax"])])
+            for region in existed_regions:
+                class_name = region["class_name"]
+
+                if class_name in ["attention", "solve", "choice", "choice_m", 'choice_s', "cloze", 'cloze_s',
+                                  'bar_code', 'qr_code', 'composition', 'solve0']:
+                    coordinates = region['bounding_box']
+                    xmin = coordinates['xmin']
+                    ymin = coordinates['ymin']
+                    xmax = coordinates['xmax']
+                    ymax = coordinates['ymax']
+                    existed_polygon = Polygon([(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)])
+                    overlab_area = existed_polygon.intersection(exam_number_polygon).area
+                    iou = overlab_area / (exam_number_polygon.area + existed_polygon.area - overlab_area)
+                    if iou > 0:
+                        iou_cond = False
+                        break
+            if iou_cond:
+                exam_number_dict_list_check.append(exam_number_polygon)
+
+        return exam_number_dict_list_check
+
+
+def adjust_exam_number(regions):
+    exam_number_w_regions = list()
+    exam_number_regions = list()
+    for i in range(len(regions) - 1, -1, -1):
+        region = regions[i]
+        if region['class_name'] == 'exam_number_w':
+            exam_number_w_regions.append(region)
+        if region['class_name'] == 'exam_number':
+            exam_number_regions.append(region)
+            regions.pop(i)
+
+    exam_number_region = exam_number_regions[0]
+    if len(exam_number_regions) > 1:
+        exam_number_regions = sorted(exam_number_regions, key=lambda x: x['bounding_box']['ymin'])
+        exam_number_region = exam_number_regions[0]
+
+    exam_number_w_index = 0
+    if len(exam_number_w_regions) > 1:
+        distance = [abs(int(ele['bounding_box']['ymax']) - int(exam_number_region['bounding_box']['ymin ']))
+                    for ele in exam_number_w_regions]
+        exam_number_w_index = distance.index(min(distance))
+    exam_number_w_region = exam_number_w_regions[exam_number_w_index]
+
+    standard = exam_number_w_region['bounding_box']
+    exam_number_region['bounding_box'].update({'xmin': standard['xmin'], 'xmax': standard['xmax']})
+    regions.append(exam_number_region)
+
+    return regions
+
+
+def exam_number_infer_by_s(image, regions):
+    exam_number_s_list = [ele for ele in regions if ele['class_name'] == 'exam_number_s'
+                          and (int(ele['bounding_box']['xmax'])-int(ele['bounding_box']['xmin']) <
+                               int(ele['bounding_box']['ymax'])-int(ele['bounding_box']['ymin']))]
+    # 找边界
+    exam_number_s_list = sorted(exam_number_s_list, key=lambda x: x['bounding_box']['xmin'])
+
+    left_limit = exam_number_s_list[0]['bounding_box']['xmin']
+    right_limit = exam_number_s_list[-1]['bounding_box']['xmax']
+
+    left_image = crop_region(image, exam_number_s_list[0]['bounding_box'])
+    right_image = crop_region(image, exam_number_s_list[-1]['bounding_box'])
+
+    mean_width = sum([int(ele['bounding_box']['xmax'])-int(ele['bounding_box']['xmin'])
+                      for ele in exam_number_s_list]) // len(exam_number_s_list)
+    top_limit = min([ele['bounding_box']['ymin'] for ele in exam_number_s_list])
+    bottom_limit = max([ele['bounding_box']['ymax'] for ele in exam_number_s_list])
+
+    left_infer = True
+    while left_infer:
+        infer_box_xmin = int(left_limit - 1.5*mean_width)
+        infer_box_xmax = int(left_limit - 0.5*mean_width)
+        infer_box_ymin = int(exam_number_s_list[0]['bounding_box']['ymin'])
+        infer_box_ymax = int(exam_number_s_list[0]['bounding_box']['ymax'])
+
+        infer_image = crop_region_direct(image, [infer_box_xmin, infer_box_ymin, infer_box_xmax, infer_box_ymax])
+
+        simi = image_hash_detection_simple(left_image, infer_image)
+        print('l:', simi)
+        if simi >= 0.85:
+            left_limit = infer_box_xmin
+        else:
+            left_infer = False
+
+    right_infer = True
+    while right_infer:
+        infer_box_xmin = int(right_limit + 0.5 * mean_width)
+        infer_box_xmax = int(right_limit + 1.5 * mean_width)
+        infer_box_ymin = int(exam_number_s_list[-1]['bounding_box']['ymin'])
+        infer_box_ymax = int(exam_number_s_list[-1]['bounding_box']['ymax'])
+
+        infer_image = crop_region_direct(image, [infer_box_xmin, infer_box_ymin, infer_box_xmax, infer_box_ymax])
+
+        simi = image_hash_detection_simple(right_image, infer_image)
+        print('r:', simi)
+        if simi >= 0.70:
+            right_limit = infer_box_xmax
+        else:
+            right_infer = False
+
+    infer_exam_number_region = {'xmin': left_limit, 'xmax': right_limit, 'ymin': top_limit, 'ymax': bottom_limit, }
+    exam_dict_list = [{'class_name': 'exam_number', 'bounding_box': infer_exam_number_region}]
+    # print(exam_dict_list)
+    return exam_dict_list
+
+
+def gen_xml_new(path, ocr_list):
+    tree = ET.parse(r'../../tools/000000-template.xml')  # xml tree
+    for index, ele in enumerate(ocr_list):
+        words = ele['words']
+        location = ele['location']
+        xmin = location['xmin']
+        ymin = location['ymin']
+        xmax = location['xmax']
+        ymax = location['ymax']
+
+        tree = create_xml('{}'.format(words), tree, str(xmin), str(ymin), str(xmax), str(ymax))
+        # print(exam_items_bbox)
+    tree.write(path.replace('.jpg', '.xml'))
+
+
+def subfield_answer_sheet(img0, answer_sheet):
+    h, w = img0.shape[:2]
+    one_part = 0
+    line_xmax_1 = 0
+    line_xmax_2 = 0
+    modules = []
+    modules11 = []
+    w_int_1 = w
+    w_int_2 = round(w / 2)
+    w_int_3 = round(w / 3)
+    w_int_4 = round(w / 4)
+    w_int_8 = round(w / 8)
+    if w_int_8 < 50:
+        w_int_8 = 50
+
+    key_modules_classes = ['choice', 'cloze', 'solve', 'solve0', 'composition0', 'composition', 'correction',
+                           'ban_area', ]
+    if h > w:  # 暂定答题卡高大于宽的为单栏
+        one_part = 1
+    else:
+        temp1 = 0
+        temp2 = 0
+        for ele in answer_sheet:
+            if ele["class_name"] in key_modules_classes:
+                modules.append(ele)
+        modules_xmin = sorted(modules, key=lambda x: (x['bounding_box']['xmin']))
+        modules_xmax = sorted(modules, key=lambda x: (x['bounding_box']['xmax']))
+        for i in range(len(modules_xmin) - 1):
+            if i == 0 and modules_xmin[0]['bounding_box']['xmin'] - 0 > w_int_4:
+                temp1 = 1
+            else:
+                if modules_xmin[i + 1]['bounding_box']['xmin'] - modules_xmin[i]['bounding_box']['xmax'] > w_int_4:
+                    if modules11 == []:
+                        line_xmax_1 = modules_xmin[i]['bounding_box']['xmax'] + 20
+                        line_xmax_2 = modules_xmin[i + 1]['bounding_box']['xmin'] - 20
+                    else:
+                        modules11.append(modules_xmin[i]['bounding_box']['xmax'])
+                        modules11_xmax = sorted(modules11)[-1]
+                        line_xmax_1 = modules11_xmax + 20
+                        line_xmax_2 = modules_xmin[i + 1]['bounding_box']['xmin'] - 20
+                        modules11 = []
+                    temp1 = 1
+                    temp2 = 1
+                    break
+                elif modules_xmin[i + 1]['bounding_box']['xmin'] - modules_xmin[i]['bounding_box']['xmax'] > -w_int_8:
+                    if temp1 == 0:
+                        if modules11 == []:
+                            line_xmax_1 = int((modules_xmin[i + 1]['bounding_box']['xmin'] +
+                                               modules_xmin[i]['bounding_box']['xmax']) / 2)
+                        else:
+                            modules11.append(modules_xmin[i]['bounding_box']['xmax'])
+                            modules11_xmax = sorted(modules11)[-1]
+                            line_xmax_1 = int((modules_xmin[i + 1]['bounding_box']['xmin'] +
+                                               modules11_xmax) / 2)
+                            modules11 = []
+                        temp1 = 1
+                    elif temp1 == 1:
+                        if modules11 == []:
+                            line_xmax_2 = int((modules_xmin[i + 1]['bounding_box']['xmin'] +
+                                               modules_xmin[i]['bounding_box']['xmax']) / 2)
+                        else:
+                            modules11.append(modules_xmin[i]['bounding_box']['xmax'])
+                            modules11_xmax = sorted(modules11)[-1]
+                            line_xmax_2 = int((modules_xmin[i + 1]['bounding_box']['xmin'] +
+                                               modules11_xmax) / 2)
+                        temp2 = 1
+                else:
+                    modules11.append(modules_xmin[i]['bounding_box']['xmax'])
+
+        if temp1 == 0 and temp2 == 0:
+            if modules_xmax[-1]['bounding_box']['xmax'] - w < -(2 * w_int_4):
+                line_xmax_1 = modules_xmax[-1]['bounding_box']['xmax'] + 20
+                line_xmax_2 = 2 * w_int_3
+            elif modules_xmax[-1]['bounding_box']['xmax'] - w < -w_int_4:
+                line_xmax_1 = modules_xmax[-1]['bounding_box']['xmax'] + 20
+        elif temp1 == 1 and temp2 == 0:
+            if modules_xmax[-1]['bounding_box']['xmax'] - w < -w_int_4:
+                line_xmax_2 = 2 * w_int_3
+
+    return line_xmax_1, line_xmax_2
+
+
+def get_intersection_point(lines, orthogonal_lines, border):
+    intersect_point_list = []
+    for line in lines:
+        width_min, height_min, width_max, height_max = border
+        (x_l, y_u), (x_r, y_d) = line.coords
+
+        x_l = x_l if x_l > width_min else width_min + 1  # 避免边界
+        x_r = x_r if x_r < width_max else width_max - 1
+        y_u = y_u if y_u > height_min else height_min + 1
+        y_d = y_d if y_d < height_max else height_max - 1
+
+        points_list = []
+        if x_l == x_r:
+            line_direction = 'lon'
+            raw_line = LineString([(x_l, y_u), (x_r, y_d)])
+            extend_line = LineString([(x_l, height_min), (x_r, height_max)])
+            points_list.extend([height_min + 1, height_max - 1])  # 延长线与边界交点,并避免key_point位于现有边界上
+            line_start, line_end = y_u, y_d
+        else:
+            line_direction = 'lat'
+            raw_line = LineString([(x_l, y_u), (x_r, y_d)])
+            extend_line = LineString([(width_min, y_u), (width_max, y_d)])
+            points_list.extend([width_min + 1, width_max - 1])  # 延长线与边界交点,并避免key_point位于现有边界上
+            line_start, line_end = x_l, x_r
+
+        for ele in orthogonal_lines:
+            cond1 = extend_line.intersects(ele)  # T, L, 十交叉
+            cond2 = extend_line.crosses(ele)  # 十字交叉
+            cond3 = raw_line.intersects(ele)
+            cond4 = raw_line.crosses(ele)
+            if line_direction == 'lat':
+                if cond3:
+                    (xp, yp) = raw_line.intersection(ele).bounds[:2]
+                    intersect_point_list.append((xp, yp))
+                elif cond1:
+                    (xp, yp) = extend_line.intersection(ele).bounds[:2]
+                    points_list.append(xp)
+
+            if line_direction == 'lon':
+                if cond3:
+                    (xp, yp) = raw_line.intersection(ele).bounds[:2]
+                    intersect_point_list.append((xp, yp))
+                elif cond1:
+                    (xp, yp) = extend_line.intersection(ele).bounds[:2]
+                    points_list.append(yp)
+
+        points_array = np.asarray(points_list, dtype=np.uint)
+
+        left_key = np.max(points_array[points_array <= line_start])
+        right_key = np.min(points_array[points_array >= line_end])  # 延长线两边延长并取得第一个交点
+
+        if line_direction == 'lat':
+            intersect_point = [(left_key, y_u), (right_key, y_d)]
+        else:
+            intersect_point = [(x_l, left_key), (x_r, right_key)]
+
+        # print(intersect_point)
+        intersect_point_list.extend(intersect_point)
+
+    return intersect_point_list
+
+
+def infer_sheet_box(image, sheet_dict, lon_split_line, exclude_classes):
+    height_max, width_max = image.shape[0], image.shape[1]
+    height_min, width_min = 0, 0
+    latitude = []
+    longitude = []
+    lines = []
+    sheet_polygons = []
+    all_sheet_polygons = []
+    choice_polygon = []
+    # exclude_classes = ['cloze_s', 'exam_number_s', 'choice_s', 'type_score',
+    #                    'mark', 'page', 'exam_number_s', 'cloze_score', 'name_w',
+    #                    'class_w',]
+
+    h_min = []
+    h_max = []
+    for index, region_box in enumerate(sheet_dict):
+        coordinates = region_box['bounding_box']
+        xmin = coordinates['xmin']
+        ymin = coordinates['ymin']
+        xmax = coordinates['xmax']
+        ymax = coordinates['ymax']
+
+        if region_box['class_name'] == 'info_title':  # 上限
+            h_min.append(ymin)
+        if region_box['class_name'] == 'page':  # 下限
+            h_max.append(ymin)
+        if region_box['class_name'] == 'alarm_info':
+            h_min.append(ymin)
+            h_max.append(ymin)
+
+    if h_min:
+        hgt_min = min(h_min)
+        if hgt_min < height_max / 4:
+            height_min = hgt_min
+    if h_max:
+        hgt_max = max(h_max)
+        if hgt_max > 3 * height_max / 4:
+            height_max = hgt_max
+
+    # height_min = h_min if h_min != 9999 else height_min
+    # height_max = h_max if h_max != 0 else height_max
+    for index, region_box in enumerate(sheet_dict):
+        coordinates = region_box['bounding_box']
+        xmin = coordinates['xmin']
+        ymin = coordinates['ymin']
+        xmax = coordinates['xmax']
+        ymax = coordinates['ymax']
+        box_polygon = Polygon([(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)])
+
+        if region_box['class_name'] not in exclude_classes:
+            if region_box['class_name'] not in ['choice', 'cloze']:  # 推断选择题区域内的choice_m
+                sheet_polygons.append(box_polygon)
+            if region_box['class_name'] == 'choice':
+                choice_polygon.append(box_polygon)
+            all_sheet_polygons.append(box_polygon)
+            line1 = LineString([(xmin, ymin), (xmin, ymax)])
+            line2 = LineString([(xmax, ymin), (xmax, ymax)])
+            line3 = LineString([(xmin, ymin), (xmax, ymin)])
+            line4 = LineString([(xmin, ymax), (xmax, ymax)])
+
+            lines.extend([line1, line2, line3, line4])
+
+            longitude.extend([line1, line2])
+            latitude.extend([line3, line4])
+
+    # sheet_polygons 去除包裹的情况
+    sheet_polygons_ = list(combinations(sheet_polygons, 2))
+    for polygons in sheet_polygons_:
+        if polygons[0].within(polygons[1]) or polygons[0].contains(polygons[1]):
+            area_list = [polygons[0].area, polygons[1].area]
+            min_polygon = polygons[area_list.index(min(area_list))]
+            if min_polygon in sheet_polygons:
+                sheet_polygons.remove(min_polygon)
+
+    min_polygon = sorted(all_sheet_polygons, key=lambda p: p.area)[0]
+    avg_area = sum([polygon.area for polygon in sheet_polygons]) / len(sheet_polygons)
+
+    # 所有矩形框的延长线与矩形框集图像边界的交点
+    latitude = sorted(latitude, key=lambda x: x.bounds[1])  # y
+    longitude = sorted(longitude, key=lambda x: x.bounds[0])  # x
+
+    lat_intersect_point_list = get_intersection_point(latitude, longitude,
+                                                      (width_min, height_min, width_max, height_max))
+    lon_intersect_point_list = get_intersection_point(longitude, latitude,
+                                                      (width_min, height_min, width_max, height_max))
+
+    raw_corner = [(width_min + 1, height_min + 1), (width_min + 1, height_max - 1), (width_max - 1, 1),
+                  (width_max - 1, height_max - 1)]
+    # raw_corner = []
+    intersect_point_list = lat_intersect_point_list + lon_intersect_point_list + raw_corner
+    intersect_point_list = list(set(intersect_point_list))
+    intersect_point_dict = {k: index + 1 for index, k in enumerate(intersect_point_list)}
+
+    def _filter_rect(p_list):
+        flag = 0
+        for ele in p_list:
+            try:
+                flag = intersect_point_dict[ele]
+            except KeyError:
+                flag = 0
+                break
+        if flag > 0:
+            x_c = sum([ele[0] for ele in p_list]) / 4
+            y_c = sum([ele[1] for ele in p_list]) / 4
+            d1, d2, d3, d4 = [LineString([p, (x_c, y_c)]).length for p in p_list]
+            return (0 not in [d1, d2, d3, d4]) and d1 == d2 and d1 == d3 and d1 == d4
+        else:
+            return False
+
+    def _find_rect(point):
+        (x1, y1) = point[0]
+        (x2, y2) = point[1]
+        if x1 != x2 and y1 != y2:
+            xmin, ymin = min(x1, x2), min(y1, y2)
+            xmax, ymax = max(x1, x2), max(y1, y2)
+
+            points_4 = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
+            w, h = xmax - xmin, ymax - ymin
+            aspect_flag_extreme = max(w / h, h / w) < 1.5 * ASPECT_FLAG  # 解决极端情况
+            rect_flag = _filter_rect(points_4)
+            if aspect_flag_extreme and rect_flag:
+                gen_polygon = Polygon([(points_4[0]), (points_4[1]), (points_4[2]), (points_4[3])])
+                flags = set()
+                for polygon in sheet_polygons:
+                    decision = [gen_polygon.contains(polygon),
+                                gen_polygon.within(polygon),
+                                gen_polygon.overlaps(polygon)]
+                    if True in decision:  # 边界问题
+                        flags.add(False)
+                        break
+                    else:
+                        flags.add(True)
+                if False in flags:
+                    pass
+                else:
+                    return gen_polygon
+
+    def _filter_none(p):
+        if p is not None:
+            return True
+
+    points_2 = combinations(intersect_point_list, 2)
+    gen_polygon_list = map(_find_rect, points_2)
+
+    gen_polygon_list = list(filter(_filter_none, gen_polygon_list))
+    gen_polygon_list = sorted(gen_polygon_list, key=lambda p: p.area, reverse=True)
+    # gen_polygon_list = [polygon for index, polygon in enumerate(gen_polygon_list) if index % 2 == 0]
+    it = itertools.groupby(gen_polygon_list)
+    gen_polygon_list = [k for k, g in it]
+
+    # 在选择题区域的infer polygon
+    gen_choice = []
+    for ele in gen_polygon_list:
+        for choice_p in choice_polygon:
+            if ele.within(choice_p):
+                gen_choice.append(ele)
+
+    sheet_box_area = sum([polygon.area for polygon in sheet_polygons])
+    image_area = width_max * height_max
+    blank_ratio = 1 - sheet_box_area / image_area
+
+    polygon_index = 0
+    include_polygon = []
+    while blank_ratio > REMAIN_RATIO and polygon_index < len(gen_polygon_list):
+        polygon = gen_polygon_list[polygon_index]
+        blank_ratio = blank_ratio - polygon.area / image_area
+        include_polygon.append(polygon)
+        polygon_index += 1
+
+    # gen_polygon_list = [polygon for index, polygon in enumerate(gen_polygon_list)
+    #                     if polygon.area > 1.5 * min_polygon.area]
+
+    for polygon in gen_polygon_list.copy():
+        xi, yi, xx, yx = polygon.bounds
+        w, h = xx - xi, yx - yi
+        if polygon.area <= 1.5 * min_polygon.area or h / w > 2 and polygon.area < avg_area:
+            gen_polygon_list.remove(polygon)
+
+    polygon_2 = list(combinations(gen_polygon_list, 2))
+    for polygons in polygon_2:
+        try:
+            cond2 = polygons[0].overlaps(polygons[1])  # 叠置关系二次分段
+            if cond2:
+                area_list = [polygons[0].area, polygons[1].area]
+                min_index = area_list.index(min(area_list))
+                smaller_polygon = polygons[min_index]
+                larger_polygon = polygons[1 - min_index]
+                new_polygon = smaller_polygon.difference(larger_polygon)
+
+                if smaller_polygon in gen_polygon_list:
+                    gen_polygon_list.remove(smaller_polygon)
+                    if 'MultiPolygon' in str(type(new_polygon)):
+                        for ele in new_polygon:
+                            xm, ym, xx, yx = ele.bounds
+                            w, h = xx - xm, yx - ym
+                            if max(w / h, h / w) < 1.5 * ASPECT_FLAG and ele.area > 1.5 * min_polygon.area:
+                                gen_polygon_list.append(ele)
+                    elif len(set(new_polygon.exterior.coords)) == 4:
+                        xm, ym, xx, yx = new_polygon.bounds
+                        w, h = xx - xm, yx - ym
+                        if max(w / h, h / w) < 1.5 * ASPECT_FLAG and new_polygon.area > 1.5 * min_polygon.area:
+                            gen_polygon_list.append(new_polygon)
+        except Exception as polygon_e:
+            print(polygon_e)
+            continue
+
+    polygon_2 = list(combinations(gen_polygon_list, 2))  # 包含关系取大值
+    for polygons in polygon_2:
+        cond1 = polygons[0].equals(polygons[1])
+        if cond1 and polygons[1] in gen_polygon_list:
+            gen_polygon_list.remove(polygons[1])
+
+    polygon_2 = list(combinations(gen_polygon_list, 2))
+    for polygons in polygon_2:
+        cond2 = polygons[0].contains(polygons[1]) or polygons[0].within(polygons[1])
+        if cond2:
+            area_list = [polygons[0].area, polygons[1].area]
+            min_index = area_list.index(min(area_list))
+
+            smaller_polygon = polygons[min_index]
+            larger_polygon = polygons[1 - min_index]
+            sxi, syi, sxx, syx = smaller_polygon.bounds
+            bxi, byi, bxx, byx = larger_polygon.bounds
+            # inner_touch_cond = '212F11FF2' == larger_polygon.relate(smaller_polygon)
+            two_side_touch_cond = (sxi == bxi and sxx == bxx) or (syi == byi and syx == byx)
+            if two_side_touch_cond:
+                dif_polygon = larger_polygon.difference(smaller_polygon)
+                if larger_polygon in gen_polygon_list:
+                    gen_polygon_list.remove(larger_polygon)
+                if 'MultiPolygon' in str(type(dif_polygon)):
+                    for ele in dif_polygon:
+                        xm, ym, xx, yx = ele.bounds
+                        w, h = xx - xm, yx - ym
+                        if max(w / h, h / w) < 1.5 * ASPECT_FLAG and ele.area > 1.5 * min_polygon.area:
+                            gen_polygon_list.append(ele)
+                elif len(set(dif_polygon.exterior.coords)) == 4:  # empty
+                    xm, ym, xx, yx = dif_polygon.bounds
+                    w, h = xx - xm, yx - ym
+                    if max(w / h, h / w) < 1.5 * ASPECT_FLAG and dif_polygon.area > 1.5 * min_polygon.area:
+                        gen_polygon_list.append(dif_polygon)
+            else:
+                if smaller_polygon in gen_polygon_list:
+                    gen_polygon_list.remove(smaller_polygon)
+
+    polygon_2 = list(combinations(gen_polygon_list, 2))  # 包含关系取大值
+    for polygons in polygon_2:
+        cond1 = polygons[0].equals(polygons[1])
+        if cond1 and polygons[1] in gen_polygon_list:
+            gen_polygon_list.remove(polygons[1])
+
+    if len(lon_split_line) > 0:
+        for line in lon_split_line:
+            # line = LineString([(286, 1), (286, 599)])
+            for poly in gen_polygon_list.copy():
+                cond1 = line.intersects(poly)
+                cond2 = line.touches(poly)
+                if cond1 and not cond2:
+                    dif_polygons = poly.difference(line)
+                    corner_list = list(set(dif_polygons.exterior.coords))
+                    sorted_corner_list = sorted(corner_list, key=lambda x: x[0])
+                    if len(sorted_corner_list) == 6:
+                        left = sorted(sorted_corner_list[0:2], key=lambda x: x[1])
+                        middle = sorted(sorted_corner_list[2:4], key=lambda x: x[1])
+                        right = sorted(sorted_corner_list[4:6], key=lambda x: x[1])
+
+                        tmp_corner_list = [middle[0], left[0], left[1], middle[1], right[1], right[0], middle[0]]
+
+                        polygon1 = Polygon(tmp_corner_list[:4])
+                        polygon2 = Polygon(tmp_corner_list[3:])
+
+                        gen_polygon_list.remove(poly)
+                        for p in [polygon1, polygon2]:
+                            xi, yi, xx, yx = p.bounds
+                            w, h = xx - xi, yx - yi
+                            aspect_flag = max(w / h, h / w) < ASPECT_FLAG
+                            if aspect_flag:
+                                gen_polygon_list.append(p)
+
+    gen_polygon_list = [polygon for index, polygon in enumerate(gen_polygon_list) if polygon.area > min_polygon.area]
+
+    if gen_choice:
+        gen_choice = sorted(gen_choice, key=lambda x: x.area)[-1]
+        gen_polygon_list.append(gen_choice)
+    return gen_polygon_list
+
+
+def infer_class(image, sheet_dict_list, infer_polygon, image_cols, ocr_dict_list=''):
+    res = []
+    all_type_score_polygon = []
+    all_choice_polygon = []
+    all_cloze_polygon = []
+    all_solve_polygon = []
+    all_choice_s_width = []
+    for region_box in sheet_dict_list:
+        if region_box['class_name'] in ['type_score', 'choice', 'cloze', 'solve', 'choice_s']:
+            coordinates = region_box['bounding_box']
+            xmin = coordinates['xmin']
+            ymin = coordinates['ymin']
+            xmax = coordinates['xmax']
+            ymax = coordinates['ymax']
+            box_polygon = Polygon([(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)])
+            if region_box['class_name'] == 'type_score':
+                all_type_score_polygon.append(box_polygon)
+
+            if region_box['class_name'] == 'choice':
+                all_choice_polygon.append(box_polygon)
+
+            if region_box['class_name'] == 'cloze':
+                all_cloze_polygon.append(box_polygon)
+
+            if region_box['class_name'] == 'solve':
+                all_solve_polygon.append(box_polygon)
+
+            if region_box['class_name'] == 'choice_s':
+                all_choice_s_width.append(int(xmax)-int(xmin))
+
+    for poly in infer_polygon.copy():  # infer type_score solve
+        p_xmin, p_ymin, p_xmax, p_ymax = poly.bounds
+        type_score_num = 0
+        type_score_ymin = []
+        for type_score_polygon in all_type_score_polygon:
+            cond1 = type_score_polygon.within(poly)
+            cond2 = False
+            cond3 = type_score_polygon.overlaps(poly)
+            if cond3:
+                intersection_poly = type_score_polygon.intersection(poly)
+                d1 = intersection_poly.area / type_score_polygon.area >= TYPE_SCORE_MNS
+                print('type_score:', intersection_poly.area / type_score_polygon.area)
+                d2 = type_score_polygon.area < 0.2 * poly.area
+                cond2 = d1 and d2
+
+            if cond1 or cond2:
+                type_score_num += 1
+                t_xmin, t_ymin, t_xmax, t_ymax = type_score_polygon.bounds
+                type_score_ymin.append(t_ymin)
+                t_height = t_ymax - t_ymin
+                if t_ymin - p_ymin > 3 * t_height:
+                    type_score_num += 1
+                    type_score_ymin.append(p_ymin)
+
+        if type_score_num == 1:
+            in_xmin, in_ymin, in_xmax, in_ymax = poly.bounds
+            solve_box = {'class_name': 'solve',
+                         'bounding_box': {'xmin': int(in_xmin), 'ymin': int(in_ymin),
+                                          'xmax': int(in_xmax), 'ymax': int(in_ymax)}}
+
+            sheet_dict_list.append(solve_box)
+            infer_polygon.remove(poly)
+            res.append(solve_box)
+        if type_score_num > 1:  # 多type_score
+            type_score_ymin = sorted(type_score_ymin)
+            type_score_ymin[0] = min(p_ymin, type_score_ymin[0])
+            type_score_ymin.append(p_ymax)
+            for i in range(0, len(type_score_ymin) - 1):
+                w = p_xmax - p_xmin
+                h = type_score_ymin[i + 1] - type_score_ymin[i]
+                if max(w / h, h / w) < ASPECT_FLAG:
+                    solve_box = {'class_name': 'solve',
+                                 'bounding_box': {'xmin': int(p_xmin), 'ymin': int(type_score_ymin[i]),
+                                                  'xmax': int(p_xmax), 'ymax': int(type_score_ymin[i + 1])}}
+                    sheet_dict_list.append(solve_box)
+                    res.append(solve_box)
+            infer_polygon.remove(poly)
+
+    # for poly in infer_polygon.copy():  # infer choice_m
+    #     for choice_polygon in all_choice_polygon:
+    #         cond1 = choice_polygon.within(poly) or choice_polygon.contains(poly)
+    #         cond2 = False
+    #         cond3 = choice_polygon.overlaps(poly)
+    #         if cond3:
+    #             intersection_poly = choice_polygon.intersection(poly)
+    #             cond2 = intersection_poly.area / poly.area >= 0.8
+    #
+    #         if cond1 or cond2:
+    #             in_xmin, in_ymin, in_xmax, in_ymax = poly.bounds
+    #             choice_m_img = crop_region_direct(image, (int(in_xmin), int(in_ymin),
+    #                                                       int(in_xmax), int(in_ymax)))
+    #             # cv2.imshow('m', choice_m_img)
+    #             # cv2.waitKey(0)
+    #             ocr_res = get_ocr_text_and_coordinate(choice_m_img)
+    #             char_a_min = []
+    #             char_d_max = []
+    #             for index, chars in enumerate(ocr_res):
+    #                 for char in chars['chars']:
+    #                     left, top = char['location']['left'], char['location']['top']
+    #                     width, height = char['location']['width'], char['location']['height']
+    #                     if char['char'] in 'abcdlABCD[]aabbccddAABBCCDD[[]]':
+    #                         xm, ym = int(left - width / 2), int(top - height / 2)
+    #                         char_a_min.append((xm, ym))
+    #                         xx, yx = int(left + 3 * width / 2), int(top + 3 * height / 2)
+    #                         char_d_max.append((xx, yx))
+    #             if char_a_min and char_d_max:
+    #                 char_a_min_arr, char_d_max_arr = np.array(char_a_min), np.array(char_d_max)
+    #                 tmp_min = np.min(char_a_min_arr, axis=0)
+    #                 tmp_max = np.max(char_d_max_arr, axis=0)
+    #
+    #                 m_xmin, m_ymin, m_xmax, m_ymax = tmp_min[0], tmp_min[1], tmp_max[0], tmp_max[1]
+    #                 dif_width = sum(all_choice_s_width) // len(all_choice_s_width) - (m_xmax - m_xmin)
+    #                 choice_box = {'class_name': 'choice_m',
+    #                               'bounding_box': {'xmin': int(m_xmin) + int(in_xmin) - dif_width // 2,
+    #                                                'ymin': int(m_ymin) + int(in_ymin),
+    #                                                'xmax': int(m_xmax) + int(in_xmin) + dif_width // 2,
+    #                                                'ymax': int(m_ymax) + int(in_ymin)
+    #                                                }}
+    #
+    #                 sheet_dict_list.append(choice_box)
+    #                 infer_polygon.remove(poly)
+    #                 res.append(choice_box)
+    #                 break
+
+    for poly in infer_polygon.copy():  # infer ocr blank
+        flag = []
+        for ocr in ocr_dict_list:
+            location = ocr['location']
+            xmin = location['left']
+            ymin = location['top']
+            xmax = location['left'] + location['width']
+            ymax = location['top'] + location['height']
+            box_polygon = Polygon([(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)])
+            cond1 = poly.within(box_polygon) or poly.contains(box_polygon)
+            cond2 = False
+            cond3 = box_polygon.overlaps(poly)
+            if cond3:
+                intersection_poly = box_polygon.intersection(poly)
+                cond2 = intersection_poly.area / poly.area >= 0.2
+
+            flag.append(cond1 or cond2 or False)  # True 不是blank
+        if True not in flag:
+            in_xmin, in_ymin, in_xmax, in_ymax = poly.bounds
+            blank_box = {'class_name': 'blank',
+                         'bounding_box': {'xmin': int(in_xmin), 'ymin': int(in_ymin),
+                                          'xmax': int(in_xmax), 'ymax': int(in_ymax)}}
+
+            # sheet_dict_list.append(solve_box)
+            infer_polygon.remove(poly)
+            res.append(blank_box)
+
+    for poly in infer_polygon.copy():  # infer blank
+        bounds = [int(ele) for ele in poly.bounds]
+        img_region = crop_region_direct(image, bounds)
+        img = cv2.threshold(img_region, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+        img_mean = np.mean(img)
+
+        img_raw_mean = np.mean(img_region)
+        # print(img_mean, img_raw_mean)
+        cond = img_mean < PIX_VALUE_LOW or img_raw_mean > PIX_VALUE_HIGH
+        if cond:
+            in_xmin, in_ymin, in_xmax, in_ymax = bounds
+            blank_box = {'class_name': 'blank',
+                         'bounding_box': {'xmin': int(in_xmin), 'ymin': int(in_ymin),
+                                          'xmax': int(in_xmax), 'ymax': int(in_ymax)}}
+
+            # sheet_dict_list.append(solve_box)
+            infer_polygon.remove(poly)
+            res.append(blank_box)
+
+    # for poly in infer_polygon.copy():  # infer cloze_s
+    #     for cloze_polygon in all_cloze_polygon:
+    #         cond1 = cloze_polygon.within(poly) or cloze_polygon.contains(poly)
+    #         cond2 = False
+    #         cond3 = cloze_polygon.overlaps(poly)
+    #         if cond3:
+    #             intersection_poly = cloze_polygon.intersection(poly)
+    #             cond2 = intersection_poly.area / poly.area >= 0.8
+    #
+    #         if cond1 or cond2:
+    #             in_xmin, in_ymin, in_xmax, in_ymax = poly.bounds
+    #             solve_box = {'class_name': 'cloze_s',
+    #                          'bounding_box': {'xmin': int(in_xmin), 'ymin': int(in_ymin),
+    #                                           'xmax': int(in_xmax), 'ymax': int(in_ymax)}}
+    #
+    #             sheet_dict_list.append(solve_box)
+    #             infer_polygon.remove(poly)
+    #             res.append(solve_box)
+    #             break
+
+    for poly in infer_polygon.copy():  # infer solve
+        in_xmin, in_ymin, in_xmax, in_ymax = poly.bounds
+        w, h = in_xmax - in_xmin, in_ymax - in_ymin
+        aspect_flag = max(w / h, h / w) < ASPECT_FLAG
+        if aspect_flag:
+            solve_box = {'class_name': 'solve_infer',
+                         'bounding_box': {'xmin': int(in_xmin), 'ymin': int(in_ymin),
+                                          'xmax': int(in_xmax), 'ymax': int(in_ymax)}}
+        else:
+            solve_box = {'class_name': 'blank',
+                         'bounding_box': {'xmin': int(in_xmin), 'ymin': int(in_ymin),
+                                          'xmax': int(in_xmax), 'ymax': int(in_ymax)}}
+
+        sheet_dict_list.append(solve_box)
+        infer_polygon.remove(poly)
+        res.append(solve_box)
+
+    if all_type_score_polygon:
+        type_score_area = sum([ele.area for ele in all_type_score_polygon])
+        mean_type_score_area = type_score_area/len(all_type_score_polygon)
+        solve_filter = []
+        for index, sheet_box in enumerate(sheet_dict_list.copy()):
+            if sheet_box['class_name'] == 'solve_infer':
+                w = sheet_box['bounding_box']['xmax'] - sheet_box['bounding_box']['xmin']
+                h = sheet_box['bounding_box']['ymin'] - sheet_box['bounding_box']['ymin']
+                if w * h < mean_type_score_area * 3:
+                    sheet_dict_list.remove(sheet_box)
+
+    for ele in sheet_dict_list:
+        if ele['class_name'] == 'solve_infer':
+            ele.update({'class_name': 'solve'})
+
+    return sheet_dict_list
+
+
+def box_infer_and_complete(image, sheet_region_dict, ocr=''):
+    if len(image.shape) == 3:
+        image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+    if len(image.shape) == 4:
+        image = cv2.cvtColor(image, cv2.COLOR_RGBA2GRAY)
+    exclude_classes = [
+        'cloze_s',
+        'exam_number_s',
+        'type_score',
+        'page',
+        'alarm_info',
+        # 'score_collect',
+        'choice_s',
+    ]
+    y, x = image.shape[0], image.shape[1]
+    x1, x2 = subfield_answer_sheet(image, sheet_region_dict)
+
+    # lon_split_line = []
+    lon_split_line = [LineString([(px, 1), (px, y - 1)]) for px in [x1, x2] if px != 0]
+    split_line_poly = [(px, 1, px + 1, y - 1) for px in [x1, x2] if px != 0]
+
+    poly_list = infer_sheet_box(image, sheet_region_dict, lon_split_line, exclude_classes)
+    image_cols = len(lon_split_line) + 1
+    sheet_region_dict = infer_class(image, sheet_region_dict, poly_list, image_cols, ocr)
+
+    return sheet_region_dict
+
+
+# 选择题区域补全
+def _get_split_index(sorted_list, spilt_value):
+    y_dif_list = np.array(sorted_list[1:]) - np.array(sorted_list[:-1])
+    y_split_index = [index for index, ele in enumerate(y_dif_list) if ele >= spilt_value]
+
+    y_split_index = [ele + 1 for ele in y_split_index]  # 索引值扩大
+    y_split_index.insert(0, 0)
+    y_split_index.insert(-1, len(sorted_list))
+    y_split_index = sorted(list(set(y_split_index)))
+
+    return y_split_index
+
+
+def get_letter_group(letter, location_list):
+    y_list = sorted([ele['location']['top'] for ele in location_list])
+    height = np.mean(np.array([ele['location']['height'] for ele in location_list]))
+    width = np.mean(np.array([ele['location']['width'] for ele in location_list]))
+    y_split_dif, x_split_dif = height * 1.5, width * 1.5
+
+    y_split_index = _get_split_index(y_list, y_split_dif)
+
+    letter_group_list = []
+    letter_group_location_list = []
+    for i, split in enumerate(y_split_index[1:]):
+        one_group_location_list = location_list[y_split_index[i]:y_split_index[i + 1]]
+        one_group_x_list = sorted([ele['location']['top'] for ele in one_group_location_list])
+        one_group_x_split_index = _get_split_index(one_group_x_list, x_split_dif)
+
+        block = []
+        block_location = []
+        for i_i, s_split in enumerate(one_group_x_split_index[1:]):
+            letter_group = one_group_location_list[one_group_x_split_index[i_i]:
+                                                   one_group_x_split_index[i_i + 1]]
+            letter_group = sorted(letter_group, key=lambda k: k.get('location')['top'])
+
+            xmin = min([ele['location']['left'] for ele in letter_group])
+            ymin = min([ele['location']['top'] for ele in letter_group])
+            xmax = max([ele['location']['left'] for ele in letter_group]) + width
+            ymax = max([ele['location']['top'] for ele in letter_group]) + height
+            middle_x, middle_y = (xmax - xmin) / 2 + xmin, (ymax - ymin) / 2 + ymin
+            block_location.append((xmin, ymin, xmax, ymax, middle_x, middle_y))
+            block.append(letter_group)
+
+        letter_group_list.append(block)
+        letter_group_location_list.append(block_location)
+
+    res_dict = {'letter': letter,
+                'letter_group': letter_group_list,
+                'letter_group_location': letter_group_location_list,
+                'width': width, 'height': height}
+
+    return res_dict
+
+
+def get_letter_group_h(letter, location_list):
+    location_list = sorted(location_list, key=lambda k: k.get('location')['left'])
+    x_list = sorted([ele['location']['left'] for ele in location_list])
+    height = np.mean(np.array([ele['location']['height'] for ele in location_list]), dtype=np.uint)
+    width = np.mean(np.array([ele['location']['width'] for ele in location_list]), dtype=np.uint)
+    print('h, w: ', height, width)
+    y_split_dif, x_split_dif = height * 1.5, width * 1.5
+
+    x_split_index = _get_split_index(x_list, x_split_dif)
+
+    letter_group_location_list = []
+    for i, split in enumerate(x_split_index[1:]):
+        one_group_location_list = location_list[x_split_index[i]:x_split_index[i + 1]]
+        one_group_location_list = sorted(one_group_location_list, key=lambda k: k.get('location')['top'])
+        xmin = min([ele['location']['left'] for ele in one_group_location_list])
+        ymin = one_group_location_list[0]['location']['top']
+        xmax = xmin + width
+        ymax = one_group_location_list[-1]['location']['top'] + 2*one_group_location_list[-1]['location']['height']
+        letter_group_location_list.append((xmin - 2*width, ymin,
+                                           xmax + 2*width, ymax))
+
+    return {'letter': letter, 'group_location': letter_group_location_list}
+
+
+def infer_choice_m_by_ocr(ocr_dict_list):
+    # 若字母识别漏掉结果太多, 此方法不能使用
+    a_e = 'ABCDEF'
+    pattern = '[ABCDEF]'
+    a_e_dict = {k: [] for k in a_e}
+    block_num = 1  # default
+    for i, ele in enumerate(ocr_dict_list):
+        words = ele['words']
+        cal_num = max([words.upper().count(char) for char in a_e])
+        if cal_num > 0:
+            words = words.replace(' ', '').upper()  # 去除空格,baidu_api bug
+            abcd_words_m = re.finditer(pattern, words)
+            abcd_index_list = [(m.group(), m.span()) for m in abcd_words_m if m]
+            for letter_info in abcd_index_list:
+                letter = letter_info[0]
+                a_e_dict[letter].append(ele['chars'][letter_info[1][0]])
+
+    letter_group_list = []
+    for k, v in a_e_dict.items():
+        if v:
+            letter_group = get_letter_group_h(k, v)
+            block_num = max(block_num, len(letter_group['group_location']))
+            print(letter_group)
+            letter_group_list.append(letter_group)
+
+    choice_m_list = []
+    for i in range(0, block_num):
+        block = []
+        for letter_group in letter_group_list:
+            if len(letter_group['group_location']) > i:
+                block.append(letter_group['group_location'][i])
+
+        if block:
+            block_array = np.asarray(block)
+            b_min = np.min(block_array, axis=0)
+            b_max = np.max(block_array, axis=0)
+            choice_m_dict = {'class_name': 'choice_m',
+                             'location': {'xmin': b_min[0], 'ymin': b_min[1],
+                                          'xmax': b_max[2], 'ymax': b_max[3]}}
+            choice_m_list.append(choice_m_dict)
+
+    # print(choice_m_list)
+    return choice_m_list

+ 534 - 0
segment/sheet_resolve/analysis/sheet/sheet_points.py

@@ -0,0 +1,534 @@
+# @Author  : lightXu
+# @File    : sheet_points.py
+# @Time    : 2019/9/12 0012 下午 14:39
+import re
+import numpy as np
+from segment.sheet_resolve.tools import utils
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate
+OCR_ACCURACY = 'accurate'
+
+
+def split_col(box_list, width):
+    if len(box_list) == 1:
+        return [box_list]
+    else:
+        box_list = sorted(box_list, key=lambda k: k.get('bounding_box')['xmin'])
+        box_list_class_name = [[ele_box['bounding_box']['xmin'], ele_box['bounding_box']['ymin'],
+                                ele_box['bounding_box']['xmax'], ele_box['bounding_box']['ymax']] for ele_box in
+                               box_list]
+        box_list_raw = sorted(box_list_class_name, key=lambda k: k[0])
+        box_list_raw_arr = np.array(box_list_raw)
+
+        pre = box_list_raw_arr[1:, 0]
+        rear = box_list_raw_arr[:-1, 0]
+        y_diff = pre - rear
+        index_list = [index for index, ele in enumerate(y_diff) if abs(ele) > width // 10]
+
+        res_list = []
+        if index_list == []:
+            res_list = [sorted(box_list, key=lambda k: k.get('bounding_box')['ymin'])]
+            return res_list
+        else:
+            split_x_index = [ele + 1 for ele in index_list]
+            split_x_index.insert(0, 0)
+            split_x_index.insert(-1, len(box_list_raw))
+            split_x_index = sorted(list(set(split_x_index)))
+            for i, split in enumerate(split_x_index[1:]):
+                one_col = box_list[split_x_index[i]:split_x_index[i + 1]]
+                one_col = sorted(one_col, key=lambda k: k.get('bounding_box')['ymin'])
+                res_list.append(one_col)
+        return res_list
+
+
+def check_classes(sheet_dict_of_front):
+    sheet_dict_of_front_temp = sheet_dict_of_front.copy()
+    index_list = []
+    for index, ele in enumerate(sheet_dict_of_front_temp):
+        for index0, one_paper in enumerate(ele):
+            if one_paper['title_number'] == -1 and one_paper['title_with_value'] != -1:
+                index_list.append(index0)
+    # print(index_list)
+    if index_list == []:
+        return sheet_dict_of_front
+    else:
+        if 0 in index_list and len(index_list) > 2:
+            index_list = index_list[1:]
+        elif 0 in index_list and len(index_list) == 1:
+            return sheet_dict_of_front
+        else:
+            index_list = index_list
+        for index_a, ele_a in enumerate(sheet_dict_of_front):
+            for index1, ele1 in enumerate(index_list):
+                former_class = ele_a[ele1 - 1]
+                if former_class != []:
+                    lack_title_number_dict = ele_a[ele1]
+                    lack_title_number_dict['title_number'] = former_class['title_number']
+                    lack_title_number_dict.update({'title_number': former_class['title_number']})
+        return sheet_dict_of_front
+
+
+def change_box(cloze_s_res, cloze_s_region):
+    words_result = []
+    for ele in cloze_s_res:
+        location = ele['location']
+        xmin = location['left']
+        ymin = location['top']
+        xmax = location['left'] + location['width']
+        ymax = location['top'] + location['height']
+        bbox0 = utils.get_img_region_box01([xmin, ymin, xmax, ymax], cloze_s_region)
+        location.update({'left': bbox0[0], 'top': bbox0[1], 'width': bbox0[2] - bbox0[0], 'height': bbox0[3] - bbox0[1]})
+        words_result.append(ele)
+    return words_result
+
+
+def get_total_title_quantity_and_value(box_with_content):
+    """Parse the OCR lines of each region into question numbers and scores.
+
+    Args:
+        box_with_content: iterable of dicts, each read for the keys
+            'class_name', 'bounding_box' and 'above_content'; every item of
+            'above_content' is read for its 'words' text.
+
+    Returns:
+        A list with one dict per input region holding 'class_name',
+        'bounding_box', 'title_number', 'total_score',
+        'total_title_quantity' and 'title_with_value'. A field is set to -1
+        when nothing was extracted for it.
+    """
+    list_of_all = []
+
+    for index, ele in enumerate(box_with_content):
+        # Accumulators for one region.
+        total_score = []
+        title_number = []
+
+        # Sub-question numbers, their values, and [number, value] pairs
+        # found together on one line.
+        title_two_number = []
+        title_two_value = []
+        title_two_number_value_list = []
+
+        # Circled-number items and their values; collected but not part of
+        # the returned dict.
+        title_three_number = []
+        title_three_value = []
+
+        above_content = ele['above_content']
+        for words_index, words_str in enumerate(above_content):
+            words = words_str['words']
+
+            # pattern1: line starts with a number, optional separator and
+            # optional CJK char, then "<digits>分" optionally parenthesised,
+            # optionally followed by another parenthesised number.
+            pattern1 = re.compile(
+                '^\d+[,、.]?[\u4e00-\u9fa5]?[((]?\d+分+[))]?[((]?\d+[))]?|^\d+[,、.]?[\u4e00-\u9fa5]?[((]?\d+分+[))]?')
+            result1 = re.findall(pattern1, words)
+
+            # pattern11: same leading shape as pattern1 but without '分'.
+            pattern11 = re.compile('^\d+[,、.]?[\u4e00-\u9fa5]?[((]?\d+[))]?')
+            result11 = re.findall(pattern11, words)
+
+            # pattern2: every digit run, optionally wrapped in parentheses.
+            pattern2 = re.compile('[((]?\d+[))]?')
+            result2 = re.findall(pattern2, words)
+
+            # pattern3: '分' with at most one digit directly before it.
+            pattern3 = re.compile('[((]?\d?分+[))]?')
+            result3 = re.findall(pattern3, words)
+
+            # pattern4: circled numbers 1-10.
+            pattern4 = re.compile(r'①|②|③|④|⑤|⑥|⑦|⑧|⑨|⑩')
+            result4 = re.findall(pattern4, words)
+
+            # pattern5: any CJK character.
+            pattern5 = re.compile('[\u4e00-\u9fa5]')
+            result5 = re.findall(pattern5, words)
+
+            # First header-like line: read title number, total score and
+            # (when a third number is present) the first sub-question value.
+            if result1 and title_number == []:
+                title_number_and_value = ''.join(result1)
+                digital_number = re.findall('\d+', title_number_and_value)
+                if len(digital_number) == 1:
+                    title_number.append(int(digital_number[0]))
+                    continue
+                elif len(digital_number) == 2:
+                    title_number.append(int(digital_number[0]))
+                    total_score.append(int(digital_number[1]))
+                    continue
+                elif len(digital_number) == 3:
+                    title_number.append(int(digital_number[0]))
+                    total_score.append(int(digital_number[1]))
+                    title_two_value.append(int(digital_number[2]))
+                    continue
+
+            # Numbered line without any '分': title number, plus a
+            # sub-question number when a second number is present.
+            if result11 and result2 and not result3:
+                digital_number = re.findall('\d+', words)
+                if len(digital_number) == 1:
+                    title_number.append(int(digital_number[0]))
+                else:
+                    title_number.append(int(digital_number[0]))
+                    title_two_number.append(int(digital_number[1]))
+
+            # Bare number: treated as a sub-question number.
+            if result2 and not result1 and not result3 and not result11:
+                title_two_number_group = result2[0]
+                digital_number2 = re.search('\d+', title_two_number_group)
+                digital_number22 = digital_number2.group()
+                title_two_number.append(int(digital_number22))
+
+            # '分' match on a line without digit runs, circled numbers or
+            # CJK characters: treated as a sub-question value.
+            if result3 and not result1 and not result2 and not result4 and not result5:
+                title_two_value_group = result3[0]
+                digital_value = re.search('\d+', title_two_value_group)
+                if digital_value == None:
+                    # NOTE(review): falls back to 2 here, while the branches
+                    # below fall back to -1; reason not evident from this code.
+                    value = 2
+                else:
+                    value = digital_value.group()
+                title_two_value.append(int(value))
+
+            if result2 and result4:
+                title_three_value_group = result4[0]
+                title_three_number.append(title_three_value_group)
+            if title_three_number != [] and result3:
+                title_three_value_group = result3[0]
+                digital_value = re.search('\d+', title_three_value_group)
+                # NOTE(review): pattern3's digit is optional, so digital_value
+                # can be None here and .group() would raise AttributeError.
+                value = digital_value.group()
+                title_three_value.append(int(value))
+            # Line with both a number and '分' that is not a header line.
+            if result2 and result3 and not result1:
+                if len(result3) == 1:
+                    title_two_value_group = result3[0]
+                    digital_value = re.search('\d+', title_two_value_group)
+                    if digital_value == None:
+                        value = -1
+                    else:
+                        value = digital_value[0]
+                    title_two_value.append(int(value))
+                else:
+                    # Several '分' matches: pair the first number on the line
+                    # with the value taken from the second '分' match.
+                    title_two_number0 = result2[0]
+                    digital_title_two_number = re.search('\d+', title_two_number0)
+                    digital_title_two_group = digital_title_two_number.group()
+                    # title_two_number.append(digital_title_two_group)
+
+                    value = result3[1]
+                    digital_value = re.search('\d+', value)
+                    if digital_value == None:
+                        value0 = -1
+                    else:
+                        value0 = digital_value.group()
+
+                    # title_two_value.append(int(value0))
+
+                    title_two_number_value_list.append([int(digital_title_two_group), int(value0)])
+
+            # Same extraction as the branch above, for header-like lines seen
+            # once exactly one title number has been recorded.
+            if result1 and result2 and result3 and len(title_number) == 1:
+                if len(result3) == 1:
+                    title_two_value_group = result3[0]
+                    digital_value = re.search('\d+', title_two_value_group)
+                    if digital_value == None:
+                        value = -1
+                    else:
+                        value = digital_value[0]
+                    title_two_value.append(int(value))
+                else:
+                    title_two_number0 = result2[0]
+                    digital_title_two_number = re.search('\d+', title_two_number0)
+                    digital_title_two_group = digital_title_two_number.group()
+                    # title_two_number.append(digital_title_two_group)
+
+                    value = result3[1]
+                    digital_value = re.search('\d+', value)
+                    if digital_value == None:
+                        value0 = -1
+                    else:
+                        value0 = digital_value.group()
+
+                    # title_two_value.append(int(value0))
+
+                    title_two_number_value_list.append([int(digital_title_two_group), int(value0)])
+            # Last statement of the loop body, so this continue has no effect.
+            if result5:
+                continue
+        # Default pairing: sub-question numbers zipped with values (zip
+        # truncates to the shorter list).
+        total_title_quantity = len(title_two_number)
+        title_with_value = dict(zip(title_two_number, title_two_value))
+
+        # Reconcile the three collections depending on which are non-empty.
+        if title_two_number == [] and title_two_value == [] and title_two_number_value_list != []:
+            total_title_quantity = len(title_two_number_value_list)
+            title_with_value = dict(zip([ele[0] for ele in title_two_number_value_list],
+                                        [ele[1] for ele in title_two_number_value_list]))
+        elif title_two_number == [] and title_two_value != [] and title_two_number_value_list != []:
+            # Values exist without numbers: give the first loose value to the
+            # first number missing from the range 1..(pairs + loose values).
+            ele_0 = []
+            for element in title_two_number_value_list:
+                ele_0.append(element[0])
+            ele_1 = [ele for ele in range(1, int(len(title_two_number_value_list) + len(title_two_value) + 1))]
+
+            lack_number = list(set(ele_1) - set(ele_0))
+            # NOTE(review): lack_number[0] raises IndexError if no number in
+            # that range is missing; set ordering also makes [0] arbitrary.
+            lack_number_with_value = [lack_number[0], title_two_value[0]]
+            title_two_number_value_list.append(lack_number_with_value)
+
+            title_two_number_value_list = sorted(title_two_number_value_list, key=lambda k: k[0])
+
+            title_with_value = dict(
+                zip([ele[0] for ele in title_two_number_value_list], [ele[1] for ele in title_two_number_value_list]))
+            total_title_quantity = len(title_with_value)
+        elif title_two_number != [] and title_two_value == [] and title_two_number_value_list == []:
+            # Numbers only: every sub-question gets the placeholder value -1.
+            title_two_value11 = []
+            for i in range(len(title_two_number) + 1):
+                if i:
+                    title_two_value11.append(-1)
+            total_title_quantity = len(title_two_number)
+            title_with_value = dict(zip(title_two_number, title_two_value11))
+        elif title_two_number != [] and title_two_value != [] \
+                and title_two_number_value_list == [] and len(title_two_number) == len(title_two_value):
+            title_with_value = dict(zip(title_two_number, title_two_value))
+            total_title_quantity = len(title_two_number)
+
+        # From here on total_score is an int: the parsed total, the sum of
+        # the sub-question values, or -1.
+        if total_score == [] and title_two_value != []:
+            total_score = int(sum(title_two_value))
+        elif total_score == [] and title_two_value == []:
+            total_score = -1
+        else:
+            total_score = total_score[0]
+
+        if title_two_value == [] and total_score != -1 and title_two_number != []:
+            # NOTE(review): title_two_value becomes an int here (an even
+            # split of the total) and is not written into title_with_value.
+            title_two_value = int(total_score / len(title_two_number))
+        elif title_two_value == [] and total_score == -1 and title_two_number != []:
+            title_two_value0 = []
+            for i in range(len(title_two_number) + 1):
+                if i:
+                    title_two_value0.append(-1)
+            title_with_value = dict(zip(title_two_number, title_two_value0))
+
+        # Collapse title_number to a single int (-1 when none was found).
+        # NOTE(review): total_score is already an int, so `total_score != []`
+        # is always true and the third branch looks unreachable.
+        if title_number != [] and total_score != []:
+            title_number = title_number[0]
+            total_score = total_score
+        elif title_number == [] and total_score != []:
+            title_number = -1
+            total_score = total_score
+        elif title_number != [] and total_score == []:
+            title_number = title_number[0]
+            total_score = -1
+
+        # Build the result dict for this region.
+        if title_number == -1 and title_two_number == [] and total_score == -1 and title_two_value == [] and total_title_quantity == 0:
+            # Nothing was recognised at all.
+            per_title_content = {}
+            per_title_content['class_name'] = ele['class_name']
+            per_title_content['bounding_box'] = ele['bounding_box']
+
+            per_title_content['title_number'] = -1
+            per_title_content['total_title_quantity'] = -1
+            per_title_content['total_score'] = -1
+            per_title_content['title_with_value'] = -1
+
+            list_of_all.append(per_title_content)
+        # NOTE(review): with total_score == -1, title_two_value is still a
+        # list, so `title_two_value == 0` appears never to hold.
+        elif title_number == -1 and title_two_number != [] and total_score == -1 and title_two_value == 0 and total_title_quantity != 0:
+            per_title_content = {}
+
+            per_title_content['class_name'] = ele['class_name']
+            per_title_content['bounding_box'] = ele['bounding_box']
+            per_title_content['title_number'] = -1
+            per_title_content['total_title_quantity'] = total_title_quantity
+            per_title_content['total_score'] = -1
+            per_title_content['title_with_value'] = -1
+
+            list_of_all.append(per_title_content)
+
+        # NOTE(review): title_with_value is a dict at this point, so the
+        # comparison with [] appears never to hold.
+        elif total_title_quantity == 0 and title_with_value == []:
+            per_title_content = {}
+
+            per_title_content['class_name'] = ele['class_name']
+            per_title_content['bounding_box'] = ele['bounding_box']
+            per_title_content['title_number'] = title_number
+            per_title_content['total_title_quantity'] = -1
+            per_title_content['total_score'] = total_score
+            per_title_content['title_with_value'] = -1
+            list_of_all.append(per_title_content)
+
+        else:
+            # General case: report what was found, using -1 for "none".
+            if total_title_quantity == 0:
+                total_title_quantity = -1
+            if title_with_value == {}:
+                title_with_value = -1
+            per_title_content = {}
+            per_title_content['class_name'] = ele['class_name']
+            per_title_content['bounding_box'] = ele['bounding_box']
+            per_title_content['title_number'] = title_number
+            per_title_content['total_score'] = total_score
+            per_title_content['total_title_quantity'] = total_title_quantity
+            per_title_content['title_with_value'] = title_with_value
+            list_of_all.append(per_title_content)
+    return list_of_all
+
+
+def get_sheet_points(sheet_dict_list):
+    """Attach question-number and score information to every sheet's regions.
+
+    Sheets are split into "front" (they contain a 'choice_s' or 'choice_m'
+    region) and "back" (all others). For each sheet the 'solve'/'solve0'
+    regions (skipped when the subject is 'english') and the 'cloze_s'
+    regions are paired with OCR text and parsed by
+    get_total_title_quantity_and_value. The parsed values are written back
+    into the matching region dicts under 'number', 'default_points',
+    'total_title_quantity' and 'title_with_value'.
+
+    Args:
+        sheet_dict_list: list of dicts read for 'sheet_dict' (with 'regions'
+            and 'subject'), 'shape' and 'ocr'; 'raw_image' is read from the
+            first entry only.
+
+    Returns:
+        The front sheets followed by the back sheets.
+
+    Raises:
+        ValueError: if the number of returned sheets differs from the input.
+    """
+    region_dict_front = []
+    region_dict_back = []
+
+    # Front sheets are those that contain a choice region.
+    for sheet_dict_s in sheet_dict_list:
+        regions = sheet_dict_s['sheet_dict']['regions']
+        class_name_list = [ele['class_name'] for ele in regions]
+        if 'choice_s' in class_name_list or 'choice_m' in class_name_list:
+            region_dict_front.append(sheet_dict_s)
+        else:
+            region_dict_back.append(sheet_dict_s)
+
+    # ---- front sheets ----
+    sheet_dict_of_front = []
+    sheet_dict_of_front_without_solve_cloze = []
+    for sheet_dict_front_s in region_dict_front:
+        h, w = sheet_dict_front_s['shape'][0], sheet_dict_front_s['shape'][1]
+        words_result_front = sheet_dict_front_s['ocr']
+        cloze_or_solve_list_front = []
+        if sheet_dict_front_s['sheet_dict']['subject'] != 'english':
+            cloze_or_solve_list_front = [ele for ele in sheet_dict_front_s['sheet_dict']['regions']
+                                         if ele['class_name'] == 'solve0' or ele['class_name'] == 'solve']
+        cloze_s_list = [ele for ele in sheet_dict_front_s['sheet_dict']['regions'] if ele['class_name'] == 'cloze_s']
+        cloze_s_box_with_content = []
+        # cloze_s regions are re-OCR'd from a crop instead of reusing the
+        # sheet-level OCR result.
+        for ele in cloze_s_list:
+            cloze_s_dict = {}
+            # NOTE(review): the crop always comes from the first sheet's
+            # raw_image, not from the sheet being processed -- confirm intent.
+            cloze_s_region = utils.crop_region(sheet_dict_list[0]['raw_image'], ele['bounding_box'])
+            cloze_s_res = get_ocr_text_and_coordinate(cloze_s_region, ocr_accuracy=OCR_ACCURACY, language_type='CHN_ENG')
+            if cloze_s_res == []:
+                cloze_s_res0 = cloze_s_res
+            else:
+                cloze_s_res0 = change_box(cloze_s_res, ele['bounding_box'])
+
+            cloze_s_dict['class_name'] = ele['class_name']
+            cloze_s_dict['bounding_box'] = ele['bounding_box']
+            cloze_s_dict['above_content'] = cloze_s_res0
+            cloze_s_box_with_content.append(cloze_s_dict)
+        solve_and_cloze_s_list = cloze_or_solve_list_front + cloze_s_list
+
+        if len(solve_and_cloze_s_list) == 0:
+            sheet_dict_of_front_without_solve_cloze.append(sheet_dict_front_s)
+        else:
+            cloze_or_solve0_box_list_front = split_col(solve_and_cloze_s_list, w)
+
+            box_with_content_front = []
+            for one_col_box in cloze_or_solve0_box_list_front:
+
+                for ele_box in one_col_box:
+                    big_box = {}
+                    words_list = []
+                    # Keep the sheet-level OCR lines that
+                    # utils.decide_coordinate_contains1 accepts for this region.
+                    for words_res in words_result_front:
+                        xmin = words_res['location']['left']
+                        ymin = words_res['location']['top']
+                        xmax = words_res['location']['left'] + words_res['location']['width']
+                        ymax = words_res['location']['top'] + words_res['location']['height']
+                        words_bbox = [xmin, ymin, xmax, ymax]
+                        if utils.decide_coordinate_contains1(words_bbox,
+                                                             [ele_box['bounding_box']['xmin'],
+                                                              ele_box['bounding_box']['ymin'],
+                                                              ele_box['bounding_box']['xmax'],
+                                                              ele_box['bounding_box']['ymax']]):
+                            words_list.append(words_res)
+                    big_box['class_name'] = ele_box['class_name']
+                    big_box['bounding_box'] = ele_box['bounding_box']
+                    big_box['above_content'] = words_list
+                    box_with_content_front.append(big_box)
+            # NOTE(review): cloze_s regions are also part of
+            # solve_and_cloze_s_list, so they appear twice in this list.
+            box_with_content_front1 = box_with_content_front + cloze_s_box_with_content
+            list_of_front = get_total_title_quantity_and_value(box_with_content_front1)
+            sheet_dict_of_front.append(list_of_front)
+
+    # ---- back sheets: same pipeline as the front sheets ----
+    sheet_dict_of_back = []
+    sheet_dict_of_back_without_solve_cloze = []
+    for single_sheet_dict in region_dict_back:
+        h1, w1 = single_sheet_dict['shape'][0], single_sheet_dict['shape'][1]
+        words_result_back = single_sheet_dict['ocr']
+        cloze_or_solve0_box_list_back = []
+        if single_sheet_dict['sheet_dict']['subject'] != 'english':
+            cloze_or_solve0_box_list_back = [ele for ele in single_sheet_dict['sheet_dict']['regions']
+                                         if ele['class_name'] == 'solve0'
+                                         or ele['class_name'] == 'solve']
+        cloze_s_list = [ele for ele in single_sheet_dict['sheet_dict']['regions'] if ele['class_name'] == 'cloze_s']
+        cloze_s_box_with_content = []
+        for ele in cloze_s_list:
+            cloze_s_dict = {}
+            # NOTE(review): crops from the first sheet's raw_image here too.
+            cloze_s_region = utils.crop_region(sheet_dict_list[0]['raw_image'], ele['bounding_box'])
+            cloze_s_res = get_ocr_text_and_coordinate(cloze_s_region, ocr_accuracy=OCR_ACCURACY,
+                                                                       language_type='CHN_ENG')
+            if cloze_s_res == []:
+                cloze_s_res0 = cloze_s_res
+            else:
+                cloze_s_res0 = change_box(cloze_s_res, ele['bounding_box'])
+
+            cloze_s_dict['class_name'] = ele['class_name']
+            cloze_s_dict['bounding_box'] = ele['bounding_box']
+            cloze_s_dict['above_content'] = cloze_s_res0
+            cloze_s_box_with_content.append(cloze_s_dict)
+        solve_and_cloze_s_list_back = cloze_or_solve0_box_list_back + cloze_s_list
+
+        if len(solve_and_cloze_s_list_back) == 0:
+            sheet_dict_of_back_without_solve_cloze.append(single_sheet_dict)
+        else:
+            cloze_or_solve0_box_list_back = split_col(solve_and_cloze_s_list_back, w1)
+
+            box_with_content_back = []
+            for one_col_box in cloze_or_solve0_box_list_back:
+                for ele_box in one_col_box:
+                    words_list = []
+                    big_box = {}
+                    for words_res in words_result_back:
+                        xmin = words_res['location']['left']
+                        ymin = words_res['location']['top']
+                        xmax = words_res['location']['left'] + words_res['location']['width']
+                        ymax = words_res['location']['top'] + words_res['location']['height']
+                        words_bbox = [xmin, ymin, xmax, ymax]
+                        if utils.decide_coordinate_contains1(words_bbox,
+                                                             [ele_box['bounding_box']['xmin'],
+                                                              ele_box['bounding_box']['ymin'],
+                                                              ele_box['bounding_box']['xmax'],
+                                                              ele_box['bounding_box']['ymax']]):
+                            words_list.append(words_res)
+                    big_box['class_name'] = ele_box['class_name']
+                    big_box['bounding_box'] = ele_box['bounding_box']
+                    big_box['above_content'] = words_list
+                    box_with_content_back.append(big_box)
+            # print(box_with_content_back)
+
+            box_with_content_back1 = box_with_content_back + cloze_s_box_with_content
+            # box_with_content_back1 = sorted(box_with_content_back1, key=lambda k: k.get('above_content'))
+            list_of_back = get_total_title_quantity_and_value(box_with_content_back1)
+            sheet_dict_of_back.append(list_of_back)
+
+    # Parsed results of a side are kept only when every sheet of that side
+    # had at least one solve/cloze region; otherwise they are discarded.
+    if sheet_dict_of_front != [] and sheet_dict_of_front_without_solve_cloze == []:
+        sheet_dict_of_front = check_classes(sheet_dict_of_front)
+    else:
+        sheet_dict_of_front = []
+    if sheet_dict_of_back != [] and sheet_dict_of_back_without_solve_cloze == []:
+        sheet_dict_of_back = check_classes(sheet_dict_of_back)
+    else:
+        sheet_dict_of_back = []
+
+    # Derive a number for the first region of each back sheet from the
+    # region that follows it.
+    # NOTE(review): single_back[1] raises IndexError for a one-region sheet,
+    # and the 'number' key set here is not read again in this function.
+    if sheet_dict_of_front != [] and sheet_dict_of_back != []:
+        for single_back in sheet_dict_of_back:
+            if single_back[0]['title_number'] == -1 and single_back[0]['title_with_value'] != -1:
+                if single_back[1]['title_number'] == -1:
+                    title_number = single_back[0]['title_number']
+                    single_back[0].update({'number': title_number})
+                elif single_back[1]['title_number'] != -1:
+                    title_number = int(single_back[1]['title_number']) - 1
+                    single_back[0].update({'number': title_number})
+                else:
+                    continue
+            else:
+                continue
+
+    # Copy parsed values into the front regions, matching on class name and
+    # bounding box.
+    if sheet_dict_of_front != []:
+        for ele11 in region_dict_front:
+            class_names_front = ele11['sheet_dict']['regions']
+            for single_classes in class_names_front:
+                for ele12 in sheet_dict_of_front:
+                    for ele13 in ele12:
+                        if single_classes['class_name'] == ele13['class_name'] and single_classes['bounding_box'] == \
+                                ele13['bounding_box']:
+                            title_number = ele13['title_number']
+                            total_score = ele13['total_score']
+                            total_title_quantity = ele13['total_title_quantity']
+                            title_with_value = ele13['title_with_value']
+                            single_classes.update({'number': title_number, 'default_points': total_score,
+                                                   'total_title_quantity': total_title_quantity,
+                                                   'title_with_value': title_with_value})
+                        else:
+                            continue
+    else:
+        # No usable parse: keep only the front sheets without solve/cloze regions.
+        region_dict_front = sheet_dict_of_front_without_solve_cloze
+
+    # Same write-back for the back sheets.
+    if sheet_dict_of_back != []:
+        for ele22 in region_dict_back:
+            class_names_back = ele22['sheet_dict']['regions']
+            for single_class_back in class_names_back:
+                for single_paper in sheet_dict_of_back:       #
+                    for regions_all in single_paper:
+                        if single_class_back['class_name'] == regions_all['class_name'] and \
+                                single_class_back['bounding_box'] == regions_all['bounding_box']:
+                            title_number0 = regions_all['title_number']
+                            total_score0 = regions_all['total_score']
+                            total_title_quantity0 = regions_all['total_title_quantity']
+                            title_with_value0 = regions_all['title_with_value']
+                            single_class_back.update({'number': title_number0,
+                                                      'default_points': total_score0,
+                                                      'total_title_quantity': total_title_quantity0,
+                                                      'title_with_value': title_with_value0})
+                        else:
+                            continue
+    else:
+        region_dict_back = sheet_dict_of_back_without_solve_cloze
+
+    # Sanity check: no sheet may get lost. The message reads "pages lost in
+    # the answer-sheet score recognition result".
+    # NOTE(review): the braces make the ValueError argument a one-element
+    # set rather than a plain string.
+    tmp = region_dict_front + region_dict_back
+    if len(sheet_dict_list) != len(tmp):
+        raise ValueError({'答题卡分数识别结果页数丢失: sheet: {}, result: {}'.format(len(sheet_dict_list), len(tmp))})
+    else:
+        sheet_dict_list = tmp
+        # for index, ele in enumerate(sheet_dict_list):
+        #     if sheet_dict_list[index]['sheet_dict']['class_name'] == tmp[index]['sheet_dict']['class_name']:
+        #         ele['sheet_dict'].update({'regions': tmp[index]['sheet_dict']['regions']})
+
+    return sheet_dict_list

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 284 - 0
segment/sheet_resolve/analysis/sheet/sheet_points_total.py


+ 3 - 0
segment/sheet_resolve/analysis/solve/__init__.py

@@ -0,0 +1,3 @@
+# @Author  : lightXu
+# @File    : __init__.py
+# @Time    : 2018/11/21 0021 下午 16:02

+ 119 - 0
segment/sheet_resolve/analysis/solve/mark_box.py

@@ -0,0 +1,119 @@
+# @Author  : lightXu
+# @File    : mark_box.py
+# @Time    : 2018/11/21 0021 下午 16:16
+import time
+import re
+import cv2
+import xml.etree.cElementTree as ET
+
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate
+from segment.sheet_resolve.tools import utils
+
+
+def solve_mark(left, top, solve_img, xml_path):
+    """OCR the top strip of a solve-region image and record one box per mark.
+
+    Args:
+        left: offset added to every x coordinate (presumably the region's
+            x position on the full sheet -- confirm against the caller).
+        top: offset added to every y coordinate (same caveat).
+        solve_img: image array; only its first 250 rows are sent to OCR.
+        xml_path: XML file that is parsed, extended through
+            utils.create_xml and written back in place.
+
+    Returns:
+        {} when OCR returns nothing; otherwise {'number': ..., 'mark': [...]}.
+        'number' is 999 unless the OCR text contains digits followed by a
+        separator and a non-digit; each 'mark' entry holds a 'char' and its
+        [xmin, ymin, xmax, ymax] 'location'.
+    """
+    shape = solve_img.shape
+    y, x = shape[0], shape[1]
+
+    # ocr_region = solve_img[0:int(0.15 * y), :]
+    ocr_region = solve_img[0:250, :]
+
+    # cv2.imshow('ocr_region', ocr_region)
+    # if cv2.waitKey(0) == 27:
+    #     cv2.destroyAllWindows()
+
+    t11 = time.time()
+    word_result_list = get_ocr_text_and_coordinate(ocr_region)
+    t22 = time.time()
+    print('mark ocr time cost: ', t22-t11)
+
+    if len(word_result_list) < 1:
+        return {}
+    else:
+        # The question number is searched in the string form of the list of
+        # all recognised lines; 999 is the "not found" value.
+        words_str = str([ele['words'] for ele in word_result_list])
+        number = 999
+        number_model = re.compile("\d+[.、::]\D")
+        number_list = number_model.findall(words_str)
+        if len(number_list) > 0:
+            number = int(re.sub('[\D]', '', number_list[0]))
+
+        all_chars_list = []
+        zhmodel = re.compile(u'[\u4e00-\u9fa5]')
+
+        # Collect the per-character results of every line without Chinese text.
+        for i, chars_dict in enumerate(word_result_list):
+            # NOTE(review): the substituted text is only used for the Chinese
+            # check below; the collected chars keep their original values.
+            words = re.sub('[iIl|点]', '1', chars_dict['words'])
+            match = zhmodel.search(words)  # does the line contain Chinese?
+            if not match:
+                chars_list = chars_dict['chars']
+                all_chars_list = all_chars_list + chars_list
+
+        new_all_chars_list = []
+        i = 1
+
+        # Walk over neighbouring chars and merge pairs that sit close
+        # together into a single entry (e.g. the two digits of one number).
+        while i <= len(all_chars_list):
+            pre_one = all_chars_list[i - 1]
+            if i == len(all_chars_list):
+                new_all_chars_list.append(pre_one)
+                break
+            rear_one = all_chars_list[i]
+            condition1 = abs(pre_one['location']['top'] - rear_one['location']['top']) < pre_one['location'][
+                'height']  # vertical offset of the two chars is below one char height
+            condition2 = pre_one['location']['left'] + 2 * pre_one['location']['width'] > rear_one['location'][
+                'left']  # next char starts within two char widths of this char's left edge
+            if condition1:
+                if condition2:
+                    new_char = pre_one['char'] + rear_one['char']
+                    new_location = {'left': pre_one['location']['left'],
+                                    'top': min(pre_one['location']['top'], rear_one['location']['top']),
+                                    'width': rear_one['location']['left'] + rear_one['location']['width'] -
+                                    pre_one['location']['left'],
+                                    'height': max(pre_one['location']['height'], rear_one['location']['height'])}
+                    new_all_chars_list.append({'char': new_char, 'location': new_location})
+                    i = i + 1 + 1
+                else:
+                    new_all_chars_list.append(pre_one)
+                    i = i + 1
+            else:
+                new_all_chars_list.append(pre_one)  # stop once the chars differ too much on the y axis
+                break  # leave the loop: only the first text row is used
+
+        tree = ET.parse(xml_path)
+
+        xml_list = []
+        for i, ele in enumerate(new_all_chars_list[1:]):  # iterate from the second item on
+            pre_one = new_all_chars_list[i]
+            rear_one = ele
+
+            # Half of the horizontal gap to the previous mark; each box is
+            # widened by this amount on both sides.
+            intervel = (rear_one['location']['left'] -
+                        pre_one['location']['left'] -
+                        pre_one['location']['width']) // 2
+
+            xmin = ele['location']['left'] - intervel + left
+            xmax = xmin + ele['location']['width'] + 2 * intervel
+            # Box is twice the char height, starting half a char height above it.
+            ymin = ele['location']['top'] - ele['location']['height'] // 2 + top
+            ymax = ymin + 2 * ele['location']['height']
+            tree = utils.create_xml('{}_solve{}'.format(number, ele['char']), tree, xmin, ymin, xmax, ymax)
+            xml_list.append({'char': ele['char'], 'location': [xmin, ymin, xmax, ymax]})
+
+            # The first mark has no predecessor, so it is emitted here using
+            # the gap to the second mark.
+            if i == 0:
+                intervel = (rear_one['location']['left'] -
+                            pre_one['location']['left'] -
+                            pre_one['location']['width']) // 2
+
+                xmin = pre_one['location']['left'] - intervel + left
+                xmax = xmin + pre_one['location']['width'] + 2 * intervel
+                ymin = pre_one['location']['top'] - pre_one['location']['height'] // 2 + top
+                ymax = ymin + 2 * pre_one['location']['height']
+                tree = utils.create_xml('{}_solve{}'.format(number, pre_one['char']), tree, xmin, ymin, xmax, ymax)
+                xml_list.insert(0, {'char': pre_one['char'], 'location': [xmin, ymin, xmax, ymax]})
+
+        tree.write(xml_path)
+        return {'number': number, 'mark': xml_list}
+
+
+# Manual entry point: loads a sample image from a hard-coded local path and
+# prints the elapsed time around the (currently disabled) solve_mark call.
+if __name__ == '__main__':
+    path = r'C:\Users\Administrator\Desktop\sheet\cloze01.jpg'
+    image = cv2.imread(path)
+    t1 = time.time()
+    # NOTE(review): this commented-out call passes two arguments, while
+    # solve_mark takes (left, top, solve_img, xml_path).
+    # solve_mark(image, path)
+    t2 = time.time()
+    print(t2 - t1)

+ 32 - 0
segment/sheet_resolve/analysis/solve/mark_line_box.py

@@ -0,0 +1,32 @@
+# @Author  : lightXu
+# @File    : mark_line_box.py
+# @Time    : 2018/11/21 0021 下午 16:16
+import time
+import re
+import cv2
+import xml.etree.cElementTree as ET
+
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate
+from segment.sheet_resolve.tools import utils
+
+
+def solve_line(solve_img):
+    """Read the question number printed near the top of a solve-area image.
+
+    Only the first 250 pixel rows of ``solve_img`` are sent to OCR. The
+    recognized lines are searched for a run of digits followed by one of the
+    separators in the pattern below and then a non-digit character; the first
+    hit supplies the number.
+
+    :param solve_img: image array, sliced as ``solve_img[0:250, :]``
+    :return: the question number as an int, or 999 when OCR returns nothing
+             or no numbered item is found
+    """
+    # ocr_region = solve_img[0:int(0.15 * y), :]
+    ocr_region = solve_img[0:250, :]
+
+    t11 = time.time()
+    word_result_list = get_ocr_text_and_coordinate(ocr_region)
+    t22 = time.time()
+    print('mark ocr time cost: ', t22-t11)
+
+    # The OCR helper returns resp.get('words_result'), which is None when the
+    # key is absent, so test truthiness instead of calling len().
+    if not word_result_list:
+        return 999
+
+    words_str = str([ele['words'] for ele in word_result_list])
+    # Raw strings avoid the invalid-escape warning of "\d" in a plain literal.
+    number_model = re.compile(r"\d+[.、::]\D")
+    number_list = number_model.findall(words_str)
+    if number_list:
+        # Keep only the digits of the first match, e.g. "12.A" -> 12.
+        return int(re.sub(r'\D', '', number_list[0]))
+    return 999

+ 118 - 0
segment/sheet_resolve/analysis/solve/optional_solve.py

@@ -0,0 +1,118 @@
+# @Author  : lightXu
+# @File    : optional_solve.py
+# @Time    : 2019/9/17 0017 下午 13:18
+import cv2
+import re
+from segment.sheet_resolve.tools.brain_api import get_ocr_text_and_coordinate
+
+
+def rgb2binary(im):
+    """Convert a BGR image to grayscale and apply an inverted Otsu threshold.
+
+    :param im: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``)
+    :return: the thresholded single-channel image
+    """
+    threshold_mode = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+    grayscale = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+    return cv2.threshold(grayscale, 0, 255, threshold_mode)[1]
+
+
+def find_contours(left, top, image, ex_x=30, ex_y=1):
+    """Locate the inked area of an option strip and describe it as a grid.
+
+    The image is binarized with ``rgb2binary``, dilated with an
+    ``ex_x`` x ``ex_y`` rectangular kernel, and the bounding rectangles of
+    all external contours are merged into one box.
+
+    :param left: x offset added to the returned bounding box
+    :param top: y offset added to the returned bounding box
+    :param image: BGR image to analyse
+    :param ex_x: dilation kernel width
+    :param ex_y: dilation kernel height
+    :return: dict with 'rows' (always 1), 'cols' (contour count capped at 4),
+             'single_width' / 'single_height' (mean contour rectangle size)
+             and 'bounding_box'; note that xmin is shifted right by one
+             'single_width'
+    """
+    threshed = rgb2binary(image)
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (ex_x, ex_y))  # dilation kernel
+    # morphed = cv2.morphologyEx(threshed, cv2.MORPH_CLOSE, kernel)
+    morphed = cv2.dilate(threshed, kernel, iterations=1)
+
+    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x and
+    # (image, contours, hierarchy) on 3.x; the contour list is always the
+    # second-to-last item, so no version-string parsing is needed.
+    cnts = cv2.findContours(morphed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
+
+    count = len(cnts)
+    if count == 0:
+        # Nothing detected: fall back to the whole region instead of dividing
+        # by zero when averaging the rectangle sizes.
+        region_h, region_w = morphed.shape[:2]
+        l, t, r, b = 0, 0, region_w, region_h
+        single_width, single_height = 0, 0
+    else:
+        l, t, r, b = 9999, 9999, 0, 0
+        sum_w, sum_h = 0, 0
+        for ele in cnts:
+            x, y, w, h = cv2.boundingRect(ele)
+            l, t, r, b = min(l, x), min(t, y), max(r, x + w), max(b, y + h)
+            sum_w, sum_h = sum_w + w, sum_h + h
+        single_width, single_height = int(sum_w / count), int(sum_h / count)
+
+    optional_solve_dict = {'rows': 1, 'cols': min(count, 4),
+                           'single_width': single_width,
+                           'single_height': single_height,
+                           'bounding_box': {'xmin': l + left + single_width,
+                                            'ymin': t + top,
+                                            'xmax': r + left,
+                                            'ymax': b + top}
+                           }
+    return optional_solve_dict
+
+
+def resolve_optional_choice(ll, tt, direction, image):
+    """Recognize bracketed option labels (e.g. ``[45][46]`` or ``[A][B]``).
+
+    The image is sent to OCR; every recognized line is stripped of spaces,
+    upper-cased and searched for bracketed digit runs and bracketed single
+    letters. The per-character boxes of each match give the option size.
+
+    :param ll: x offset added to the returned bounding box
+    :param tt: y offset added to the returned bounding box
+    :param direction: layout code; 180 yields a single row, any other value
+                      a single column
+    :param image: image handed to ``get_ocr_text_and_coordinate``
+    :return: dict with 'rows', 'cols', 'option' (comma-joined labels),
+             'single_width', 'single_height', 'direction' and 'bounding_box'
+    :raises ValueError: when OCR returns no text lines
+    """
+    ocr_res = get_ocr_text_and_coordinate(image)
+    # Expected shape of one OCR line (abridged):
+    # {'words': '[45][46]',
+    #  'location': {'width': 191, 'top': 12, 'left': 11, 'height': 32},
+    #  'chars': [{'char': '[', 'location': {'width': 16, 'top': 12, 'left': 11, 'height': 32}}, ...]}
+    if not ocr_res:
+        raise ValueError('no OCR text found in the optional choice region')
+
+    # Same tolerant character sets as before ('*' and '|' are accepted as
+    # bracket look-alikes), written with escapes so no nested-set warning is
+    # raised; the digit pattern may now also close with a full-width bracket.
+    digital_p = r'[\[*|【]\d+[\]*|]*[\]】]'
+    eng_char_p = r'[\[*|【][A-GT|][\]*|】]'  # english
+
+    pattern_list = [digital_p, eng_char_p]
+
+    option_list = []
+    mean_width_list = []
+    mean_height_list = []
+    for words_line in ocr_res:
+        words = words_line['words'].replace(' ', '').upper()  # drop spaces
+        line_chars = words_line['chars']
+
+        for p in pattern_list:
+            matches = list(re.finditer(p, words))
+
+            # Strip all four bracket characters; the old code turned the
+            # full-width opener into ']' instead of removing it.
+            option_list += [re.sub(r'[\[\]【】]', '', m.group()) for m in matches]
+
+            for m in matches:
+                # The match span is used as an index into the per-character
+                # boxes: first and last character of the matched label.
+                start_loc = line_chars[m.start()]['location']
+                end_loc = line_chars[m.end() - 1]['location']
+
+                letter_loc_xmin = int(start_loc['left'])
+                letter_loc_ymin = min(int(start_loc['top']), int(end_loc['top']))
+                letter_loc_xmax = int(end_loc['left']) + int(end_loc['width'])
+                letter_loc_ymax = max(int(start_loc['top']) + int(start_loc['height']),
+                                      int(end_loc['top']) + int(end_loc['height']))
+
+                mean_width_list.append(letter_loc_xmax - letter_loc_xmin)
+                mean_height_list.append(letter_loc_ymax - letter_loc_ymin)
+
+    if not option_list:
+        # Must be a list: a plain 'A,B' string would be joined per character
+        # ('A,,,,B') and counted as three options.
+        option_list = ['A', 'B']
+    left = min([int(ele['location']['left']) for ele in ocr_res])
+    top = min([int(ele['location']['top']) for ele in ocr_res])
+    right = max([int(ele['location']['left']) + int(ele['location']['width']) for ele in ocr_res])
+    bottom = max([int(ele['location']['top']) + int(ele['location']['height']) for ele in ocr_res])
+
+    if direction == 180:
+        rows, cols = 1, len(option_list)
+    else:
+        rows, cols = len(option_list), 1
+
+    if mean_width_list:
+        mean_width = sum(mean_width_list) // len(mean_width_list)
+        mean_height = sum(mean_height_list) // len(mean_height_list)
+    else:
+        # No label was measured (fallback options in use): estimate the cell
+        # size by splitting the text region evenly instead of dividing by zero.
+        mean_width = (right - left) // cols
+        mean_height = (bottom - top) // rows
+
+    optional_choice_dict = {'rows': rows, 'cols': cols,
+                            'option': ','.join(option_list),
+                            'single_width': mean_width,
+                            'single_height': mean_height,
+                            'direction': direction,
+                            'bounding_box': {'xmin': ll + left,
+                                             'ymin': tt + top,
+                                             'xmax': ll + right,
+                                             'ymax': tt + bottom}}
+
+    return optional_choice_dict

+ 14 - 0
segment/sheet_resolve/labels/000000-template.xml

@@ -0,0 +1,14 @@
+<annotation>
+	<folder>JPEGImage</folder>
+	<filename>000001.jpg</filename>
+	<path>00</path>
+	<source>
+		<database>Unknown</database>
+	</source>
+	<size>
+		<width>1000</width>
+		<height>1000</height>
+		<depth>3</depth>
+	</size>
+	<segmented>0</segmented>
+</annotation>

+ 2 - 0
segment/sheet_resolve/tools/__init__.py

@@ -0,0 +1,2 @@
+# @Author  : lightXu
+# @File    : __init__.py

+ 382 - 0
segment/sheet_resolve/tools/brain_api.py

@@ -0,0 +1,382 @@
+# @Author  : lightXu
+# @File    : brain_api.py
+# @Time    : 2018/11/21 0021 下午 16:20
+import shutil
+import requests
+import base64
+from urllib import parse, request
+import cv2
+import time
+import numpy as np
+
+import pytesseract
+from segment.server import ocr_login
+from segment.sheet_resolve.tools import utils
+import xml.etree.cElementTree as ET
+
+# access_token = '24.82b09618f94abe2a35113177f4eec593.2592000.1546765941.282335-14614857'
+access_token = ocr_login()
+OCR_BOX_URL = 'https://aip.baidubce.com/rest/2.0/ocr/v1/'
+OCR_URL = 'https://aip.baidubce.com/rest/2.0/ocr/v1/'
+OCR_HAND_URL = 'https://aip.baidubce.com/rest/2.0/ocr/v1/handwriting'
+# OCR_ACCURACY = 'general'
+OCR_ACCURACY = 'accurate'
+OCR_CLIENT_ID = 'AVH7VGKG8QxoSotp6wG9LyZq'
+OCR_CLIENT_SECRET = 'gG7VYvBWLU8Rusnin8cS8Ta4dOckGFl6'
+OCR_TOKEN_UPDATE_DATE = 10
+
+
+def preprocess(img):
+    """Prepare a BGR image for OCR and return a black-and-white version.
+
+    Steps, each controlled by a local setting: optional rescale (disabled,
+    factor 0), grayscale conversion, dilate-then-erode with a square kernel,
+    Gaussian blur, and finally Otsu binarization.
+
+    :param img: BGR image
+    :return: the binarized single-channel image
+    """
+    scale_factor, morph_size, blur_size = 0, 1, 3
+
+    # Rescale only when a non-zero factor is configured.
+    if scale_factor != 0:
+        img = cv2.resize(img, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_CUBIC)
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Dilation followed by erosion to remove some noise.
+    if morph_size != 0:
+        morph_kernel = np.ones((morph_size, morph_size), np.uint8)
+        dilated = cv2.dilate(gray, morph_kernel, iterations=1)
+        gray = cv2.erode(dilated, morph_kernel, iterations=1)
+
+    # Blur to smooth out the edges.
+    if blur_size != 0:
+        gray = cv2.GaussianBlur(gray, (blur_size, blur_size), 0)
+
+    # Otsu threshold leaves a pure black-and-white image.
+    _level, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    return binary
+
+
+def opecv2base64(img):
+    """Encode an image as JPEG and return the bytes as base64 text.
+
+    :param img: image accepted by ``cv2.imencode``
+    :return: base64 string of the JPEG data
+    """
+    _ok, jpeg_buffer = cv2.imencode('.jpg', img)
+    encoded = base64.b64encode(jpeg_buffer)
+    # The base64 alphabet is pure ASCII, so decoding yields the same text as
+    # slicing the b'...' repr.
+    return encoded.decode('ascii')
+
+
+def get_ocr_raw_result(img, ocr_accuracy=OCR_ACCURACY, language_type='CHN_ENG'):
+    """Run OCR on an image and return the service's full JSON response.
+
+    The image is posted to ``OCR_BOX_URL + ocr_accuracy`` with direction
+    detection enabled and per-character granularity. If the service answers
+    with an 'internal error' message, the request is retried once against
+    the 'general' endpoint.
+
+    :param img: image to recognize (JPEG/base64 encoded via ``opecv2base64``)
+    :param ocr_accuracy: endpoint name appended to ``OCR_BOX_URL``
+    :param language_type: value sent as 'language_type'
+    :return: the decoded JSON response
+    :raises Exception: when the response carries an 'error_msg'
+    """
+    query = parse.urlencode({'access_token': access_token})
+    url = '{}{}{}{}'.format(OCR_BOX_URL, ocr_accuracy, '?', query)
+    url_general = '{}{}{}{}'.format(OCR_BOX_URL, 'general', '?', query)
+
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    data = {
+        'image_type': 'base64',
+        'group_id': 'group001',
+        'user_id': 'usr001',
+        'image': opecv2base64(img),
+        'detect_direction': 'true',
+        'recognize_granularity': 'small',
+        'language_type': language_type,
+        # 'vertexes_location': 'true',
+        # 'probability': 'true'
+    }
+
+    # Both requests share one timeout; previously only the first was bounded
+    # and the fallback could block indefinitely.
+    request_timeout = 15
+
+    resp = requests.post(url, data=data, headers=headers, timeout=request_timeout).json()
+    error_msg = resp.get('error_msg')
+    if error_msg:
+        if 'internal error' not in error_msg:
+            raise Exception("ocr {}!".format(error_msg))
+        resp = requests.post(url_general, data=data, headers=headers, timeout=request_timeout).json()
+        if resp.get('error_msg'):
+            raise Exception("ocr {}!".format(resp.get('error_msg')))
+
+    return resp
+
+
+def get_ocr_text_and_coordinate(img, ocr_accuracy=OCR_ACCURACY, language_type='CHN_ENG'):
+    """Run OCR on an image and return the recognized lines.
+
+    The image is posted to ``OCR_BOX_URL + ocr_accuracy`` with per-character
+    granularity and no 'detect_direction' field. An 'internal error' answer
+    triggers one retry against the 'general' endpoint.
+
+    :param img: image to recognize (JPEG/base64 encoded via ``opecv2base64``)
+    :param ocr_accuracy: endpoint name appended to ``OCR_BOX_URL``
+    :param language_type: value sent as 'language_type'
+    :return: the response's 'words_result' value (None if the key is absent)
+    :raises Exception: when the response carries an 'error_msg'
+    """
+    token_query = parse.urlencode({'access_token': access_token})
+    url = '{}{}{}{}'.format(OCR_BOX_URL, ocr_accuracy, '?', token_query)
+    url_general = '{}{}{}{}'.format(OCR_BOX_URL, 'general', '?', token_query)
+
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    data = {
+        'image_type': 'base64',
+        'group_id': 'group001',
+        'user_id': 'usr001',
+        'image': opecv2base64(img),
+        # 'detect_direction': 'true',
+        'recognize_granularity': 'small',
+        'language_type': language_type,
+        # 'vertexes_location': 'true',
+        # 'probability': 'true'
+    }
+
+    # No timeout is passed here (a 15 s timeout was tried and disabled):
+    # resp = requests.post(url, data=data, headers=headers, timeout=15).json()
+    resp = requests.post(url, data=data, headers=headers).json()
+    failure = resp.get('error_msg')
+    if failure:
+        if 'internal error' not in failure:
+            raise Exception("ocr {}!".format(failure))
+        resp = requests.post(url_general, data=data, headers=headers).json()
+        if resp.get('error_msg'):
+            raise Exception("ocr {}!".format(resp.get('error_msg')))
+
+    return resp.get('words_result')
+
+
+def get_ocr_text_and_coordinate0(img, ocr_accuracy=OCR_ACCURACY, language_type='CHN_ENG'):
+    """Run OCR with direction detection explicitly off and return the lines.
+
+    Same request as ``get_ocr_text_and_coordinate`` except that
+    'detect_direction' is sent as 'false'. An 'internal error' answer
+    triggers one retry against the 'general' endpoint.
+
+    :param img: image to recognize (JPEG/base64 encoded via ``opecv2base64``)
+    :param ocr_accuracy: endpoint name appended to ``OCR_BOX_URL``
+    :param language_type: value sent as 'language_type'
+    :return: the response's 'words_result' value (None if the key is absent)
+    :raises Exception: when the response carries an 'error_msg'
+    """
+    token_query = parse.urlencode({'access_token': access_token})
+    url = '{}{}{}{}'.format(OCR_BOX_URL, ocr_accuracy, '?', token_query)
+    url_general = '{}{}{}{}'.format(OCR_BOX_URL, 'general', '?', token_query)
+
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    data = {
+        'image_type': 'base64',
+        'group_id': 'group001',
+        'user_id': 'usr001',
+        'image': opecv2base64(img),
+        'detect_direction': 'false',
+        'recognize_granularity': 'small',
+        'language_type': language_type,
+        # 'vertexes_location': 'true',
+        # 'probability': 'true'
+    }
+
+    # No timeout is passed here (a 15 s timeout was tried and disabled):
+    # resp = requests.post(url, data=data, headers=headers, timeout=15).json()
+    resp = requests.post(url, data=data, headers=headers).json()
+    failure = resp.get('error_msg')
+    if failure:
+        if 'internal error' not in failure:
+            raise Exception("ocr {}!".format(failure))
+        resp = requests.post(url_general, data=data, headers=headers).json()
+        if resp.get('error_msg'):
+            raise Exception("ocr {}!".format(resp.get('error_msg')))
+
+    return resp.get('words_result')
+
+
+def get_ocr_text_and_coordinate_direction(img, ocr_accuracy=OCR_ACCURACY, language_type='CHN_ENG'):
+    """Run OCR with direction detection and return lines plus a layout code.
+
+    The image is posted to ``OCR_BOX_URL + ocr_accuracy`` with
+    'detect_direction' enabled. An 'internal error' answer triggers one retry
+    against the 'general' endpoint. The response's 'direction' value is then
+    translated through a lookup table into 180 or 90.
+
+    :param img: image to recognize (JPEG/base64 encoded via ``opecv2base64``)
+    :param ocr_accuracy: endpoint name appended to ``OCR_BOX_URL``
+    :param language_type: value sent as 'language_type'
+    :return: tuple ``(words_result, layout_code)`` where layout_code is 180
+             or 90
+    :raises Exception: when the response carries an 'error_msg'
+    """
+    query = parse.urlencode({'access_token': access_token})
+    url = '{}{}{}{}'.format(OCR_BOX_URL, ocr_accuracy, '?', query)
+    url_general = '{}{}{}{}'.format(OCR_BOX_URL, 'general', '?', query)
+
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    data = {
+        'image_type': 'base64',
+        'group_id': 'group001',
+        'user_id': 'usr001',
+        'image': opecv2base64(img),
+        'detect_direction': 'true',
+        'recognize_granularity': 'small',
+        'language_type': language_type,
+        # 'vertexes_location': 'true',
+        # 'probability': 'true'
+    }
+
+    # Both requests share one timeout; previously only the first was bounded.
+    request_timeout = 15
+
+    resp = requests.post(url, data=data, headers=headers, timeout=request_timeout).json()
+    error_msg = resp.get('error_msg')
+    if error_msg:
+        if 'internal error' not in error_msg:
+            raise Exception("ocr {}!".format(error_msg))
+        resp = requests.post(url_general, data=data, headers=headers, timeout=request_timeout).json()
+        if resp.get('error_msg'):
+            raise Exception("ocr {}!".format(resp.get('error_msg')))
+
+    words_result = resp.get('words_result')
+    direction = resp.get('direction')
+    # NOTE(review): the service documentation lists direction codes as
+    # -1 (undefined), 0, 1, 2, 3 (counter-clockwise quarter turns) -- confirm.
+    # The original table was keyed on 0/-1/-2/-3, so codes 1, 2, 3 raised
+    # KeyError. The legacy negative keys are kept for backward compatibility
+    # and the positive codes are added with the same 90/180 pairing.
+    d_map = {0: 180,
+             1: 90, -1: 90,
+             2: 180, -2: 180,
+             3: 90, -3: 90}
+    # A missing or unknown direction falls back to the upright code (180)
+    # instead of raising KeyError.
+    return words_result, d_map.get(direction, 180)
+
+
+def get_ocr_text_and_coordinate_in_google_format(img, ocr_accuracy=OCR_ACCURACY, language_type='CHN_ENG'):
+    """Run OCR and return flat per-character lists instead of nested lines.
+
+    The image is posted to ``OCR_BOX_URL + ocr_accuracy`` with
+    'detect_direction' enabled; an 'internal error' answer triggers one retry
+    against the 'general' endpoint. The resulting 'words_result' is converted
+    by ``change_format_baidu_to_google``.
+
+    :param img: image to recognize (JPEG/base64 encoded via ``opecv2base64``)
+    :param ocr_accuracy: endpoint name appended to ``OCR_BOX_URL``
+    :param language_type: value sent as 'language_type'
+    :return: dict with 'chars' (list of characters), 'coordinates' (list of
+             ``(xmin, ymin, xmax, ymax)`` tuples) and 'words' (list of line
+             strings)
+    :raises Exception: when the response carries an 'error_msg'
+    """
+    query = parse.urlencode({'access_token': access_token})
+    url = '{}{}{}{}'.format(OCR_BOX_URL, ocr_accuracy, '?', query)
+    url_general = '{}{}{}{}'.format(OCR_BOX_URL, 'general', '?', query)
+
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    data = {
+        'image_type': 'base64',
+        'group_id': 'group001',
+        'user_id': 'usr001',
+        'image': opecv2base64(img),
+        'detect_direction': 'true',
+        'recognize_granularity': 'small',
+        'language_type': language_type,
+        # 'vertexes_location': 'true',
+        # 'probability': 'true'
+    }
+
+    resp = requests.post(url, data=data, headers=headers).json()
+    error_msg = resp.get('error_msg')
+    if error_msg:
+        if 'internal error' not in error_msg:
+            raise Exception("ocr {}!".format(error_msg))
+        resp = requests.post(url_general, data=data, headers=headers).json()
+        if resp.get('error_msg'):
+            raise Exception("ocr {}!".format(resp.get('error_msg')))
+
+    # The flattening logic used to be copied inline here; the module-level
+    # converter produces the same dict.
+    return change_format_baidu_to_google(resp.get('words_result'))
+
+
+def change_format_baidu_to_google(words_result):
+    """Flatten nested OCR lines into parallel per-character lists.
+
+    :param words_result: list of line dicts, each with 'words' and a 'chars'
+                         list whose items hold 'char' and a 'location' dict
+                         with 'left', 'top', 'width' and 'height'
+    :return: dict with 'chars' (characters in reading order), 'coordinates'
+             (matching ``(xmin, ymin, xmax, ymax)`` tuples) and 'words'
+             (one string per line)
+    """
+    flat_chars = []
+    boxes = []
+    for line in words_result:
+        for char_info in line['chars']:
+            flat_chars.append(char_info.get('char'))
+            loc = char_info.get('location')
+            x0, y0 = loc['left'], loc['top']
+            # Convert left/top/width/height into corner coordinates.
+            boxes.append((x0, y0, loc['width'] + x0, y0 + loc['height']))
+
+    line_texts = [line.get('words') for line in words_result]
+    return {'chars': flat_chars, 'coordinates': boxes, 'words': line_texts}
+
+
+def get_handwriting_ocr_text_and_coordinate_in_google_format(img, words_type='words'):
+    """Run handwriting OCR and return flat per-character lists.
+
+    The image is posted to ``OCR_HAND_URL`` with per-character granularity
+    and the resulting 'words_result' is converted by
+    ``change_format_baidu_to_google``. There is no fallback request.
+
+    :param img: image to recognize (JPEG/base64 encoded via ``opecv2base64``)
+    :param words_type: value sent as 'words_type'
+    :return: dict with 'chars', 'coordinates' (``(xmin, ymin, xmax, ymax)``
+             tuples) and 'words'
+    :raises Exception: when the response carries an 'error_msg'
+    """
+    query = parse.urlencode({'access_token': access_token})
+    url = '{}{}{}'.format(OCR_HAND_URL, '?', query)
+
+    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+
+    data = {
+        'image': opecv2base64(img),
+        'recognize_granularity': 'small',
+        'words_type': words_type,
+    }
+
+    resp = requests.post(url, data=data, headers=headers).json()
+    if resp.get('error_msg'):
+        raise Exception("ocr {}!".format(resp.get('error_msg')))
+
+    # The flattening logic used to be copied inline here; the module-level
+    # converter produces the same dict.
+    return change_format_baidu_to_google(resp.get('words_result'))
+
+
+def tesseract_boxes_by_py(image, ocr_lang='chi_sim+eng'):
+    """Recognize characters with Tesseract and return their boxes.
+
+    The image is run through ``preprocess`` and then through
+    ``pytesseract.image_to_boxes`` in dict mode. The y values are flipped
+    against the image height so the boxes use a top-left origin.
+
+    :param image: BGR image
+    :param ocr_lang: Tesseract language string
+    :return: dict with 'chars' and 'coordinates', the latter a list of
+             ``(xmin, ymin, xmax, ymax)`` tuples
+    """
+    img = preprocess(image)
+    txt = pytesseract.image_to_boxes(img, lang=ocr_lang, output_type='dict')
+    h = img.shape[0]
+
+    # Missing columns (nothing recognized) are treated as empty lists rather
+    # than raising KeyError.
+    char_list = txt.get('char', [])
+    columns = zip(txt.get('left', []), txt.get('bottom', []),
+                  txt.get('right', []), txt.get('top', []))
+
+    # Tesseract box files measure y upward from the bottom edge, so the box's
+    # 'top' value becomes ymin (h - top) and 'bottom' becomes ymax
+    # (h - bottom). The previous code emitted these two swapped.
+    matrix = [(x_left, h - y_top, x_right, h - y_bottom)
+              for x_left, y_bottom, x_right, y_top in columns]
+
+    res_dict = {'chars': char_list, 'coordinates': matrix}
+    return res_dict
+
+
+def gen_xml_of_per_char(img_path):
+    """Write per-character annotation XML files for one image.
+
+    Two annotation files are produced next to the image, both starting from
+    ``./000000-template.xml`` (resolved against the current directory):
+    ``<name>.xml`` from the web OCR result and ``<name>_g.xml`` from
+    Tesseract. The image is also copied to ``<name>_g<ext>``.
+
+    :param img_path: path of the image file
+    """
+    import os  # local import: only this function needs path splitting
+
+    # Split the extension once. The old str.replace('.jpg', ...) left any
+    # non-.jpg path unchanged, so the XML write overwrote the image itself
+    # and the copy targeted its own source.
+    base_path, ext = os.path.splitext(img_path)
+    template_path = r'./000000-template.xml'
+
+    img = utils.read_single_img(img_path)
+    res_dict = get_ocr_text_and_coordinate_in_google_format(img, 'accurate', 'CHN_ENG')
+    tree = ET.parse(template_path)  # xml tree
+    for char, exam_bbox in zip(res_dict['chars'], res_dict['coordinates']):
+        tree = utils.create_xml('{}'.format(char), tree,
+                                exam_bbox[0], exam_bbox[1], exam_bbox[2], exam_bbox[3])
+    # print(exam_items_bbox)
+    tree.write(base_path + '.xml')
+
+    res_dict_google = tesseract_boxes_by_py(img, ocr_lang='chi_sim+equ+eng')
+    tree_g = ET.parse(template_path)  # xml tree
+    for char, exam_bbox in zip(res_dict_google['chars'], res_dict_google['coordinates']):
+        tree_g = utils.create_xml('{}'.format(char), tree_g,
+                                  exam_bbox[0], exam_bbox[1], exam_bbox[2], exam_bbox[3])
+    # print(exam_items_bbox)
+    tree_g.write(base_path + '_g.xml')
+    shutil.copy(img_path, base_path + '_g' + ext)
+
+
+if __name__ == '__main__':
+    # Manual smoke test: time one OCR request on a local sample image and
+    # print the elapsed seconds followed by the recognized lines.
+    sample_path = r'C:\Users\Administrator\Desktop\sheet\mark-test\002_mark.jpg'
+    sample_img = cv2.imread(sample_path)
+    started = time.time()
+    ocr_lines = get_ocr_text_and_coordinate(sample_img)
+    finished = time.time()
+    print(finished - started)
+    print(ocr_lines)

برخی فایل ها در این مقایسه diff نمایش داده نمی شوند زیرا تعداد فایل ها بسیار زیاد است