# Shared configuration (MongoDB/MySQL connections, HNSW and model paths) and logging helpers.
import json
import logging
import os
import time

import pymongo
# MongoDB connection.
# Each setting can be overridden through an environment variable so that
# switching environments no longer requires editing this file; when a variable
# is unset the value that used to be hard-coded here is used, so behaviour is
# unchanged by default. Hosts that were previously swapped in by hand:
# mongodb://192.168.1.140:27017/ and 49.232.97.180.
# NOTE(review): the fallback password is still stored in source control --
# rotate it and drop the fallback once deployments set MATH_TK_MONGO_PASSWORD.
myclient = pymongo.MongoClient(
    host=os.environ.get("MATH_TK_MONGO_HOST", "10.19.1.2"),
    port=int(os.environ.get("MATH_TK_MONGO_PORT", "8888")),
    username=os.environ.get("MATH_TK_MONGO_USER", "root"),
    password=os.environ.get("MATH_TK_MONGO_PASSWORD", "oyiqd!oy@wxc=ykw@2*jei!"),
)
mongo_info_db = myclient["math_tk"]
mongo_coll_cloud = mongo_info_db["cloud_topic"]
mongo_coll_school = mongo_info_db["school_topic"]
# Question-bank collections: cloud bank first, school bank second.
mongo_coll_list = [mongo_coll_cloud, mongo_coll_school]
# MySQL connection settings.
# Host, user, password and database can be overridden through environment
# variables; when a variable is unset the value that used to be hard-coded here
# is used. A development configuration that was previously kept as
# commented-out code: host 192.168.1.232, user mysql_dev, db zsy_tk; another
# host that was swapped in by hand: 10.19.63.195.
# NOTE(review): the fallback password is still stored in source control --
# rotate it and drop the fallback once deployments set MATH_TK_MYSQL_PASSWORD.
mysqldb = {
    "host": os.environ.get("MATH_TK_MYSQL_HOST", "172.16.5.14"),
    "user": os.environ.get("MATH_TK_MYSQL_USER", "zsy"),
    "password": os.environ.get("MATH_TK_MYSQL_PASSWORD", "Hbt3sZNxepnZQNPU"),
    "db": os.environ.get("MATH_TK_MYSQL_DB", "zsy_tk"),
    "charset": "utf8mb4",
}
# Endpoint of the image formula-recognition service.
# (A development endpoint used previously: http://192.168.1.208:8000/segment/formula/)
formula_url = r"http://10.19.1.11:7080/segment/formula/"

# Flag for MongoDB sentence-vector training.
sent_train_flag = 1

# Template of the dictionary used for batch processing.
batch_processing_dict = {
    "topic_id_list": [],
    "cont_clear_list": [],
    "cont_cut_list": [],
    "cut_idx_list": [0],
}

# Thresholds for the cloud question bank and the school question bank.
database_threshold = [[0.94, 0.8], [0.92, 0.72]]

# Number of HNSW models.
hnsw_num = 2

# Dimension of the word vectors.
vector_dim = 384

# HNSW distance metric; the index selects one of ('l2', 'cosine', 'ip').
hnsw_metric = ('l2', 'cosine', 'ip')[0]

# Maximum number of elements an HNSW index may hold.
num_elements = 5000000

# HNSW ef parameter controlling the recall size.
hnsw_set_ef = 150

# Endpoints for updating and querying the HNSW models.
hnsw_update_url = r"http://localhost:8858/update"
hnsw_retrieve_url = r"http://localhost:8858/retrieve"
# Root directory: the process's current working directory at import time.
root_path = os.getcwd()
data_root_path = os.path.join(root_path, "model_data")

# Location of the Sentence-BERT model.
sbert_path = os.path.join(data_root_path, "all-MiniLM-L6-v2")

# File names of the HNSW models (cloud bank first, school bank second).
hnsw_path_list = ["hnsw_cloud.bin", "hnsw_school.bin"]

# File that stores HNSW data still waiting to be applied as an update.
hnsw_update_save_path = os.path.join(root_path, "hnsw_update_data.txt")

# Directory holding the log files.
log_root_path = os.path.join(root_path, "logs")

# Log file of the duplicate-check application.
math_dup_path = os.path.join(log_root_path, "math_dup_app.log")

# Log file of the relation / self-duplicate-check application.
rlt_ddc_path = os.path.join(log_root_path, "rlt_ddc_app.log")

# Template of the log message text; fill in with str.format(id=, type=, message=).
log_msg = "id : {id} -> {type} -> {message}"
# Thin wrapper around the logging module.
class LogConfig():
    """Named logger that appends INFO-level records to a file.

    Besides configuring the logger, the class offers helpers to rotate the
    module's two log files (``log_reset``) and to prune old rotated copies of
    this instance's log file (``del_log``).
    """

    def __init__(self, log_path, logger_name):
        """Attach a file handler for ``log_path`` to the logger ``logger_name``.

        Args:
            log_path: File the records are appended to (UTF-8). The file is
                opened lazily, on the first emitted record.
            logger_name: Name handed to ``logging.getLogger``.
        """
        self.logger = logging.getLogger(logger_name)
        self.log_path = log_path
        self.logger.setLevel(logging.INFO)  # DEBUG

        # logging.getLogger returns the same object for the same name, so a
        # second LogConfig for that name used to stack another handler on it
        # and every record was written twice. Replace any handler that already
        # targets this file instead; the fresh handler reopens the path on its
        # next record, which also picks up a file rotated by log_reset.
        target = os.path.abspath(os.fspath(log_path))
        for handler in list(self.logger.handlers):
            if (isinstance(handler, logging.FileHandler)
                    and handler.baseFilename == target):
                self.logger.removeHandler(handler)
                handler.close()

        # Append mode; UTF-8 so that non-ASCII text is written correctly.
        fh = logging.FileHandler(self.log_path, mode='a', encoding='utf8', delay=True)
        fh.setLevel(logging.INFO)
        # The format string is the JSON dump of this dict, so each record is
        # rendered as a JSON-looking object with a "sys-msg" and a "log-msg"
        # entry (the message itself is inserted verbatim, not JSON-escaped).
        formatter_dict = {
            "sys-msg": "%(asctime)s-%(filename)s-%(lineno)s-%(levelname)s",
            "log-msg": "%(message)s",
        }
        fh.setFormatter(logging.Formatter(json.dumps(formatter_dict)))
        self.logger.addHandler(fh)

    def log_reset(self):
        """Rotate the module's log files by renaming them with a timestamp.

        ``math_dup_path`` is renamed whenever it exists; ``rlt_ddc_path`` only
        once it is larger than 100 MiB. The new name is the old one with
        ``_YYYY_mmdd_HHMM`` inserted before the ``.log`` suffix.
        NOTE(review): the unconditional rotation of the first file follows the
        original ``log_idx > 0`` condition -- confirm that it is intended.
        """
        size_limit = 100 * 1024 * 1024  # 100M
        for log_idx, lr_path in enumerate((math_dup_path, rlt_ddc_path)):
            if not os.path.exists(lr_path):
                continue
            logsize = os.path.getsize(lr_path)
            if log_idx > 0 and logsize <= size_limit:
                continue
            # splitext strips only the final extension; splitting on the first
            # '.' truncated the path whenever a directory name contained a dot.
            stem = os.path.splitext(lr_path)[0]
            stamp = time.strftime('%Y_%m%d_%H%M', time.localtime())
            os.rename(lr_path, stem + '_' + stamp + '.log')

    def get_log(self):
        """Prune old rotated logs, then return the configured logger."""
        self.del_log()
        return self.logger

    def del_log(self):
        """Delete all but the three most recent rotated copies of this log.

        A file in ``log_root_path`` counts as a rotated copy when its path
        without extension contains this instance's log path without extension
        and it is not the active log file itself. Recency is judged by
        ``os.path.getctime``. Does nothing when the directory does not exist.
        """
        if not os.path.isdir(log_root_path):
            return
        own_stem = os.path.splitext(self.log_path)[0]
        rotated = []
        for name in os.listdir(log_root_path):
            file_path = os.path.join(log_root_path, name)
            if file_path == self.log_path:
                continue
            if own_stem in os.path.splitext(file_path)[0]:
                rotated.append((file_path, os.path.getctime(file_path)))
        # Newest first; everything after the first three is removed.
        rotated.sort(key=lambda item: item[1], reverse=True)
        for file_path, _ in rotated[3:]:
            os.remove(file_path)
|