import os
import shutil
import sqlite3
import time
from collections import Counter

from config import sqlite_path, sqlite_copy_path, mongo_coll
from word_segment import Word_Segment


class Inverted_Index_Establish():
    """Build an inverted index from a MongoDB collection and persist it to SQLite.

    Calling an instance reads every document from ``mongo_coll``, builds the
    index in memory, writes it to ``sqlite_path`` (replacing any existing
    file) and finally copies the database file to ``sqlite_copy_path``.
    """

    def __init__(self, n_grams_flag=False):
        """Store the configured data source, output paths and segmenter.

        Args:
            n_grams_flag: Forwarded unchanged to ``Word_Segment``.
        """
        # Source collection (expected to expose a pymongo-style ``find``).
        self.mongo_coll = mongo_coll
        # Target SQLite database and its backup location.
        self.sqlite_path = sqlite_path
        self.sqlite_copy_path = sqlite_copy_path
        # Word segmentation callable used to tokenize document text.
        self.word_seg = Word_Segment(n_grams_flag=n_grams_flag)

    def __call__(self):
        """Run the full pipeline: read, index, write to SQLite, back up."""
        # The cursor is opened with no_cursor_timeout=True, so the server
        # will not reap it on its own; it must be closed explicitly.
        origin_dataset = self.mongo_coll.find(no_cursor_timeout=True, batch_size=5)
        try:
            inverted_index = self.inverted_index_compute(origin_dataset)
        finally:
            origin_dataset.close()

        self.sqlite_write(inverted_index)
        # Keep a copy of the freshly written database in case it gets damaged.
        shutil.copy(self.sqlite_path, self.sqlite_copy_path)

    def inverted_index_compute(self, origin_dataset):
        """Build the in-memory inverted index from an iterable of documents.

        Documents without a ``"content_clear"`` key are skipped and are not
        included in the document count.

        Args:
            origin_dataset: Iterable of mappings; each indexed document must
                provide ``"content_clear"`` (text to segment) and ``"id"``.

        Returns:
            A dict mapping each term to a two-element list
            ``[doc_ids, doc_infos]``. Both elements are strings with one
            entry per document, separated by ``"\\n"``; each ``doc_infos``
            entry is ``"<term_freq>\\t<doc_length>"``. The extra key
            ``"doc_data_statistics"`` maps to
            ``[indexed_doc_count, str(total_doc_length)]``.
        """
        start = time.time()
        # term -> ([doc ids], [doc infos]); joined into strings at the end so
        # that long posting lists are not re-copied on every append.
        inverted_index = dict()
        # Sum of all document lengths, used downstream for the average length.
        all_doc_length = 0
        # Number of documents actually indexed (skipped documents excluded).
        doc_count = 0

        for data in origin_dataset:
            if "content_clear" not in data:
                continue
            seg_list, _ = self.word_seg(data["content_clear"])

            doc_length = len(seg_list)
            all_doc_length += doc_length
            doc_count += 1
            doc_id = str(data["id"])

            # Occurrences of each term within this document.
            for term, tf in Counter(seg_list).items():
                # Inside one entry the fields are separated by "\t":
                # (term_freq, doc_length).
                doc_info = "{}\t{}".format(tf, doc_length)
                posting = inverted_index.get(term)
                if posting is None:
                    inverted_index[term] = ([doc_id], [doc_info])
                else:
                    posting[0].append(doc_id)
                    posting[1].append(doc_info)

        # Collapse the per-term lists into the "\n"-separated storage format.
        # Only existing keys are reassigned, so iterating the dict is safe.
        for term, (doc_ids, doc_infos) in inverted_index.items():
            inverted_index[term] = ["\n".join(doc_ids), "\n".join(doc_infos)]

        # Corpus-level statistics are stored under a reserved key.
        inverted_index["doc_data_statistics"] = [doc_count, str(all_doc_length)]
        print(time.time()-start)
        print("倒排索引计算完毕")
        return inverted_index

    def sqlite_write(self, inverted_index):
        """Write the inverted index into a freshly created SQLite database.

        Any existing file at ``self.sqlite_path`` is deleted first. Rows go
        into table ``physics`` with columns ``(term, doc_freq, docs)``:
        ``doc_freq`` receives the first element of each index value and
        ``docs`` the second.

        Args:
            inverted_index: Mapping of term to a two-element sequence, as
                returned by ``inverted_index_compute``.
        """
        # Start from a clean file if a database already exists.
        if os.path.exists(self.sqlite_path):
            os.remove(self.sqlite_path)

        sqlite_conn = sqlite3.connect(self.sqlite_path)
        try:
            cursor = sqlite_conn.cursor()
            cursor.execute("DROP TABLE IF EXISTS physics")
            cursor.execute("CREATE TABLE physics (term TEXT PRIMARY KEY, doc_freq TEXT, docs TEXT)")
            cursor.executemany(
                "INSERT INTO physics VALUES (?, ?, ?)",
                (
                    (key, value[0], value[1])
                    for key, value in inverted_index.items()
                ),
            )
            sqlite_conn.commit()
            cursor.close()
        finally:
            # Release the connection even if one of the statements failed.
            sqlite_conn.close()


if __name__ == "__main__":
    # Build the inverted index with n-grams enabled.
    sql_etl = Inverted_Index_Establish(True)
    sql_etl()